-rw-r--r--  .coveragerc  3
-rw-r--r--  .github/FUNDING.yml  12
-rw-r--r--  .gitignore  35
-rw-r--r--  .hgignore  44
-rw-r--r--  .hgtags  67
-rw-r--r--  .travis.yml  81
-rw-r--r--  CHANGES.txt  4255
-rw-r--r--  CREDITS.txt  83
-rw-r--r--  DD.py  916
-rw-r--r--  IDEAS.txt  41
-rw-r--r--  INSTALL.txt  219
-rw-r--r--  LICENSE.txt  29
-rw-r--r--  LICENSES.txt  29
-rw-r--r--  MANIFEST.in  19
-rw-r--r--  Makefile  178
-rw-r--r--  README.rst  95
-rw-r--r--  TODO.txt  58
-rw-r--r--  appveyor.yml  44
-rw-r--r--  benchmark/bench_etree.py  452
-rw-r--r--  benchmark/bench_objectify.py  122
-rw-r--r--  benchmark/bench_xpath.py  93
-rw-r--r--  benchmark/bench_xslt.py  56
-rw-r--r--  benchmark/benchbase.py  541
-rw-r--r--  bisect_crashes.py  66
-rw-r--r--  buildlibxml.py  466
-rw-r--r--  debian/changelog (renamed from changelog)  0
-rw-r--r--  debian/compat (renamed from compat)  0
-rw-r--r--  debian/control (renamed from control)  0
-rw-r--r--  debian/copyright (renamed from copyright)  0
-rw-r--r--  debian/python-lxml-doc.doc-base (renamed from python-lxml-doc.doc-base)  0
-rwxr-xr-x  debian/rules (renamed from rules)  0
-rw-r--r--  debian/source/format (renamed from source/format)  0
-rw-r--r--  debian/watch (renamed from watch)  0
-rw-r--r--  doc/FAQ.txt  1279
-rw-r--r--  doc/api.txt  667
-rw-r--r--  doc/api/Makefile  23
-rw-r--r--  doc/api/conf.py  56
-rw-r--r--  doc/api/index.rst  14
-rw-r--r--  doc/build.txt  342
-rw-r--r--  doc/capi.txt  122
-rw-r--r--  doc/compatibility.txt  196
-rw-r--r--  doc/cssselect.txt  126
-rw-r--r--  doc/docstructure.py  32
-rw-r--r--  doc/element_classes.txt  615
-rw-r--r--  doc/elementsoup.txt  222
-rw-r--r--  doc/extensions.txt  621
-rw-r--r--  doc/html/flattr-badge-large.png  bin 0 -> 1639 bytes
-rw-r--r--  doc/html/paypal_btn_donateCC_LG.gif  bin 0 -> 2858 bytes
-rw-r--r--  doc/html/paypal_btn_donateCC_LG.png  bin 0 -> 2461 bytes
-rw-r--r--  doc/html/proxies.png  bin 0 -> 54518 bytes
-rw-r--r--  doc/html/python-xml-title.png  bin 0 -> 10553 bytes
-rw-r--r--  doc/html/python-xml.png  bin 0 -> 7310 bytes
-rw-r--r--  doc/html/style.css  399
-rw-r--r--  doc/html/tagpython-big.png  bin 0 -> 19383 bytes
-rw-r--r--  doc/html5parser.txt  80
-rw-r--r--  doc/intro.txt  82
-rw-r--r--  doc/licenses/BSD.txt  29
-rw-r--r--  doc/licenses/GPL.txt  340
-rw-r--r--  doc/licenses/ZopePublicLicense.txt  59
-rw-r--r--  doc/licenses/elementtree.txt  25
-rw-r--r--  doc/lxml-source-howto.txt  313
-rw-r--r--  doc/lxml.mgp  122
-rw-r--r--  doc/lxml2.txt  269
-rw-r--r--  doc/lxmlhtml.txt  766
-rw-r--r--  doc/main.txt  307
-rw-r--r--  doc/memorymanagement.txt  83
-rw-r--r--  doc/mkhtml.py  327
-rw-r--r--  doc/mklatex.py  334
-rw-r--r--  doc/objectify.txt  1409
-rw-r--r--  doc/parsing.txt  1062
-rw-r--r--  doc/performance.txt  863
-rw-r--r--  doc/pubkey.asc  36
-rw-r--r--  doc/resolvers.txt  283
-rwxr-xr-x  doc/rest2html.py  63
-rw-r--r--  doc/rest2latex.py  66
-rw-r--r--  doc/s5/Makefile  11
-rw-r--r--  doc/s5/ep2008/atom-example.xml  20
-rw-r--r--  doc/s5/ep2008/atom.py  626
-rw-r--r--  doc/s5/ep2008/atom.rng  597
-rw-r--r--  doc/s5/ep2008/atomgen.py  27
-rw-r--r--  doc/s5/ep2008/proxies.png  bin 0 -> 53221 bytes
-rw-r--r--  doc/s5/lxml-ep2008.txt  1130
-rw-r--r--  doc/s5/rst2s5.py  92
-rw-r--r--  doc/s5/tagpython.png  bin 0 -> 20864 bytes
-rw-r--r--  doc/s5/ui/default/blank.gif  bin 0 -> 49 bytes
-rw-r--r--  doc/s5/ui/default/bodybg.gif  bin 0 -> 10119 bytes
-rw-r--r--  doc/s5/ui/default/framing.css  23
-rw-r--r--  doc/s5/ui/default/iepngfix.htc  42
-rw-r--r--  doc/s5/ui/default/lxml-logo64.png  bin 0 -> 8691 bytes
-rw-r--r--  doc/s5/ui/default/opera.css  7
-rw-r--r--  doc/s5/ui/default/outline.css  15
-rw-r--r--  doc/s5/ui/default/pretty.css  221
-rw-r--r--  doc/s5/ui/default/print.css  24
-rw-r--r--  doc/s5/ui/default/s5-core.css  9
-rw-r--r--  doc/s5/ui/default/slides.css  3
-rw-r--r--  doc/s5/ui/default/slides.js  552
-rw-r--r--  doc/s5/ui/default/tagpython.png  bin 0 -> 20864 bytes
-rw-r--r--  doc/sax.txt  137
-rw-r--r--  doc/test.xml  1
-rw-r--r--  doc/tutorial.txt  1508
-rw-r--r--  doc/valgrind.txt  3
-rw-r--r--  doc/validation.txt  677
-rw-r--r--  doc/xpathxslt.txt  785
-rwxr-xr-x  download_artefacts.py  136
-rw-r--r--  requirements.txt  1
-rw-r--r--  samples/simple-ns.xml  5
-rw-r--r--  samples/simple.xml  5
-rw-r--r--  setup.py  252
-rw-r--r--  setupinfo.py  551
-rw-r--r--  src/lxml/ElementInclude.py  244
-rw-r--r--  src/lxml/__init__.pxd  0
-rw-r--r--  src/lxml/__init__.py  23
-rw-r--r--  src/lxml/_elementpath.py  345
-rw-r--r--  src/lxml/apihelpers.pxi  1799
-rw-r--r--  src/lxml/builder.pxd  10
-rw-r--r--  src/lxml/builder.py  239
-rw-r--r--  src/lxml/classlookup.pxi  563
-rw-r--r--  src/lxml/cleanup.pxi  215
-rw-r--r--  src/lxml/cssselect.py  102
-rw-r--r--  src/lxml/cvarargs.pxd  8
-rw-r--r--  src/lxml/debug.pxi  91
-rw-r--r--  src/lxml/docloader.pxi  178
-rw-r--r--  src/lxml/doctestcompare.py  507
-rw-r--r--  src/lxml/dtd.pxi  472
-rw-r--r--  src/lxml/etree.pyx  3663
-rw-r--r--  src/lxml/extensions.pxi  871
-rw-r--r--  src/lxml/html/ElementSoup.py  10
-rw-r--r--  src/lxml/html/__init__.py  1948
-rw-r--r--  src/lxml/html/_diffcommand.py  88
-rw-r--r--  src/lxml/html/_html5builder.py  100
-rw-r--r--  src/lxml/html/_setmixin.py  56
-rw-r--r--  src/lxml/html/builder.py  133
-rw-r--r--  src/lxml/html/clean.py  779
-rw-r--r--  src/lxml/html/defs.py  135
-rw-r--r--  src/lxml/html/diff.py  884
-rw-r--r--  src/lxml/html/formfill.py  299
-rw-r--r--  src/lxml/html/html5parser.py  260
-rw-r--r--  src/lxml/html/soupparser.py  314
-rw-r--r--  src/lxml/html/tests/__init__.py  1
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_applet.data  7
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_blink.data  8
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_crazy.data  84
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_embed.data  8
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_frame.data  7
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_iframe.data  8
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_link.data  7
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_meta.data  7
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_object.data  8
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_onabort.data  7
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_onblur.data  7
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_onchange.data  7
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_onclick.data  7
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_ondblclick.data  7
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_onerror.data  7
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_onfocus.data  7
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_onkeydown.data  7
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_onkeypress.data  7
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_onkeyup.data  7
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_onload.data  7
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_onmousedown.data  7
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_onmouseout.data  7
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_onmouseover.data  7
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_onmouseup.data  7
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_onreset.data  7
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_onresize.data  7
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_onsubmit.data  7
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_onunload.data  7
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_script.data  7
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_script_cdata.data  13
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_script_inline.data  7
-rw-r--r--  src/lxml/html/tests/feedparser-data/entry_content_style.data  7
-rw-r--r--  src/lxml/html/tests/hackers-org-data/background-image-plus.data  8
-rw-r--r--  src/lxml/html/tests/hackers-org-data/background-image-with-unicoded.data  10
-rw-r--r--  src/lxml/html/tests/hackers-org-data/downlevel-hidden.data  9
-rw-r--r--  src/lxml/html/tests/hackers-org-data/html-plus-time.data  12
-rw-r--r--  src/lxml/html/tests/hackers-org-data/javascript-link.data  15
-rw-r--r--  src/lxml/html/tests/hackers-org-data/style-comment.data  8
-rw-r--r--  src/lxml/html/tests/hackers-org-data/style-expression.data  10
-rw-r--r--  src/lxml/html/tests/hackers-org-data/style-import.data  8
-rw-r--r--  src/lxml/html/tests/hackers-org-data/style-js-tag.data  7
-rw-r--r--  src/lxml/html/tests/hackers-org-data/style-url-js.data  8
-rw-r--r--  src/lxml/html/tests/hackers-org-data/xml-data-island.data  10
-rw-r--r--  src/lxml/html/tests/hackers-org-data/xml-embedded-js.data  9
-rw-r--r--  src/lxml/html/tests/hackers-org-data/xml-namespace.data.BROKEN  16
-rw-r--r--  src/lxml/html/tests/test_autolink.py  10
-rw-r--r--  src/lxml/html/tests/test_autolink.txt  79
-rw-r--r--  src/lxml/html/tests/test_basic.py  12
-rw-r--r--  src/lxml/html/tests/test_basic.txt  236
-rw-r--r--  src/lxml/html/tests/test_clean.py  147
-rw-r--r--  src/lxml/html/tests/test_clean.txt  221
-rw-r--r--  src/lxml/html/tests/test_clean_embed.txt  39
-rw-r--r--  src/lxml/html/tests/test_diff.py  13
-rw-r--r--  src/lxml/html/tests/test_diff.txt  252
-rw-r--r--  src/lxml/html/tests/test_elementsoup.py  128
-rw-r--r--  src/lxml/html/tests/test_feedparser_data.py  95
-rw-r--r--  src/lxml/html/tests/test_formfill.py  7
-rw-r--r--  src/lxml/html/tests/test_formfill.txt  112
-rw-r--r--  src/lxml/html/tests/test_forms.py  10
-rw-r--r--  src/lxml/html/tests/test_forms.txt  239
-rw-r--r--  src/lxml/html/tests/test_frames.py  36
-rw-r--r--  src/lxml/html/tests/test_html5parser.py  430
-rw-r--r--  src/lxml/html/tests/test_rewritelinks.py  10
-rw-r--r--  src/lxml/html/tests/test_rewritelinks.txt  264
-rw-r--r--  src/lxml/html/tests/test_select.py  47
-rw-r--r--  src/lxml/html/tests/test_xhtml.py  10
-rw-r--r--  src/lxml/html/tests/test_xhtml.txt  30
-rw-r--r--  src/lxml/html/tests/transform_feedparser_data.py  109
-rw-r--r--  src/lxml/html/usedoctest.py  13
-rw-r--r--  src/lxml/includes/__init__.pxd  0
-rw-r--r--  src/lxml/includes/__init__.py  0
-rw-r--r--  src/lxml/includes/c14n.pxd  26
-rw-r--r--  src/lxml/includes/config.pxd  3
-rw-r--r--  src/lxml/includes/dtdvalid.pxd  18
-rw-r--r--  src/lxml/includes/etree_defs.h  418
-rw-r--r--  src/lxml/includes/etreepublic.pxd  237
-rw-r--r--  src/lxml/includes/htmlparser.pxd  56
-rw-r--r--  src/lxml/includes/relaxng.pxd  64
-rw-r--r--  src/lxml/includes/schematron.pxd  34
-rw-r--r--  src/lxml/includes/tree.pxd  480
-rw-r--r--  src/lxml/includes/uri.pxd  5
-rw-r--r--  src/lxml/includes/xinclude.pxd  22
-rw-r--r--  src/lxml/includes/xmlerror.pxd  851
-rw-r--r--  src/lxml/includes/xmlparser.pxd  249
-rw-r--r--  src/lxml/includes/xmlschema.pxd  35
-rw-r--r--  src/lxml/includes/xpath.pxd  135
-rw-r--r--  src/lxml/includes/xslt.pxd  191
-rw-r--r--  src/lxml/isoschematron/__init__.py  334
-rw-r--r--  src/lxml/isoschematron/resources/rng/iso-schematron.rng  709
-rw-r--r--  src/lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl  75
-rw-r--r--  src/lxml/isoschematron/resources/xsl/XSD2Schtrn.xsl  77
-rw-r--r--  src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_abstract_expand.xsl  313
-rw-r--r--  src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_dsdl_include.xsl  1160
-rw-r--r--  src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_message.xsl  55
-rw-r--r--  src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_skeleton_for_xslt1.xsl  1796
-rw-r--r--  src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_svrl_for_xslt1.xsl  588
-rw-r--r--  src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/readme.txt  84
-rw-r--r--  src/lxml/iterparse.pxi  438
-rw-r--r--  src/lxml/lxml_endian.h  14
-rw-r--r--  src/lxml/nsclasses.pxi  281
-rw-r--r--  src/lxml/objectify.pyx  1981
-rw-r--r--  src/lxml/objectpath.pxi  332
-rw-r--r--  src/lxml/parser.pxi  1901
-rw-r--r--  src/lxml/parsertarget.pxi  194
-rw-r--r--  src/lxml/proxy.pxi  619
-rw-r--r--  src/lxml/public-api.pxi  178
-rw-r--r--  src/lxml/pyclasslookup.py  3
-rw-r--r--  src/lxml/python.pxd  132
-rw-r--r--  src/lxml/readonlytree.pxi  565
-rw-r--r--  src/lxml/relaxng.pxi  163
-rw-r--r--  src/lxml/sax.pxd  16
-rw-r--r--  src/lxml/sax.py  278
-rw-r--r--  src/lxml/saxparser.pxi  867
-rw-r--r--  src/lxml/schematron.pxi  167
-rw-r--r--  src/lxml/serializer.pxi  1870
-rw-r--r--  src/lxml/tests/__init__.py  4
-rw-r--r--  src/lxml/tests/c14n-20/c14nComment.xml  4
-rw-r--r--  src/lxml/tests/c14n-20/c14nDefault.xml  3
-rw-r--r--  src/lxml/tests/c14n-20/c14nPrefix.xml  4
-rw-r--r--  src/lxml/tests/c14n-20/c14nPrefixQname.xml  7
-rw-r--r--  src/lxml/tests/c14n-20/c14nPrefixQnameXpathElem.xml  8
-rw-r--r--  src/lxml/tests/c14n-20/c14nQname.xml  6
-rw-r--r--  src/lxml/tests/c14n-20/c14nQnameElem.xml  6
-rw-r--r--  src/lxml/tests/c14n-20/c14nQnameXpathElem.xml  7
-rw-r--r--  src/lxml/tests/c14n-20/c14nTrim.xml  4
-rw-r--r--  src/lxml/tests/c14n-20/doc.dtd  6
-rw-r--r--  src/lxml/tests/c14n-20/doc.xsl  5
-rw-r--r--  src/lxml/tests/c14n-20/inC14N1.xml  14
-rw-r--r--  src/lxml/tests/c14n-20/inC14N2.xml  11
-rw-r--r--  src/lxml/tests/c14n-20/inC14N3.xml  18
-rw-r--r--  src/lxml/tests/c14n-20/inC14N4.xml  13
-rw-r--r--  src/lxml/tests/c14n-20/inC14N5.xml  12
-rw-r--r--  src/lxml/tests/c14n-20/inC14N6.xml  2
-rw-r--r--  src/lxml/tests/c14n-20/inNsContent.xml  4
-rw-r--r--  src/lxml/tests/c14n-20/inNsDefault.xml  3
-rw-r--r--  src/lxml/tests/c14n-20/inNsPushdown.xml  6
-rw-r--r--  src/lxml/tests/c14n-20/inNsRedecl.xml  3
-rw-r--r--  src/lxml/tests/c14n-20/inNsSort.xml  4
-rw-r--r--  src/lxml/tests/c14n-20/inNsSuperfluous.xml  4
-rw-r--r--  src/lxml/tests/c14n-20/inNsXml.xml  3
-rw-r--r--  src/lxml/tests/c14n-20/out_inC14N1_c14nComment.xml  6
-rw-r--r--  src/lxml/tests/c14n-20/out_inC14N1_c14nDefault.xml  4
-rw-r--r--  src/lxml/tests/c14n-20/out_inC14N2_c14nDefault.xml  11
-rw-r--r--  src/lxml/tests/c14n-20/out_inC14N2_c14nTrim.xml  1
-rw-r--r--  src/lxml/tests/c14n-20/out_inC14N3_c14nDefault.xml  14
-rw-r--r--  src/lxml/tests/c14n-20/out_inC14N3_c14nPrefix.xml  14
-rw-r--r--  src/lxml/tests/c14n-20/out_inC14N3_c14nTrim.xml  1
-rw-r--r--  src/lxml/tests/c14n-20/out_inC14N4_c14nDefault.xml  10
-rw-r--r--  src/lxml/tests/c14n-20/out_inC14N4_c14nTrim.xml  2
-rw-r--r--  src/lxml/tests/c14n-20/out_inC14N5_c14nDefault.xml  3
-rw-r--r--  src/lxml/tests/c14n-20/out_inC14N5_c14nTrim.xml  1
-rw-r--r--  src/lxml/tests/c14n-20/out_inC14N6_c14nDefault.xml  1
-rw-r--r--  src/lxml/tests/c14n-20/out_inNsContent_c14nDefault.xml  4
-rw-r--r--  src/lxml/tests/c14n-20/out_inNsContent_c14nPrefixQnameXpathElem.xml  4
-rw-r--r--  src/lxml/tests/c14n-20/out_inNsContent_c14nQnameElem.xml  4
-rw-r--r--  src/lxml/tests/c14n-20/out_inNsContent_c14nQnameXpathElem.xml  4
-rw-r--r--  src/lxml/tests/c14n-20/out_inNsDefault_c14nDefault.xml  3
-rw-r--r--  src/lxml/tests/c14n-20/out_inNsDefault_c14nPrefix.xml  3
-rw-r--r--  src/lxml/tests/c14n-20/out_inNsPushdown_c14nDefault.xml  6
-rw-r--r--  src/lxml/tests/c14n-20/out_inNsPushdown_c14nPrefix.xml  6
-rw-r--r--  src/lxml/tests/c14n-20/out_inNsRedecl_c14nDefault.xml  3
-rw-r--r--  src/lxml/tests/c14n-20/out_inNsRedecl_c14nPrefix.xml  3
-rw-r--r--  src/lxml/tests/c14n-20/out_inNsSort_c14nDefault.xml  4
-rw-r--r--  src/lxml/tests/c14n-20/out_inNsSort_c14nPrefix.xml  4
-rw-r--r--  src/lxml/tests/c14n-20/out_inNsSuperfluous_c14nDefault.xml  4
-rw-r--r--  src/lxml/tests/c14n-20/out_inNsSuperfluous_c14nPrefix.xml  4
-rw-r--r--  src/lxml/tests/c14n-20/out_inNsXml_c14nDefault.xml  3
-rw-r--r--  src/lxml/tests/c14n-20/out_inNsXml_c14nPrefix.xml  3
-rw-r--r--  src/lxml/tests/c14n-20/out_inNsXml_c14nPrefixQname.xml  3
-rw-r--r--  src/lxml/tests/c14n-20/out_inNsXml_c14nQname.xml  3
-rw-r--r--  src/lxml/tests/c14n-20/world.txt  1
-rw-r--r--  src/lxml/tests/common_imports.py  284
-rw-r--r--  src/lxml/tests/dummy_http_server.py  84
-rw-r--r--  src/lxml/tests/include/test_xinclude.xml  4
-rw-r--r--  src/lxml/tests/selftest.py  1253
-rw-r--r--  src/lxml/tests/selftest2.py  452
-rw-r--r--  src/lxml/tests/shakespeare.html  526
-rw-r--r--  src/lxml/tests/test-document.xslt  10
-rw-r--r--  src/lxml/tests/test-string.xml  2
-rw-r--r--  src/lxml/tests/test.dtd  11
-rw-r--r--  src/lxml/tests/test.rnc  8
-rw-r--r--  src/lxml/tests/test.sch  8
-rw-r--r--  src/lxml/tests/test.xml  2
-rw-r--r--  src/lxml/tests/test.xsd  8
-rw-r--r--  src/lxml/tests/test1.rng  6
-rw-r--r--  src/lxml/tests/test1.xslt  9
-rw-r--r--  src/lxml/tests/test2.rng  13
-rw-r--r--  src/lxml/tests/test2.xslt  8
-rw-r--r--  src/lxml/tests/test_broken.xml  1
-rw-r--r--  src/lxml/tests/test_builder.py  44
-rw-r--r--  src/lxml/tests/test_classlookup.py  402
-rw-r--r--  src/lxml/tests/test_css.py  68
-rw-r--r--  src/lxml/tests/test_doctestcompare.py  133
-rw-r--r--  src/lxml/tests/test_dtd.py  415
-rw-r--r--  src/lxml/tests/test_elementpath.py  302
-rw-r--r--  src/lxml/tests/test_elementtree.py  4965
-rw-r--r--  src/lxml/tests/test_errors.py  77
-rw-r--r--  src/lxml/tests/test_etree.py  5381
-rw-r--r--  src/lxml/tests/test_external_document.py  106
-rw-r--r--  src/lxml/tests/test_htmlparser.py  663
-rw-r--r--  src/lxml/tests/test_http_io.py  125
-rw-r--r--  src/lxml/tests/test_import.xsd  10
-rw-r--r--  src/lxml/tests/test_inc.xsd  10
-rw-r--r--  src/lxml/tests/test_incremental_xmlfile.py  674
-rw-r--r--  src/lxml/tests/test_io.py  373
-rw-r--r--  src/lxml/tests/test_isoschematron.py  870
-rw-r--r--  src/lxml/tests/test_nsclasses.py  212
-rw-r--r--  src/lxml/tests/test_objectify.py  2681
-rw-r--r--  src/lxml/tests/test_pyclasslookup.py  351
-rw-r--r--  src/lxml/tests/test_relaxng.py  260
-rw-r--r--  src/lxml/tests/test_sax.py  416
-rw-r--r--  src/lxml/tests/test_schematron.py  82
-rw-r--r--  src/lxml/tests/test_threading.py  590
-rw-r--r--  src/lxml/tests/test_unicode.py  211
-rw-r--r--  src/lxml/tests/test_xmlschema.py  505
-rw-r--r--  src/lxml/tests/test_xpathevaluator.py  748
-rw-r--r--  src/lxml/tests/test_xslt.py  2093
-rw-r--r--  src/lxml/usedoctest.py  13
-rw-r--r--  src/lxml/xinclude.pxi  67
-rw-r--r--  src/lxml/xmlerror.pxi  1646
-rw-r--r--  src/lxml/xmlid.pxi  179
-rw-r--r--  src/lxml/xmlschema.pxi  211
-rw-r--r--  src/lxml/xpath.pxi  502
-rw-r--r--  src/lxml/xslt.pxi  971
-rw-r--r--  src/lxml/xsltext.pxi  242
-rw-r--r--  test.py  621
-rwxr-xr-x  tools/manylinux/build-wheels.sh  81
-rw-r--r--  tools/xpathgrep.py  334
-rw-r--r--  tox.ini  19
-rw-r--r--  update-error-constants.py  157
-rw-r--r--  valgrind-python.supp  480
-rw-r--r--  versioninfo.py  81
371 files changed, 93795 insertions, 0 deletions
diff --git a/.coveragerc b/.coveragerc
new file mode 100644
index 0000000..fe01daa
--- /dev/null
+++ b/.coveragerc
@@ -0,0 +1,3 @@
+[run]
+plugins = Cython.Coverage
+source = src
diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
new file mode 100644
index 0000000..4c18401
--- /dev/null
+++ b/.github/FUNDING.yml
@@ -0,0 +1,12 @@
+# These are supported funding model platforms
+
+github: scoder # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
+patreon: # Replace with a single Patreon username
+open_collective: # Replace with a single Open Collective username
+ko_fi: # Replace with a single Ko-fi username
+tidelift: pypi/lxml # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
+community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
+liberapay: # Replace with a single Liberapay username
+issuehunt: # Replace with a single IssueHunt username
+otechie: # Replace with a single Otechie username
+custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..8f4bad9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,35 @@
+*.pyc
+.tox
+.idea
+build
+dist
+wheelhouse
+wheels
+venvs
+venv
+doc/html
+libs
+*.egg-info
+*.pdb
+*.so
+*.o
+*.pyd
+MANIFEST
+
+doc/api/lxml*.rst
+doc/api/_build/
+doc/s5/lxml-ep2008.html
+src/lxml/includes/lxml-version.h
+src/lxml/*.html
+src/lxml/html/*.c
+src/lxml/_elementpath.c
+src/lxml/builder.c
+src/lxml/etree.c
+src/lxml/etree.h
+src/lxml/etree_api.h
+src/lxml/lxml.etree.c
+src/lxml/lxml.etree.h
+src/lxml/lxml.etree_api.h
+src/lxml/objectify.c
+src/lxml/lxml.objectify.c
+src/lxml/sax.c
diff --git a/.hgignore b/.hgignore
new file mode 100644
index 0000000..7a702b2
--- /dev/null
+++ b/.hgignore
@@ -0,0 +1,44 @@
+syntax: glob
+
+*.pyc
+*.pyo
+__pycache__
+
+src/lxml/includes/lxml-version.h
+src/lxml/*.html
+src/lxml/html/*.c
+src/lxml/etree.c
+src/lxml/etree.h
+src/lxml/etree_api.h
+src/lxml/lxml.etree.c
+src/lxml/lxml.etree.h
+src/lxml/lxml.etree_api.h
+src/lxml/objectify.c
+src/lxml/lxml.objectify.c
+
+build/
+libs/
+dist/
+wheelhouse/
+wheels/
+venvs/
+venv/
+doc/html/
+cython_debug/
+.idea/
+.git/
+.gitrev
+.coverage
+funding.txt
+MANIFEST
+.tox
+*.orig
+*.rej
+*.dep
+*.swp
+*.pdb
+*.so
+*.o
+*.pyd
+*~
+*.egg-info
diff --git a/.hgtags b/.hgtags
new file mode 100644
index 0000000..45a05c4
--- /dev/null
+++ b/.hgtags
@@ -0,0 +1,67 @@
+40fdc2efbcf833c2d2de7a1ebff7cc0b634e3a0a lxml-2.3
+ea513f9a9811ee9b3991a1df0319b197b361e5cb lxml-0.5.1
+e0fa117052c57bb83d005b962ff8788605efeadc lxml-0.6
+802f612635d91469d9430bee819713ce7ecb30e2 lxml-0.7
+1623013df810d6b4363dd1daf9f7f6fe5603f458 lxml-0.9
+11e79f443fed94d91f90c8080a2c8a8afeb1ae94 lxml-1.0.beta
+a37777a46c55ae77a78266f57c3b6bca2ca04c5f lxml-1.0
+6a117f91ff2ac2824aeed1ccc87512608d131f47 lxml-1.1
+782bc8d9146fd9666879ace31cb4cf541a390173 lxml-1.1alpha
+4144bbe6f24822a7ce5392130b2c98354cd9847e lxml-1.1beta
+8205702eda77bb4a23d6789cc5ee94b4d36e65d4 lxml-1.2
+4d410818a0e10638bb5eb5b54a37350a3477629b lxml-2.0
+1dabace6188ee89b433f30b3838b6f2129698ba5 lxml-2.0alpha1
+9dec5f9222aea1b9c531cfcc7e68d2c328394247 lxml-2.0alpha2
+b7873fce37031508d6fb115d8c79abad00b9f219 lxml-2.0alpha3
+2ae894916b47710bbc79c139ff9f4a861ca5815c lxml-2.0alpha4
+c81c85642ca9eafe85630c46ef828828e692842e lxml-2.0alpha5
+7a9b9811fadbd32b34d2b3e07901e213daca98d3 lxml-2.0alpha6
+68ba2cbfc422d59bcba09216a7707153bd58b2e8 lxml-2.0beta1
+b1389dfc312b7d438fd673f7f7ee75d892ef81d7 lxml-2.0beta2
+9b2be5208b1ecf4cfe5fd3cd0de4a396267abd97 lxml-2.0.1
+c5790462867c207a3c78dd510055589bf2950f9d lxml-2.1alpha1
+d1f3cf7d078796553de3b276db580796d5aeb048 lxml-2.1beta1
+fb891a783f270aefd03df44105677d49d765f2b2 lxml-2.1beta2
+714552f48a53c2555994b6c56deb3de1e7ee702e lxml-2.1beta3
+89227c4d5809f866f4a54a791d2452dc0ccc8d3b lxml-2.1
+e38e2a1162010841eebb60174be16797e0d34a87 lxml-2.2
+3f730df23e58592418e22572fea5d8dfce7cf87d lxml-2.2.1
+376b4baba7c91b98fae1ebd07592b21e5e535ba8 lxml-2.2.2
+405a1fb3486ee8a9be7f37fd000553df21b95d5e lxml-2.3alpha1
+901463d324cda95df28b2cab3ee86f715103fa84 lxml-2.3alpha2
+c9fef2d447ca436a83fa41183b73ccc825052f48 lxml-2.3beta1
+5f5143534860cfba7fbe3ceab98dc749e79a4fc3 lxml-2.3.1
+65ce4c8efb51013363dcc7318c847fc2f28f2eb2 lxml-2.3.2
+945b29e5b54abf07897b46ebcb6d2227c05f8137 lxml-2.3.3
+cf0980063266b383d0403759993536eaa18ebe93 lxml-2.3.4
+c161cb55f4d4ebd93f5aee72ed73f267155bd894 lxml-2.3.5
+66c66707c7d8a89b99a24bb29c791dbe9dc860f1 lxml-2.3.6
+36e5b10c3ae6256e613554e7d71c34de0d71f385 lxml-3.0beta1
+5bd7af62e93207ff58d54fc83f96b078b621eef8 lxml-3.0alpha2
+6d41ed7c4b756792c9be44a5b8a383c10718016f lxml-3.0alpha1
+60dd2d56701944e05c7655d1f47c56657d7837b3 lxml-3.0
+22efeb405c9c4fb326541f56e431fd8e2686c435 lxml-3.0.1
+714ab3c31e40ab2fef58e2be523de1ef4cb2a8a0 lxml-3.0.2
+3e04be8a649395b193c70dd7d0e2b2b5acecb563 lxml-3.1beta1
+91c436e11e2a822154fef48abe64274646cfde45 lxml-3.1.0
+e408b1f0eca00cb226acd28cd169988f9690067f lxml-3.1.1
+862039d37b73e0250c9d8af6e5a689f6fe6321cd lxml-3.1.2
+76262b9d449e75624b9dea745364f87e2e99b2a3 lxml-3.2.0
+3ec87d40b6b4dbb81fa3d46bf7ba5210f7e6b92e lxml-3.2.2
+d65c1991be020e3c3ee9413ee1573b863eb69752 lxml-3.2.4
+958bfe4b6411aa7a50c98a261f1ab7d1c256333e lxml-3.3.1
+80d6c40625d3adc3dc06a251ce5cf8cfe7de18de lxml-3.3.2
+41635db399198a95af22e337d83aec0cbbe046a4 lxml-3.3.3
+b058684af98d002fb980ee43446a91233c630e0b lxml-3.3.4
+e0052eb7b9b7631b4adeb4e7a83e15de3f63c05b lxml-3.3.5
+c08b1c7e4e7f5240b288f19e8c3728222147a32e lxml-3.3.6
+c7fc22082dd434785c27343fc70ae91d7c117b46 lxml-3.4.1
+e22f32fd870f51281e89e98d7b22ec3f4d706663 lxml-3.4.2
+57a861c8a54acd891ee59a53bd7b9fea1398bbad lxml-3.4.3
+eaade2a0be84e3e1173e168e09773b86f9a290e9 lxml-3.4.4
+5d2123497105888f3a5c8cd0705a6840c51c6fd5 lxml-3.5.0b1
+2f6099adea6247cdc15e42a0192e0b3306cc9610 lxml-3.5.0
+853cdec748fc0318af26cecdc00756683aaa27a4 lxml-3.6.0
+2a83ab44c6599657519991773da53a45cbb60501 lxml-3.6.1
+e701fea467749465f6e9f80f0aa080048c895ee5 lxml-3.6.2
+1220d40cbfe354cbcd19f99abdd21df0ea649037 lxml-4.2.4
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..13ec41b
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,81 @@
+os: linux
+language: python
+
+cache:
+ pip: true
+ directories:
+ - $HOME/.ccache
+ - libs
+
+python:
+ - 3.9
+ - 2.7
+ - 3.8
+ - 3.7
+ - 3.6
+ - 3.5
+
+env:
+ global:
+ - USE_CCACHE=1
+ - CCACHE_SLOPPINESS=pch_defines,time_macros
+ - CCACHE_COMPRESS=1
+ - CCACHE_MAXSIZE=70M
+ - PATH="/usr/lib/ccache:$PATH"
+ - LIBXML2_VERSION=2.9.10
+ - LIBXSLT_VERSION=1.1.34
+ matrix:
+ - STATIC_DEPS=false
+ - STATIC_DEPS=true
+
+matrix:
+ include:
+ - python: 3.8
+ env:
+ - STATIC_DEPS=false
+ - EXTRA_DEPS="docutils pygments sphinx sphinx-rtd-theme"
+ script: make html
+ - python: 3.8
+ env:
+ - STATIC_DEPS=false
+ - EXTRA_DEPS="coverage<5"
+ - python: 3.8
+ env:
+ - STATIC_DEPS=true
+ - LIBXML2_VERSION=2.9.2 # minimum version requirements
+ - LIBXSLT_VERSION=1.1.27
+ - python: pypy
+ env: STATIC_DEPS=false
+ - python: pypy3
+ env: STATIC_DEPS=false
+ - python: 3.8
+ env: STATIC_DEPS=false
+ arch: arm64
+ - python: 3.8
+ env: STATIC_DEPS=true
+ arch: arm64
+ - python: 3.8
+ env: STATIC_DEPS=false
+ arch: ppc64le
+ - python: 3.8
+ env: STATIC_DEPS=true
+ arch: ppc64le
+ allow_failures:
+ - python: pypy
+ - python: pypy3
+
+install:
+ - pip install -U pip wheel
+ - if [ -z "${TRAVIS_PYTHON_VERSION##*-dev}" ];
+ then pip install --install-option=--no-cython-compile https://github.com/cython/cython/archive/master.zip;
+ else pip install -r requirements.txt;
+ fi
+ - pip install -U beautifulsoup4 cssselect html5lib rnc2rng ${EXTRA_DEPS}
+
+script:
+ - CFLAGS="-O0 -g -fPIC" python -u setup.py build_ext --inplace
+ $(if [ -n "${TRAVIS_PYTHON_VERSION##2.*}" -a -n "${TRAVIS_PYTHON_VERSION##3.[34]*}" ]; then echo -n " -j7 "; fi )
+ $(if [ -n "$EXTRA_DEPS" -a -z "${EXTRA_DEPS##*coverage*}" ]; then echo -n "--with-coverage"; fi )
+ - ccache -s || true
+ - CFLAGS="-O0 -g -fPIC" PYTHONUNBUFFERED=x make test
+ - ccache -s || true
diff --git a/CHANGES.txt b/CHANGES.txt
new file mode 100644
index 0000000..22f4d45
--- /dev/null
+++ b/CHANGES.txt
@@ -0,0 +1,4255 @@
+==============
+lxml changelog
+==============
+
+4.6.3 (2021-03-21)
+==================
+
+Bugs fixed
+----------
+
+* A vulnerability (CVE-2021-28957) was discovered in the HTML Cleaner by Kevin Chung,
+ which allowed JavaScript to pass through. The cleaner now removes the HTML5
+ ``formaction`` attribute.
+
+
+4.6.2 (2020-11-26)
+==================
+
+Bugs fixed
+----------
+
+* A vulnerability (CVE-2020-27783) was discovered in the HTML Cleaner by Yaniv Nizry,
+ which allowed JavaScript to pass through. The cleaner now removes more sneaky
+ "style" content.
+
+
+4.6.1 (2020-10-18)
+==================
+
+Bugs fixed
+----------
+
+* A vulnerability was discovered in the HTML Cleaner by Yaniv Nizry, which allowed
+ JavaScript to pass through. The cleaner now removes more sneaky "style" content.
+
+
+4.6.0 (2020-10-17)
+==================
+
+Features added
+--------------
+
+* GH#310: ``lxml.html.InputGetter`` supports ``__len__()`` to count the number of input fields.
+ Patch by Aidan Woolley.
+
+* ``lxml.html.InputGetter`` has a new ``.items()`` method to ease processing all input fields.
+
+* ``lxml.html.InputGetter.keys()`` now returns the field names in document order.
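+
+  A short sketch of the three new accessors (document content made up for
+  illustration)::
+
+      from lxml import html
+
+      doc = html.fromstring(
+          '<html><body><form>'
+          '<input name="a" value="1"/><input name="b" value="2"/>'
+          '</form></body></html>')
+      form = doc.forms[0]
+      print(len(form.inputs))           # 2, via the new __len__()
+      print(list(form.inputs.keys()))   # ['a', 'b'], in document order
+      for name, field in form.inputs.items():
+          print(name, field.value)      # name and value of each input field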
+
+* GH-309: The API documentation is now generated using ``sphinx-apidoc``.
+ Patch by Chris Mayo.
+
+Bugs fixed
+----------
+
+* LP#1869455: C14N 2.0 serialisation failed for unprefixed attributes
+ when a default namespace was defined.
+
+* ``TreeBuilder.close()`` raised ``AssertionError`` in some error cases where it
+ should have raised ``XMLSyntaxError``. It now raises a combined exception to
+ keep up backwards compatibility, while switching to ``XMLSyntaxError`` as an
+ interface.
+
+
+4.5.2 (2020-07-09)
+==================
+
+Bugs fixed
+----------
+
+* ``Cleaner()`` now validates that only known configuration options can be set.
+
+* LP#1882606: ``Cleaner.clean_html()`` discarded comments and PIs regardless of the
+ corresponding configuration option, if ``remove_unknown_tags`` was set.
+
+* LP#1880251: Instead of globally overwriting the document loader in libxml2, lxml now
+ sets it per parser run, which improves the interoperability with other users of libxml2
+ such as libxmlsec.
+
+* LP#1881960: Fix build in CPython 3.10 by using Cython 0.29.21.
+
+* The setup options "--with-xml2-config" and "--with-xslt-config" were accidentally renamed
+ to "--xml2-config" and "--xslt-config" in 4.5.1 and are now available again.
+
+
+4.5.1 (2020-05-19)
+==================
+
+Bugs fixed
+----------
+
+* LP#1570388: Fix failures when serialising documents larger than 2GB in some cases.
+
+* LP#1865141, GH#298: ``QName`` values were not accepted by the ``el.iter()`` method.
+ Patch by xmo-odoo.
+
+* LP#1863413, GH#297: The build failed to detect libraries on Linux that are only
+ configured via pkg-config.
+ Patch by Hugh McMaster.
+
+
+4.5.0 (2020-01-29)
+==================
+
+Features added
+--------------
+
+* A new function ``indent()`` was added to insert tail whitespace for pretty-printing
+ an XML tree.
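+
+  A minimal usage sketch (example tree made up for illustration)::
+
+      from lxml import etree
+
+      root = etree.fromstring("<root><a><b/></a></root>")
+      etree.indent(root, space="  ")    # inserts tail whitespace in place
+      print(etree.tostring(root).decode())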
+
+Bugs fixed
+----------
+
+* LP#1857794: Tail text of nodes that get removed from a document using item
+ deletion disappeared silently instead of sticking with the node that was removed.
+
+Other changes
+-------------
+
+* MacOS builds are 64-bit-only by default.
+ Set CFLAGS and LDFLAGS explicitly to override it.
+
+* Linux/MacOS binary wheels now use libxml2 2.9.10 and libxslt 1.1.34.
+
+* LP#1840234: The package version number is now available as ``lxml.__version__``.
+
+
+4.4.3 (2020-01-28)
+==================
+
+Bugs fixed
+----------
+
+* LP#1844674: ``itertext()`` was missing tail text of comments and PIs since 4.4.0.
+
+
+4.4.2 (2019-11-25)
+==================
+
+Bugs fixed
+----------
+
+* LP#1835708: ``ElementInclude`` incorrectly rejected repeated non-recursive
+ includes as recursive.
+ Patch by Rainer Hausdorf.
+
+
+4.4.1 (2019-08-11)
+==================
+
+Bugs fixed
+----------
+
+* LP#1838252: The order of an OrderedDict was lost in 4.4.0 when passing it as
+ attrib mapping during element creation.
+
+* LP#1838521: The package metadata now lists the supported Python versions.
+
+
+4.4.0 (2019-07-27)
+==================
+
+Features added
+--------------
+
+* ``Element.clear()`` accepts a new keyword argument ``keep_tail=True`` to clear
+ everything but the tail text. This is helpful in some document-style use cases
+ and for clearing the current element in ``iterparse()`` and pull parsing.
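+
+  A small sketch of the difference (example tree made up for illustration)::
+
+      from lxml import etree
+
+      root = etree.fromstring("<root><a x='1'>text</a>tail</root>")
+      root[0].clear(keep_tail=True)     # drops text, children and attributes
+      etree.tostring(root)              # b'<root><a/>tail</root>'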
+
+* When creating attributes or namespaces from a dict in Python 3.6+, lxml now
+ preserves the original insertion order of that dict, instead of always sorting
+ the items by name. A similar change was made for ElementTree in CPython 3.8.
+ See https://bugs.python.org/issue34160
+
+* Integer elements in ``lxml.objectify`` implement the ``__index__()`` special method.
+
+* GH#269: Read-only elements in XSLT were missing the ``nsmap`` property.
+ Original patch by Jan Pazdziora.
+
+* ElementInclude can now restrict the maximum inclusion depth via a ``max_depth``
+ argument to prevent content explosion. It is limited to 6 by default.
+
+* The ``target`` object of the XMLParser can have ``start_ns()`` and ``end_ns()``
+ callback methods to listen to namespace declarations.
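+
+  A minimal sketch of such a parser target (class name and input made up for
+  illustration; target methods are optional)::
+
+      from lxml import etree
+
+      class NSEvents:
+          def start_ns(self, prefix, uri):
+              print("start-ns", prefix, uri)
+          def end_ns(self, prefix):
+              print("end-ns", prefix)
+          def close(self):
+              return None
+
+      parser = etree.XMLParser(target=NSEvents())
+      etree.fromstring('<a xmlns:p="urn:demo"><p:b/></a>', parser)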
+
+* The ``TreeBuilder`` has new arguments ``comment_factory`` and ``pi_factory`` to
+ pass factories for creating comments and processing instructions, as well as
+ flag arguments ``insert_comments`` and ``insert_pis`` to discard them from the
+ tree when set to false.
+
+* A `C14N 2.0 <https://www.w3.org/TR/xml-c14n2/>`_ implementation was added as
+ ``etree.canonicalize()``, a corresponding ``C14NWriterTarget`` class, and
+ a ``c14n2`` serialisation method.
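+
+  A minimal sketch of the new function (input made up for illustration)::
+
+      from lxml import etree
+
+      print(etree.canonicalize('<root><e   b="2"  a="1"/></root>'))
+      # <root><e a="1" b="2"></e></root>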
+
+Bugs fixed
+----------
+
+* When writing to file paths that contain the URL escape character '%', the file
+ path could wrongly be mangled by URL unescaping and thus write to a different
+ file or directory. Code that writes to file paths that are provided by untrusted
+ sources, but that must work with previous versions of lxml, should best either
+ reject paths that contain '%' characters, or otherwise make sure that the path
+ does not contain maliciously injected '%XX' URL hex escapes for paths like '../'.
+
+* Assigning to Element child slices with negative step could insert the slice at
+ the wrong position, starting too far on the left.
+
+* Assigning to Element child slices with overly large step size could take very
+ long, regardless of the length of the actual slice.
+
+* Assigning to Element child slices of the wrong size could sometimes fail to
+ raise a ValueError (like a list assignment would) and instead assign outside
+ of the original slice bounds or leave parts of it unreplaced.
+
+* The ``comment`` and ``pi`` events in ``iterwalk()`` were never triggered, and
+ instead, comments and processing instructions in the tree were reported as
+ ``start`` elements. Also, when walking an ElementTree (as opposed to its root
+ element), comments and PIs outside of the root element are now reported.
+
+* LP#1827833: The RelaxNG compact syntax support was broken with recent versions
+ of ``rnc2rng``.
+
+* LP#1758553: The HTML elements ``source`` and ``track`` were added to the list
+ of empty tags in ``lxml.html.defs``.
+
+* Registering a prefix other than "xml" for the XML namespace is now rejected.
+
+* Failing to write XSLT output to a file could raise a misleading exception.
+ It now raises ``IOError``.
+
+Other changes
+-------------
+
+* Support for Python 3.4 was removed.
+
+* When using ``Element.find*()`` with prefix-namespace mappings, the empty string
+ is now accepted to define a default namespace, in addition to the previously
+ supported ``None`` prefix. Empty strings are more convenient since they keep
+ all prefix keys in a namespace dict as strings, which simplifies sorting etc.
+
+* The ``ElementTree.write_c14n()`` method has been deprecated in favour of the
+ long preferred ``ElementTree.write(f, method="c14n")``. It will be removed
+ in a future release.
+
+
+4.3.5 (2019-07-27)
+==================
+
+* Rebuilt with Cython 0.29.13 to support Python 3.8.
+
+
+4.3.4 (2019-06-10)
+==================
+
+* Rebuilt with Cython 0.29.10 to support Python 3.8.
+
+
+4.3.3 (2019-03-26)
+==================
+
+Bugs fixed
+----------
+
+* Fix leak of output buffer and unclosed files in ``_XSLTResultTree.write_output()``.
+
+
+4.3.2 (2019-02-29)
+==================
+
+Bugs fixed
+----------
+
+* Crash in 4.3.1 when appending a child subtree with certain text nodes.
+
+Other changes
+-------------
+
+* Built with Cython 0.29.6.
+
+
+4.3.1 (2019-02-08)
+==================
+
+Bugs fixed
+----------
+
+* LP#1814522: Crash when appending a child subtree that contains unsubstituted
+ entity references.
+
+Other changes
+-------------
+
+* Built with Cython 0.29.5.
+
+
+4.3.0 (2019-01-04)
+==================
+
+Features added
+--------------
+
+* The module ``lxml.sax`` is compiled using Cython in order to speed it up.
+
+* GH#267: ``lxml.sax.ElementTreeProducer`` now preserves the namespace prefixes.
+ If two prefixes point to the same URI, the first prefix in alphabetical order
+ is used. Patch by Lennart Regebro.
+
+* Updated ISO-Schematron implementation to 2013 version (now MIT licensed)
+ and the corresponding schema to the 2016 version (with optional "properties").
+
+Other changes
+-------------
+
+* GH#270, GH#271: Support for Python 2.6 and 3.3 was removed.
+ Patch by hugovk.
+
+* The minimum dependency versions were raised to libxml2 2.9.2 and libxslt 1.1.27,
+ which were released in 2014 and 2012 respectively.
+
+* Built with Cython 0.29.2.
+
+
+4.2.6 (2019-01-02)
+==================
+
+Bugs fixed
+----------
+
+* LP#1799755: Fix a DeprecationWarning in Py3.7+.
+
+* Import warnings in Python 3.6+ were resolved.
+
+
+4.2.5 (2018-09-09)
+==================
+
+Bugs fixed
+----------
+
+* Javascript URLs that used URL escaping were not removed by the HTML cleaner.
+ Security problem found by Omar Eissa. (CVE-2018-19787)
+
+
+4.2.4 (2018-08-03)
+==================
+
+Features added
+--------------
+
+* GH#259: Allow using ``pkg-config`` for build configuration.
+ Patch by Patrick Griffis.
+
+Bugs fixed
+----------
+
+* LP#1773749, GH#268: Crash when moving an element to another document with
+ ``Element.insert()``.
+ Patch by Alexander Weggerle.
+
+
+4.2.3 (2018-06-27)
+==================
+
+Bugs fixed
+----------
+
+* Reverted GH#265: lxml links against zlib as a shared library again.
+
+
+4.2.2 (2018-06-22)
+==================
+
+Bugs fixed
+----------
+
+* GH#266: Fix sporadic crash during GC when parse-time schema validation is used
+ and the parser participates in a reference cycle.
+ Original patch by Julien Greard.
+
+* GH#265: lxml no longer links against zlib as a shared library, only on static builds.
+ Patch by Nehal J Wani.
+
+
+4.2.1 (2018-03-21)
+==================
+
+Bugs fixed
+----------
+
+* LP#1755825: ``iterwalk()`` failed to return the 'start' event for the initial
+ element if a tag selector is used.
+
+* LP#1756314: Failure to import 4.2.0 into PyPy due to a missing library symbol.
+
+* LP#1727864, GH#258: Add "-isysroot" linker option on MacOS as needed by XCode 9.
+
+
+4.2.0 (2018-03-13)
+==================
+
+Features added
+--------------
+
+* GH#255: ``SelectElement.value`` returns more standard-compliant and
+ browser-like defaults for non-multi-selects. If no option is selected, the
+ value of the first option is returned (instead of None). If multiple options
+ are selected, the value of the last one is returned (instead of that of the
+ first one). If no options are present (not standard-compliant)
+ ``SelectElement.value`` still returns ``None``.
+
+* GH#261: The ``HTMLParser()`` now supports the ``huge_tree`` option.
+ Patch by stranac.
+
+Bugs fixed
+----------
+
+* LP#1551797: Some XSLT messages were not captured by the transform error log.
+
+* LP#1737825: Crash at shutdown after an interrupted iterparse run with XMLSchema
+ validation.
+
+Other changes
+-------------
+
+
+4.1.1 (2017-11-04)
+==================
+
+* Rebuilt with Cython 0.27.3 to improve support for Py3.7.
+
+
+4.1.0 (2017-10-13)
+==================
+
+Features added
+--------------
+
+* ElementPath supports text predicates for the current node, like "[.='text']".
+
+* ElementPath allows spaces in predicates.
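+
+  A short sketch of both additions (example tree made up for illustration)::
+
+      from lxml import etree
+
+      root = etree.fromstring("<root><a>one</a><a>two</a></root>")
+      print(root.find("a[.='two']").text)     # text predicate on the current node
+      print(root.find("a[. = 'two']").text)   # spaces inside the predicate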
+
+* Custom Element classes and XPath functions can now be registered with a
+ decorator rather than explicit dict assignments.
+
+* Static Linux wheels are now built with link time optimisation (LTO) enabled.
+ This should have a beneficial impact on the overall performance by providing
+ a tighter compiler integration between lxml and libxml2/libxslt.
+
+Bugs fixed
+----------
+
+* LP#1722776: Requesting non-Element objects like comments from a document with
+ ``PythonElementClassLookup`` could fail with a TypeError.
+
+
+4.0.0 (2017-09-17)
+==================
+
+Features added
+--------------
+
+* The ElementPath implementation is now compiled using Cython,
+ which speeds up the ``.find*()`` methods quite significantly.
+
+* The modules ``lxml.builder``, ``lxml.html.diff`` and ``lxml.html.clean``
+ are also compiled using Cython in order to speed them up.
+
+* ``xmlfile()`` supports async coroutines using ``async with`` and ``await``.
+
+* ``iterwalk()`` has a new method ``skip_subtree()`` that prevents walking into
+ the descendants of the current element.
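+
+  A minimal sketch (example tree made up for illustration)::
+
+      from lxml import etree
+
+      root = etree.fromstring("<root><skip><a/></skip><keep/></root>")
+      walker = etree.iterwalk(root, events=("start", "end"))
+      for event, element in walker:
+          if event == "start" and element.tag == "skip":
+              walker.skip_subtree()     # <a/> below <skip> is not reported
+          print(event, element.tag)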
+
+* ``RelaxNG.from_rnc_string()`` accepts a ``base_url`` argument to
+ allow relative resource lookups.
+
+* The XSLT result object has a new method ``.write_output(file)`` that serialises
+ output data into a file according to the ``<xsl:output>`` configuration.
+
+Bugs fixed
+----------
+
+* GH#251: HTML comments were handled incorrectly by the soupparser.
+ Patch by mozbugbox.
+
+* LP#1654544: The html5parser no longer passes the ``useChardet`` option
+ if the input is a Unicode string, unless explicitly requested. When parsing
+ files, the default is to enable it when a URL or file path is passed (because
+ the file is then opened in binary mode), and to disable it when reading from
+ a file(-like) object.
+
+ Note: This is a backwards incompatible change of the default configuration.
+ If your code parses byte strings/streams and depends on character detection,
+ please pass the option ``guess_charset=True`` explicitly, which already worked
+ in older lxml versions.
+
+* LP#1703810: ``etree.fromstring()`` failed to parse UTF-32 data with BOM.
+
+* LP#1526522: Some RelaxNG errors were not reported in the error log.
+
+* LP#1567526: Empty and plain text input raised a TypeError in soupparser.
+
+* LP#1710429: Uninitialised variable usage in HTML diff.
+
+* LP#1415643: The closing tags context manager in ``xmlfile()`` could continue
+ to output end tags even after writing failed with an exception.
+
+* LP#1465357: ``xmlfile.write()`` now accepts and ignores None as input argument.
+
+* Compilation under Py3.7-pre failed due to a modified function signature.
+
+Other changes
+-------------
+
+* The main module source files were renamed from ``lxml.*.pyx`` to plain
+ ``*.pyx`` (e.g. ``etree.pyx``) to simplify their handling in the build
+ process. Care was taken to keep the old header files as fallbacks for
+ code that compiles against the public C-API of lxml, but it might still
+ be worth validating that third-party code does not notice this change.
+
+
+3.8.0 (2017-06-03)
+==================
+
+Features added
+--------------
+
+* ``ElementTree.write()`` has a new option ``doctype`` that writes out a
+ doctype string before the serialisation, in the same way as ``tostring()``.
+
+* GH#220: ``xmlfile`` allows switching output methods at an element level.
+ Patch by Burak Arslan.
+
+* LP#1595781, GH#240: added a PyCapsule Python API and C-level API for
+ passing externally generated libxml2 documents into lxml.
+
+* GH#244: error log entries have a new property ``path`` with an XPath
+ expression (if known, None otherwise) that points to the tree element
+ responsible for the error. Patch by Bob Kline.
+
+* The namespace prefix mapping that can be used in ElementPath now injects
+ a default namespace when passing a None prefix.
+
+Bugs fixed
+----------
+
+* GH#238: Character escapes were not hex-encoded in the ``xmlfile`` serialiser.
+ Patch by matejcik.
+
+* GH#229: fix for externally created XML documents. Patch by Theodore Dubois.
+
+* LP#1665241, GH#228: Form data handling in lxml.html no longer strips the
+ option values specified in form attributes but only the text values.
+ Patch by Ashish Kulkarni.
+
+* LP#1551797: revert previous fix for XSLT error logging as it breaks
+ multi-threaded XSLT processing.
+
+* LP#1673355, GH#233: ``fromstring()`` html5parser failed to parse byte strings.
+
+Other changes
+-------------
+
+* The previously undocumented ``docstring`` option in ``ElementTree.write()``
+ produces a deprecation warning and will eventually be removed.
+
+
+3.7.4 (2017-??-??)
+==================
+
+Bugs fixed
+----------
+
+* LP#1551797: revert previous fix for XSLT error logging as it breaks
+ multi-threaded XSLT processing.
+
+* LP#1673355, GH#233: ``fromstring()`` html5parser failed to parse byte strings.
+
+
+3.7.3 (2017-02-18)
+==================
+
+Bugs fixed
+----------
+
+* GH#218 was ineffective in Python 3.
+
+* GH#222: ``lxml.html.submit_form()`` failed in Python 3.
+ Patch by Jakub Wilk.
+
+
+3.7.2 (2017-01-08)
+==================
+
+* GH#220: ``xmlfile`` allows switching output methods at an element level.
+ Patch by Burak Arslan.
+
+Bugs fixed
+----------
+
+* Work around installation problems in recent Python 2.7 versions
+ due to FTP download failures.
+
+* GH#219: ``xmlfile.element()`` was not properly quoting attribute values.
+ Patch by Burak Arslan.
+
+* GH#218: ``xmlfile.element()`` was not properly escaping text content of
+ script/style tags. Patch by Burak Arslan.
+
+
+3.7.1 (2016-12-23)
+==================
+
+* No source changes, issued only to solve problems with the
+ binary packages released for 3.7.0.
+
+
+3.7.0 (2016-12-10)
+==================
+
+Features added
+--------------
+
+* GH#217: ``XMLSyntaxError`` now behaves more like its ``SyntaxError``
+ baseclass. Patch by Philipp A.
+
+* GH#216: ``HTMLParser()`` now supports the same ``collect_ids`` parameter
+ as ``XMLParser()``. Patch by Burak Arslan.
+
+* GH#210: Allow specifying a serialisation method in ``xmlfile.write()``.
+ Patch by Burak Arslan.
+
+* GH#203: New option ``default_doctype`` in ``HTMLParser`` that allows
+ disabling the automatic doctype creation. Patch by Shadab Zafar.
+
+* GH#201: Calling the method ``.set('attrname')`` without value argument
+ (or ``None``) on HTML elements creates an attribute without value that
+ serialises like ``<div attrname></div>``. Patch by Daniel Holth.
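+
+  A minimal sketch (attribute name made up for illustration)::
+
+      from lxml import html
+
+      div = html.fromstring('<div></div>')
+      div.set('hidden')                 # value defaults to None
+      html.tostring(div)                # b'<div hidden></div>'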
+
+* GH#197: Ignore form input fields in ``form_values()`` when they are
+ marked as ``disabled`` in HTML. Patch by Kristian Klemon.
+
+Bugs fixed
+----------
+
+* GH#206: File name and line number were missing from XSLT error messages.
+ Patch by Marcus Brinkmann.
+
+Other changes
+-------------
+
+* Log entries no longer allow anything but plain string objects as message text
+ and file name.
+
+* ``zlib`` is included in the list of statically built libraries.
+
+
+3.6.4 (2016-08-20)
+==================
+
+* GH#204, LP#1614693: build fix for MacOS-X.
+
+
+3.6.3 (2016-08-18)
+==================
+
+* LP#1614603: change linker flags to build multi-linux wheels
+
+
+3.6.2 (2016-08-18)
+==================
+
+* LP#1614603: release without source changes to provide cleanly built Linux wheels
+
+
+3.6.1 (2016-07-24)
+==================
+
+Features added
+--------------
+
+* GH#180: Separate option ``inline_style`` for Cleaner that only removes ``style``
+ attributes instead of all styles. Patch by Christian Pedersen.
+
+* GH#196: Windows build support for Python 3.5. Contribution by Maximilian Hils.
+
+Bugs fixed
+----------
+
+* GH#199: Exclude ``file`` fields from ``FormElement.form_values`` (as browsers do).
+ Patch by Tomas Divis.
+
+* GH#198, LP#1568167: Try to provide base URL from ``Resolver.resolve_string()``.
+ Patch by Michael van Tellingen.
+
+* GH#191: More accurate float serialisation in ``objectify.FloatElement``.
+ Patch by Holger Joukl.
+
+* LP#1551797: Repair XSLT error logging. Patch by Marcus Brinkmann.
+
+
+3.6.0 (2016-03-17)
+==================
+
+Features added
+--------------
+
+* GH#187: Now supports (only) version 5.x and later of PyPy.
+ Patch by Armin Rigo.
+
+* GH#181: Direct support for ``.rnc`` files in `RelaxNG()` if ``rnc2rng``
+ is installed. Patch by Dirkjan Ochtman.
+
+Bugs fixed
+----------
+
+* GH#189: Static builds honour FTP proxy configurations when downloading
+ the external libs. Patch by Youhei Sakurai.
+
+* GH#186: Soupparser failed to process entities in Python 3.x.
+ Patch by Duncan Morris.
+
+* GH#185: Rare encoding related ``TypeError`` on import was fixed.
+ Patch by Petr Demin.
+
+
+3.5.0 (2015-11-13)
+==================
+
+Bugs fixed
+----------
+
+* Unicode string results failed XPath queries in PyPy.
+
+* LP#1497051: HTML target parser failed to terminate on exceptions
+ and continued parsing instead.
+
+* Deprecated API usage in doctestcompare.
+
+
+3.5.0b1 (2015-09-18)
+====================
+
+Features added
+--------------
+
+* ``cleanup_namespaces()`` accepts a new argument ``keep_ns_prefixes``
+ that does not remove definitions of the provided prefix-namespace
+ mapping from the tree.
+
+* ``cleanup_namespaces()`` accepts a new argument ``top_nsmap`` that
+ moves definitions of the provided prefix-namespace mapping to the
+ top of the tree.
+
+* LP#1490451: ``Element`` objects gained a ``cssselect()`` method as
+ known from ``lxml.html``. Patch by Simon Sapin.
+
+* API functions and methods behave and look more like Python functions,
+ which allows introspection on them etc. One side effect to be aware of
+ is that the functions now bind as methods when assigned to a class
+ variable. A quick fix is to wrap them in ``staticmethod()`` (as for
+ normal Python functions).
+
+* ISO-Schematron support gained an option ``error_finder`` that allows
+ passing a filter function for picking validation errors from reports.
+
+* LP#1243600: Elements in ``lxml.html`` gained a ``classes`` property
+ that provides a set-like interface to the ``class`` attribute.
+ Original patch by masklinn.
+
+* LP#1341964: The soupparser now handles DOCTYPE declarations, comments
+ and processing instructions outside of the root element.
+ Patch by Olli Pottonen.
+
+* LP#1421512: The ``docinfo`` of a tree was made editable to allow
+ setting and removing the public ID and system ID of the DOCTYPE.
+ Patch by Olli Pottonen.
+
+* LP#1442427: More work-arounds for quirks and bugs in pypy and pypy3.
+
+* ``lxml.html.soupparser`` now uses BeautifulSoup version 4 instead
+ of version 3 if available.
+
+Bugs fixed
+----------
+
+* Memory errors that occur during tree adaptations (e.g. moving subtrees
+ to foreign documents) could leave the tree in a crash prone state.
+
+* Calling ``process_children()`` in an XSLT extension element without
+ an ``output_parent`` argument failed with a ``TypeError``.
+ Fix by Jens Tröger.
+
+* GH#162: Image data in HTML ``data`` URLs is considered safe and
+ no longer removed by ``lxml.html.clean`` JavaScript cleaner.
+
+* GH#166: Static build could link libraries in wrong order.
+
+* GH#172: Rely a bit more on libxml2 for encoding detection rather than
+ rolling our own in some cases. Patch by Olli Pottonen.
+
+* GH#159: Validity checks for names and string content were tightened
+ to detect the use of illegal characters early. Patch by Olli Pottonen.
+
+* LP#1421921: Comments/PIs before the DOCTYPE declaration were not
+ serialised. Patch by Olli Pottonen.
+
+* LP#659367: Some HTML DOCTYPE declarations were not serialised.
+ Patch by Olli Pottonen.
+
+* LP#1238503: lxml.doctestcompare is now consistent with stdlib's doctest
+ in how it uses ``+`` and ``-`` to refer to unexpected and missing output.
+
+* Empty prefixes are explicitly rejected when a namespace mapping is used
+ with ElementPath to avoid hiding bugs in user code.
+
+* Several problems with PyPy were fixed by switching to Cython 0.23.
+
+
+3.4.4 (2015-04-25)
+==================
+
+Bugs fixed
+----------
+
+* An ElementTree compatibility test added in lxml 3.4.3 that failed in
+ Python 3.4+ was removed again.
+
+
+3.4.3 (2015-04-15)
+==================
+
+Bugs fixed
+----------
+
+* Expression cache in ElementPath was ignored. Fix by Changaco.
+
+* LP#1426868: Passing a default namespace and a prefixed namespace mapping
+ as nsmap into ``xmlfile.element()`` raised a ``TypeError``.
+
+* LP#1421927: DOCTYPE system URLs were incorrectly quoted when containing
+ double quotes. Patch by Olli Pottonen.
+
+* LP#1419354: meta-redirect URLs were incorrectly processed by
+ ``iterlinks()`` if preceded by whitespace.
+
+
+3.4.2 (2015-02-07)
+==================
+
+Bugs fixed
+----------
+
+* LP#1415907: Crash when creating an XMLSchema from a non-root element
+ of an XML document.
+
+* LP#1369362: HTML cleaning failed when hitting processing instructions
+ with pseudo-attributes.
+
+* ``CDATA()`` wrapped content was rejected for tail text.
+
+* CDATA sections were not serialised as tail text of the top-level element.
+
+
+3.4.1 (2014-11-20)
+==================
+
+Features added
+--------------
+
+* New ``htmlfile`` HTML generator to accompany the incremental ``xmlfile``
+ serialisation API. Patch by Burak Arslan.
+
+Bugs fixed
+----------
+
+* ``lxml.sax.ElementTreeContentHandler`` did not initialise its superclass.
+
+
+3.4.0 (2014-09-10)
+==================
+
+Features added
+--------------
+
+* ``xmlfile(buffered=False)`` disables output buffering and flushes the
+ content after each API operation (starting/ending element blocks or writes).
+ A new method ``xf.flush()`` can alternatively be used to explicitly flush
+ the output.
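+
+  A minimal sketch of explicit flushing (output target made up for
+  illustration)::
+
+      from io import BytesIO
+      from lxml import etree
+
+      out = BytesIO()
+      with etree.xmlfile(out) as xf:    # or: etree.xmlfile(out, buffered=False)
+          with xf.element("root"):
+              xf.write("first ")
+              xf.flush()                # push buffered output to 'out' now
+              xf.write("second")
+      print(out.getvalue())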
+
+* ``lxml.html.document_fromstring`` has a new option ``ensure_head_body=True``
+ which will add an empty head and/or body element to the result document if
+ missing.
+
+* ``lxml.html.iterlinks`` now returns links inside meta refresh tags.
+
+* New ``XMLParser`` option ``collect_ids=False`` to disable ID hash table
+ creation. This can substantially speed up parsing of documents with many
+ different IDs that are not used.
+
+* The parser uses per-document hash tables for XML IDs. This reduces the
+ load of the global parser dict and speeds up parsing for documents with
+ many different IDs.
+
+* ``ElementTree.getelementpath(element)`` returns a structural ElementPath
+ expression for the given element, which can be used for lookups later.
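+
+  A minimal sketch (example tree made up for illustration)::
+
+      from lxml import etree
+
+      root = etree.fromstring("<root><a/><a><b/></a></root>")
+      tree = etree.ElementTree(root)
+      path = tree.getelementpath(root[1][0])   # 'a[2]/b'
+      print(tree.find(path).tag)               # 'b'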
+
+* ``xmlfile()`` accepts a new argument ``close=True`` to close file(-like)
+ objects after writing to them. Before, ``xmlfile()`` only closed the file
+ if it had opened it internally.
+
+* Allow "bytearray" type for ASCII text input.
+
+Bugs fixed
+----------
+
+Other changes
+-------------
+
+* LP#400588: decoding errors have become hard errors even in recovery mode.
+ Previously, they could lead to an internal tree representation in a mixed
+ encoding state, which led to very late errors or even silently incorrect
+ behaviour during tree traversal or serialisation.
+
+* Requires Python 2.6, 2.7, 3.2 or later. No longer supports
+ Python 2.4, 2.5 and 3.1, use lxml 3.3.x for those.
+
+* Requires libxml2 2.7.0 or later and libxslt 1.1.23 or later,
+ use lxml 3.3.x with older versions.
+
+
+3.3.6 (2014-08-28)
+==================
+
+Bugs fixed
+----------
+
+* Prevent tree cycle creation when adding Elements as siblings.
+
+* LP#1361948: crash when deallocating Element siblings without parent.
+
+* LP#1354652: crash when traversing internally loaded documents in XSLT
+ extension functions.
+
+
+3.3.5 (2014-04-18)
+==================
+
+Bugs fixed
+----------
+
+* HTML cleaning could fail to strip javascript links that mix control
+ characters into the link scheme.
+
+
+3.3.4 (2014-04-03)
+==================
+
+Features added
+--------------
+
+* Source line numbers above 65535 are available on Elements when
+ using libxml2 2.9 or later.
+
+Bugs fixed
+----------
+
+* ``lxml.html.fragment_fromstring()`` failed for bytes input in Py3.
+
+Other changes
+-------------
+
+
+3.3.3 (2014-03-04)
+==================
+
+Bugs fixed
+----------
+
+* LP#1287118: Crash when using Element subtypes with ``__slots__``.
+
+Other changes
+-------------
+
+* The internal classes ``_LogEntry`` and ``_Attrib`` can no longer be
+ subclassed from Python code.
+
+
+3.3.2 (2014-02-26)
+==================
+
+Bugs fixed
+----------
+
+* The properties ``resolvers`` and ``version``, as well as the methods
+ ``set_element_class_lookup()`` and ``makeelement()``, were lost from
+ ``iterparse`` objects in 3.3.0.
+
+* LP#1222132: instances of ``XMLSchema``, ``Schematron`` and ``RelaxNG``
+ did not clear their local ``error_log`` before running a validation.
+
+* LP#1238500: lxml.doctestcompare mixed up "expected" and "actual" in
+ attribute values.
+
+* Some file I/O tests were failing in MS-Windows due to non-portable temp
+ file usage. Initial patch by Gabi Davar.
+
+* LP#910014: duplicate IDs in a document were not reported by DTD validation.
+
+* LP#1185332: ``tostring(method="html")`` did not use HTML serialisation
+ semantics for trailing tail text. Initial patch by Sylvain Viollon.
+
+* LP#1281139: ``.attrib`` value of Comments lost its mutation methods
+ in 3.3.0. Even though it is empty and immutable, it should still
+ provide the same interface as that returned for Elements.
+
+
+3.3.1 (2014-02-12)
+==================
+
+Features added
+--------------
+
+Bugs fixed
+----------
+
+* LP#1014290: HTML documents parsed with ``parser.feed()`` failed to find
+ elements during tag iteration.
+
+* LP#1273709: Building in PyPy failed due to missing support for
+ ``PyUnicode_Compare()`` and ``PyByteArray_*()`` in PyPy's C-API.
+
+* LP#1274413: Compilation in MSVC failed due to missing "stdint.h" standard
+ header file.
+
+* LP#1274118: iterparse() failed to parse BOM prefixed files.
+
+Other changes
+-------------
+
+
+3.3.0 (2014-01-26)
+==================
+
+Features added
+--------------
+
+Bugs fixed
+----------
+
+* The heuristic that distinguishes file paths from URLs was tightened
+ to produce fewer false negatives.
+
+Other changes
+-------------
+
+
+3.3.0beta5 (2014-01-18)
+=======================
+
+Features added
+--------------
+
+* The PEP 393 unicode parsing support gained a fallback for wchar strings
+ which might still be somewhat common on Windows systems.
+
+Bugs fixed
+----------
+
+* Several error handling problems were fixed throughout the code base that
+ could previously lead to exceptions being silently swallowed or not
+ properly reported.
+
+* The C-API function ``appendChild()`` is now deprecated as it does not
+ propagate exceptions (its return type is ``void``). The new function
+ ``appendChildToElement()`` was added as a safe replacement.
+
+* Passing a string into ``fromstringlist()`` raises an exception instead of
+ parsing the string character by character.
+
+Other changes
+-------------
+
+* Document cleanup code was simplified using the new GC features in
+ Cython 0.20.
+
+
+3.3.0beta4 (2014-01-12)
+=======================
+
+Features added
+--------------
+
+Bugs fixed
+----------
+
+* The (empty) value returned by the ``attrib`` property of Entity and Comment
+ objects was mutable.
+
+* Element class lookup wasn't available for the new pull parsers or when using
+ a custom parser target.
+
+* Setting Element attributes on instantiation with both the ``attrib`` argument
+ and keyword arguments could modify the mapping passed as ``attrib``.
+
+* LP#1266171: DTDs instantiated from internal/external subsets (i.e. through
+ the docinfo property) lost their attribute declarations.
+
+Other changes
+-------------
+
+* Built with Cython 0.20pre (gitrev 012ae82eb) to prepare support for
+ Python 3.4.
+
+
+3.3.0beta3 (2014-01-02)
+=======================
+
+Features added
+--------------
+
+* Unicode string parsing was optimised for Python 3.3 (PEP 393).
+
+Bugs fixed
+----------
+
+* HTML parsing of Unicode strings could misdecode the input on some platforms.
+
+* Crash in xmlfile() when closing open elements out of order in an error case.
+
+Other changes
+-------------
+
+
+3.3.0beta2 (2013-12-20)
+=======================
+
+Features added
+--------------
+
+* ``iterparse()`` supports the ``recover`` option.
+
+Bugs fixed
+----------
+
+* Crash in ``iterparse()`` for HTML parsing.
+
+* Crash in target parsing with attributes.
+
+Other changes
+-------------
+
+* The safety check in the read-only tree implementation (e.g. used by
+ ``PythonElementClassLookup``) raises a more appropriate ``ReferenceError``
+ for illegal access after tree disposal instead of an ``AssertionError``.
+ This should only impact test code that specifically checks the original
+ behaviour.
+
+
+3.3.0beta1 (2013-12-12)
+=======================
+
+Features added
+--------------
+
+* New option ``handle_failures`` in ``make_links_absolute()`` and
+ ``resolve_base_href()`` (lxml.html) that enables ignoring or
+ discarding links that fail to parse as URLs.
+
+* New parser classes ``XMLPullParser`` and ``HTMLPullParser`` for
+ incremental parsing, as implemented for ElementTree in Python 3.4.
+
+* ``iterparse()`` enables recovery mode by default for HTML parsing
+ (``html=True``).
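+
+As a rough illustration of the new pull parser API described above (the XML
+content is made up for the example)::
+
+    from lxml import etree
+
+    parser = etree.XMLPullParser(events=('start', 'end'))
+    parser.feed(b'<root><child>data')
+    parser.feed(b'</child></root>')
+
+    # e.g. ('start', root), ('start', child), ('end', child), ...
+    events = list(parser.read_events())
+    root = parser.close()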
+
+Bugs fixed
+----------
+
+* LP#1255132: crash when trying to run validation over non-Element (e.g.
+ comment or PI).
+
+* Error messages in the log and in exception messages that originated
+ from libxml2 could accidentally be picked up from preceding warnings
+ instead of the actual error.
+
+* The ``ElementMaker`` in lxml.objectify did not accept a dict as
+ argument for adding attributes to the element it is building. This
+ now works the same way as in lxml.builder.
+
+* LP#1228881: ``repr(XSLTAccessControl)`` failed in Python 3.
+
+* Raise ``ValueError`` when trying to append an Element to itself or
+ to one of its own descendants, instead of running into an infinite
+ loop.
+
+* LP#1206077: htmldiff discarded whitespace from the output.
+
+* Compressed plain-text serialisation to file-like objects was broken.
+
+* lxml.html.formfill: Fix textarea form filling.
+ The textarea used to be cleared before the new content was set,
+ which removed the name attribute.
+
+
+Other changes
+-------------
+
+* Some basic API classes use freelists internally for faster
+ instantiation. This can speed up some ``iterparse()`` scenarios,
+ for example.
+
+* ``iterparse()`` was rewritten to use the new ``*PullParser``
+ classes internally instead of being a parser itself.
+
+
+3.2.5 (2014-01-02)
+==================
+
+Features added
+--------------
+
+Bugs fixed
+----------
+
+* Crash in xmlfile() when closing open elements out of order in an error case.
+
+* Crash in target parsing with attributes.
+
+* LP#1255132: crash when trying to run validation over non-Element (e.g.
+ comment or PI).
+
+Other changes
+-------------
+
+
+3.2.4 (2013-11-07)
+==================
+
+Features added
+--------------
+
+Bugs fixed
+----------
+
+* Memory leak when creating an XPath evaluator in a thread.
+
+* LP#1228881: ``repr(XSLTAccessControl)`` failed in Python 3.
+
+* Raise ``ValueError`` when trying to append an Element to itself or
+ to one of its own descendants.
+
+* LP#1206077: htmldiff discarded whitespace from the output.
+
+* Compressed plain-text serialisation to file-like objects was broken.
+
+Other changes
+-------------
+
+
+3.2.3 (2013-07-28)
+==================
+
+Bugs fixed
+----------
+
+* Fix support for Python 2.4 which was lost in 3.2.2.
+
+
+3.2.2 (2013-07-28)
+==================
+
+Features added
+--------------
+
+Bugs fixed
+----------
+
+* LP#1185701: spurious XMLSyntaxError after finishing iterparse().
+
+* Crash in lxml.objectify during xsi annotation.
+
+Other changes
+-------------
+
+* Return values of user provided element class lookup methods are now
+ validated against the type of the XML node they represent to prevent
+ API class mismatches.
+
+
+3.2.1 (2013-05-11)
+==================
+
+Features added
+--------------
+
+* The methods ``apply_templates()`` and ``process_children()`` of XSLT
+ extension elements have gained two new boolean options ``elements_only``
+ and ``remove_blank_text`` that discard either all strings or whitespace-only
+ strings from the result list.
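+
+A minimal sketch of an XSLT extension element using the new
+``elements_only`` option; the extension class still has to be registered
+through the ``extensions`` mapping of ``etree.XSLT()``::
+
+    from lxml import etree
+
+    class ElementsOnlyExtension(etree.XSLTExtension):
+        def execute(self, context, self_node, input_node, output_parent):
+            # apply the stylesheet templates to the input node, keeping
+            # only Element results and discarding string output
+            for result in self.apply_templates(context, input_node,
+                                               elements_only=True):
+                output_parent.append(result)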
+
+Bugs fixed
+----------
+
+* When moving Elements to another tree, the namespace cleanup mechanism
+ no longer drops namespace prefixes from attributes for which it finds
+ a default namespace declaration, to prevent them from appearing as
+ unnamespaced attributes after serialisation.
+
+* Returning non-type objects from a custom class lookup method could lead
+ to a crash.
+
+* Instantiating and using subtypes of Comments and ProcessingInstructions
+ crashed.
+
+Other changes
+-------------
+
+
+3.2.0 (2013-04-28)
+==================
+
+Features added
+--------------
+
+Bugs fixed
+----------
+
+* LP#690319: Leading whitespace could change the behaviour of the string
+ parsing functions in ``lxml.html``.
+
+* LP#599318: The string parsing functions in ``lxml.html`` are more robust
+ in the face of uncommon HTML content like framesets or missing body tags.
+ Patch by Stefan Seelmann.
+
+* LP#712941: I/O errors while trying to access files with paths that contain
+ non-ASCII characters could raise ``UnicodeDecodeError`` instead of properly
+ reporting the ``IOError``.
+
+* LP#673205: Parsing from in-memory strings disabled network access in the
+ default parser and made subsequent attempts to parse from a URL fail.
+
+* LP#971754: lxml.html.clean appends 'nofollow' to 'rel' attributes instead
+ of overwriting the current value.
+
+* LP#715687: lxml.html.clean no longer discards scripts that are explicitly
+ allowed by the user provided whitelist. Patch by Christine Koppelt.
+
+Other changes
+-------------
+
+
+3.1.2 (2013-04-12)
+==================
+
+Features added
+--------------
+
+Bugs fixed
+----------
+
+* LP#1136509: Passing attributes through the namespace-unaware API of
+ the sax bridge (i.e. the ``handler.startElement()`` method) failed
+ with a ``TypeError``. Patch by Mike Bayer.
+
+* LP#1123074: Fix serialisation error in XSLT output when converting
+ the result tree to a Unicode string.
+
+* GH#105: Replace illegal usage of ``xmlBufLength()`` in libxml2 2.9.0
+ by properly exported API function ``xmlBufUse()``.
+
+Other changes
+-------------
+
+
+3.1.1 (2013-03-29)
+==================
+
+Features added
+--------------
+
+Bugs fixed
+----------
+
+* LP#1160386: Write access to ``lxml.html.FormElement.fields`` raised
+ an AttributeError in Py3.
+
+* Illegal memory access during cleanup in incremental xmlfile writer.
+
+Other changes
+-------------
+
+* The externally useless class ``lxml.etree._BaseParser`` was removed
+ from the module dict.
+
+
+3.1.0 (2013-02-10)
+==================
+
+Features added
+--------------
+
+* GH#89: lxml.html.clean allows overriding the set of attributes that it
+ considers 'safe'. Patch by Francis Devereux.
+
+Bugs fixed
+----------
+
+* LP#1104370: ``copy.copy(el.attrib)`` raised an exception. It now returns
+ a copy of the attributes as a plain Python dict.
+
+* GH#95: When used with namespace prefixes, the ``el.find*()`` methods
+ always used the first namespace mapping that was provided for each
+ path expression instead of using the one that was actually passed
+ in for the current run.
+
+* LP#1092521, GH#91: Fix undefined C symbol in Python runtimes compiled
+ without threading support. Patch by Ulrich Seidl.
+
+Other changes
+-------------
+
+
+3.1beta1 (2012-12-21)
+=====================
+
+Features added
+--------------
+
+* New build-time option ``--with-unicode-strings`` for Python 2 that
+ makes the API always return Unicode strings for names and text
+ instead of byte strings for plain ASCII content.
+
+* New incremental XML file writing API ``etree.xmlfile()``.
+
+* E factory in lxml.objectify is callable to simplify the creation of
+ tags with non-identifier names without having to resort to getattr().
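+
+A short usage sketch of the incremental ``xmlfile()`` writing API (the
+output file name is illustrative)::
+
+    from lxml import etree
+
+    with etree.xmlfile('output.xml', encoding='utf-8') as xf:
+        with xf.element('root'):
+            for value in ('a', 'b', 'c'):
+                el = etree.Element('item')
+                el.text = value
+                xf.write(el)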
+
+Bugs fixed
+----------
+
+* When starting from a non-namespaced element in lxml.objectify, searching
+ for a child without explicitly specifying a namespace incorrectly found
+ namespaced elements with the requested local name, instead of restricting
+ the search to non-namespaced children.
+
+* GH#85: Deprecation warnings were fixed for Python 3.x.
+
+* GH#33: lxml.html.fromstring() failed to accept bytes input in Py3.
+
+* LP#1080792: Static build of libxml2 2.9.0 failed due to missing file.
+
+Other changes
+-------------
+
+* The externally useless class ``_ObjectifyElementMakerCaller`` was
+ removed from the module API of lxml.objectify.
+
+* LP#1075622: lxml.builder is faster for adding text to elements with
+ many children. Patch by Anders Hammarquist.
+
+
+3.0.2 (2012-12-14)
+==================
+
+Features added
+--------------
+
+Bugs fixed
+----------
+
+* Fix crash during interpreter shutdown by switching to Cython 0.17.3 for building.
+
+Other changes
+-------------
+
+
+3.0.1 (2012-10-14)
+==================
+
+Features added
+--------------
+
+Bugs fixed
+----------
+
+* LP#1065924: Element proxies could disappear during garbage collection
+ in PyPy without proper cleanup.
+
+* GH#71: Failure to work with libxml2 2.6.x.
+
+* LP#1065139: static MacOS-X build failed in Py3.
+
+Other changes
+-------------
+
+
+3.0 (2012-10-08)
+================
+
+Features added
+--------------
+
+Bugs fixed
+----------
+
+* End-of-file handling was incorrect in iterparse() when reading from
+ a low-level C file stream and failed in libxml2 2.9.0 due to its
+ improved consistency checks.
+
+Other changes
+-------------
+
+* The build no longer uses Cython by default unless the generated C files
+ are missing. To use Cython, pass the option "--with-cython". To ignore
+ the fatal build error when Cython is required but not available (e.g. to
+ run special setup.py commands that do not actually run a build), pass
+ "--without-cython".
+
+
+3.0beta1 (2012-09-26)
+=====================
+
+Features added
+--------------
+
+* Python level access to (optional) libxml2 memory debugging features
+ to simplify debugging of memory leaks etc.
+
+Bugs fixed
+----------
+
+* Fix a memory leak in XPath by switching to Cython 0.17.1.
+
+* Some tests were adapted to work with PyPy.
+
+Other changes
+-------------
+
+* The code was adapted to work with the upcoming libxml2 2.9.0 release.
+
+
+3.0alpha2 (2012-08-23)
+======================
+
+Features added
+--------------
+
+* The ``.iter()`` method of elements now accepts ``tag`` arguments like
+ ``"{*}name"`` to search for elements with a given local name in any
+ namespace. With this addition, all combinations of wildcards now work
+ as expected:
+ ``"{ns}name"``, ``"{}name"``, ``"{*}name"``, ``"{ns}*"``, ``"{}*"``
+ and ``"{*}*"``. Note that ``"name"`` is equivalent to ``"{}name"``,
+ but ``"*"`` is ``"{*}*"``.
+ The same change applies to the ``.getiterator()``, ``.itersiblings()``,
+ ``.iterancestors()``, ``.iterdescendants()``, ``.iterchildren()``
+ and ``.itertext()`` methods; the ``strip_attributes()``,
+ ``strip_elements()`` and ``strip_tags()`` functions as well as the
+ ``iterparse()`` class. Patch by Simon Sapin.
+
+* C14N allows specifying the inclusive prefixes to be promoted
+ to top-level during exclusive serialisation.
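+
+For illustration, the extended wildcard support in ``.iter()`` can be used
+as follows (the namespace URI is made up for the example)::
+
+    from lxml import etree
+
+    root = etree.XML(
+        '<root xmlns:n="http://example.com/ns"><a/><n:a/><b/></root>')
+
+    # all elements with local name 'a', in any namespace
+    print([el.tag for el in root.iter('{*}a')])
+
+    # all elements in the example namespace
+    print([el.tag for el in root.iter('{http://example.com/ns}*')])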
+
+Bugs fixed
+----------
+
+* Passing long Unicode strings into the ``feed()`` parser interface
+ failed to read the entire string.
+
+Other changes
+-------------
+
+
+3.0alpha1 (2012-07-31)
+======================
+
+Features added
+--------------
+
+* Initial support for building in PyPy (through cpyext).
+
+* DTD objects gained an API that allows read access to their
+ declarations.
+
+* ``xpathgrep.py`` gained support for parsing line-by-line (e.g.
+ from grep output) and for surrounding the output with a new root
+ tag.
+
+* ``E-factory`` in ``lxml.builder`` accepts subtypes of known data
+ types (such as string subtypes) when building elements around them.
+
+* Tree iteration and ``iterparse()`` with a selective ``tag``
+ argument supports passing a set of tags. Tree nodes will be
+ returned by the iterators if they match any of the tags.
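+
+A small sketch of passing a set of tags to ``iterparse()`` (tag names are
+illustrative)::
+
+    from io import BytesIO
+    from lxml import etree
+
+    xml = BytesIO(b'<root><a/><b/><c/></root>')
+
+    # only 'end' events for the tags in the given set are reported
+    for event, element in etree.iterparse(xml, tag={'a', 'c'}):
+        print(event, element.tag)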
+
+Bugs fixed
+----------
+
+* The ``.find*()`` methods in ``lxml.objectify`` no longer use XPath
+ internally, which makes them faster in many cases (especially when
+ short-circuiting after the first or only a few matching elements) and fixes
+ some behavioural differences compared to ``lxml.etree``. Note that
+ this means that they no longer support arbitrary XPath expressions
+ but only the subset that the ``ElementPath`` language supports.
+ The previous implementation was also redundant with the normal
+ XPath support, which can be used as a replacement.
+
+* ``el.find('*')`` could accidentally return a comment or processing
+ instruction that happened to be in the wrong spot. (Same for the
+ other ``.find*()`` methods.)
+
+* The error logging is less intrusive and avoids a global setup where
+ possible.
+
+* Fixed undefined names in html5lib parser.
+
+* ``xpathgrep.py`` did not work in Python 3.
+
+* ``Element.attrib.update()`` did not accept an ``attrib`` of
+ another Element as parameter.
+
+* For subtypes of ``ElementBase`` that make the ``.text`` or ``.tail``
+ properties immutable (as in objectify, for example), inserting text
+ when creating Elements through the E-Factory feature of the class
+ constructor would fail with an exception, stating that the text
+ cannot be modified.
+
+Other changes
+--------------
+
+* The code base was overhauled to properly use 'const' where the API
+ of libxml2 and libxslt requests it. This also has an impact on the
+ public C-API of lxml itself, as defined in ``etreepublic.pxd``, as
+ well as the provided declarations in the ``lxml/includes/`` directory.
+ Code that uses these declarations may have to be adapted. On the
+ plus side, this fixes several C compiler warnings, also for user
+ code, thus making it easier to spot real problems again.
+
+* The functionality of "lxml.cssselect" was moved into a separate PyPI
+ package called "cssselect". To continue using it, you must install
+ that package separately. The "lxml.cssselect" module is still
+ available and provides the same interface, provided the "cssselect"
+ package can be imported at runtime.
+
+* Element attributes passed in as an ``attrib`` dict or as keyword
+ arguments are now sorted by (namespaced) name before being created
+ to make their order predictable for serialisation and iteration.
+ Note that adding or deleting attributes afterwards does not take
+ that order into account, i.e. setting a new attribute appends it
+ after the existing ones.
+
+* Several classes that are for internal use only were removed
+ from the ``lxml.etree`` module dict:
+ ``_InputDocument, _ResolverRegistry, _ResolverContext, _BaseContext,
+ _ExsltRegExp, _IterparseContext, _TempStore, _ExceptionContext,
+ __ContentOnlyElement, _AttribIterator, _NamespaceRegistry,
+ _ClassNamespaceRegistry, _FunctionNamespaceRegistry,
+ _XPathFunctionNamespaceRegistry, _ParserDictionaryContext,
+ _FileReaderContext, _ParserContext, _PythonSaxParserTarget,
+ _TargetParserContext, _ReadOnlyProxy, _ReadOnlyPIProxy,
+ _ReadOnlyEntityProxy, _ReadOnlyElementProxy, _OpaqueNodeWrapper,
+ _OpaqueDocumentWrapper, _ModifyContentOnlyProxy,
+ _ModifyContentOnlyPIProxy, _ModifyContentOnlyEntityProxy,
+ _AppendOnlyElementProxy, _SaxParserContext, _FilelikeWriter,
+ _ParserSchemaValidationContext, _XPathContext,
+ _XSLTResolverContext, _XSLTContext, _XSLTQuotedStringParam``
+
+* Several internal classes can no longer be inherited from:
+ ``_InputDocument, _ResolverRegistry, _ExsltRegExp, _ElementUnicodeResult,
+ _IterparseContext, _TempStore, _AttribIterator, _ClassNamespaceRegistry,
+ _XPathFunctionNamespaceRegistry, _ParserDictionaryContext,
+ _FileReaderContext, _PythonSaxParserTarget, _TargetParserContext,
+ _ReadOnlyPIProxy, _ReadOnlyEntityProxy, _OpaqueDocumentWrapper,
+ _ModifyContentOnlyPIProxy, _ModifyContentOnlyEntityProxy,
+ _AppendOnlyElementProxy, _FilelikeWriter, _ParserSchemaValidationContext,
+ _XPathContext, _XSLTResolverContext, _XSLTContext, _XSLTQuotedStringParam,
+ _XSLTResultTree, _XSLTProcessingInstruction``
+
+
+2.3.6 (2012-09-28)
+==================
+
+Features added
+--------------
+
+Bugs fixed
+----------
+
+* Passing long Unicode strings into the ``feed()`` parser interface
+ failed to read the entire string.
+
+Other changes
+--------------
+
+
+2.3.5 (2012-07-31)
+==================
+
+Features added
+--------------
+
+Bugs fixed
+----------
+
+* Crash when merging text nodes in ``element.remove()``.
+
+* Crash in sax/target parser when reporting empty doctype.
+
+Other changes
+--------------
+
+
+2.3.4 (2012-03-26)
+==================
+
+Features added
+--------------
+
+Bugs fixed
+----------
+
+* Crash when building an nsmap (Element property) with empty
+ namespace URIs.
+
+* Crash due to race condition when errors (or user messages) occur
+ during threaded XSLT processing.
+
+* XSLT stylesheet compilation could ignore compilation errors.
+
+Other changes
+--------------
+
+
+2.3.3 (2012-01-04)
+==================
+
+Features added
+--------------
+
+* ``lxml.html.tostring()`` gained new serialisation options
+ ``with_tail`` and ``doctype``.
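+
+A rough sketch of the new serialisation options in ``lxml.html.tostring()``::
+
+    import lxml.html
+
+    p = lxml.html.fromstring('<p>Hello</p>')
+    p.tail = ' trailing text'
+
+    print(lxml.html.tostring(p, with_tail=False))            # drops the tail
+    print(lxml.html.tostring(p, doctype='<!DOCTYPE html>'))  # prepends a doctype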
+
+Bugs fixed
+----------
+
+* Fixed a crash when using ``iterparse()`` for HTML parsing and
+ requesting start events.
+
+* Fixed parsing of more selectors in cssselect. Whitespace before
+ pseudo-elements and pseudo-classes is significant as it is a
+ descendant combinator.
+ "E :pseudo" should parse the same as "E \*:pseudo", not "E:pseudo".
+ Patch by Simon Sapin.
+
+* lxml.html.diff no longer raises an exception when hitting
+ 'img' tags without 'src' attribute.
+
+Other changes
+--------------
+
+
+2.3.2 (2011-11-11)
+==================
+
+Features added
+--------------
+
+* ``lxml.objectify.deannotate()`` has a new boolean option
+ ``cleanup_namespaces`` to remove the objectify namespace
+ declarations (and generally clean up the namespace declarations)
+ after removing the type annotations.
+
+* ``lxml.objectify`` gained its own ``SubElement()`` function as a
+ copy of ``etree.SubElement`` to avoid an otherwise redundant import
+ of ``lxml.etree`` on the user side.
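+
+A minimal sketch of the new ``cleanup_namespaces`` option of
+``objectify.deannotate()`` (the input document is made up for the example)::
+
+    from lxml import objectify
+
+    root = objectify.fromstring(
+        '<root xmlns:py="http://codespeak.net/lxml/objectify/pytype">'
+        '<value py:pytype="int">42</value></root>')
+
+    # strip py:pytype/xsi:type annotations and remove the now unused
+    # namespace declarations in one step
+    objectify.deannotate(root, cleanup_namespaces=True)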
+
+Bugs fixed
+----------
+
+* Fixed the "descendant" bug in cssselect a second time (after a first
+ fix in lxml 2.3.1). The previous change resulted in a serious
+ performance regression for the XPath based evaluation of the
+ translated expression. Note that this breaks the usage of some of
+ the generated XPath expressions as XSLT location paths that
+ previously worked in 2.3.1.
+
+* Fixed parsing of some selectors in cssselect. Whitespace after combinators
+ ">", "+" and "~" is now correctly ignored. Previously it was parsed as
+ a descendant combinator. For example, "div> .foo" was parsed the same as
+ "div>* .foo" instead of "div>.foo". Patch by Simon Sapin.
+
+Other changes
+--------------
+
+
+2.3.1 (2011-09-25)
+==================
+
+Features added
+--------------
+
+* New option ``kill_tags`` in ``lxml.html.clean`` to remove specific
+ tags and their content (i.e. their whole subtree).
+
+* ``pi.get()`` and ``pi.attrib`` on processing instructions to parse
+ pseudo-attributes from the text content of processing instructions.
+
+* ``lxml.get_include()`` returns a list of include paths that can be
+ used to compile external C code against lxml.etree. This is
+ specifically required for statically linked lxml builds when code
+ needs to compile against the exact same header file versions as lxml
+ itself.
+
+* ``Resolver.resolve_file()`` takes an additional option
+ ``close_file`` that configures if the file(-like) object will be
+ closed after reading or not. By default, the file will be closed,
+ as the user is not expected to keep a reference to it.
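+
+For illustration, the new ``kill_tags`` option can be used like this (tag
+names and HTML input are made up for the example)::
+
+    from lxml.html.clean import Cleaner
+
+    cleaner = Cleaner(kill_tags=['script', 'iframe'])
+    html = '<div>text<script>alert(1)</script><iframe src="x"></iframe></div>'
+    print(cleaner.clean_html(html))   # the killed subtrees are gone entirely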
+
+Bugs fixed
+----------
+
+* HTML cleaning didn't remove 'data:' links.
+
+* The html5lib parser integration now uses the 'official'
+ implementation in html5lib itself, which makes it work with newer
+ releases of the library.
+
+* In ``lxml.sax``, ``endElementNS()`` could incorrectly reject a plain
+ tag name when the corresponding start event inferred the same plain
+ tag name to be in the default namespace.
+
+* When an open file-like object is passed into ``parse()`` or
+ ``iterparse()``, the parser will no longer close it after use. This
+ reverts a change in lxml 2.3 where all files would be closed. It is
+ the user's responsibility to properly close the file(-like) object,
+ also in error cases.
+
+* Assertion error in lxml.html.cleaner when discarding top-level elements.
+
+* In lxml.cssselect, use the xpath 'A//B' (short for
+ 'A/descendant-or-self::node()/B') instead of 'A/descendant::B' for
+ the css descendant selector ('A B'). This makes a few edge cases
+ like ``"div *:last-child"`` consistent with the selector behavior in
+ WebKit and Firefox, and makes more css expressions valid location
+ paths (for use in xsl:template match).
+
+* In lxml.html, non-selected ``<option>`` tags no longer show up in the
+ collected form values.
+
+* Adding/removing ``<option>`` values to/from a multiple select form
+ field properly selects them and unselects them.
+
+Other changes
+--------------
+
+* Static builds can specify the download directory with the
+ ``--download-dir`` option.
+
+
+2.3 (2011-02-06)
+================
+
+Features added
+--------------
+
+* When looking for children, ``lxml.objectify`` takes '{}tag' as
+ meaning an empty namespace, as opposed to the parent namespace.
+
+Bugs fixed
+----------
+
+* When finished reading from a file-like object, the parser
+ immediately calls its ``.close()`` method.
+
+* When finished parsing, ``iterparse()`` immediately closes the input
+ file.
+
+* Work-around for libxml2 bug that can leave the HTML parser in a
+ non-functional state after parsing a severely broken document (fixed
+ in libxml2 2.7.8).
+
+* ``marque`` tag in HTML cleanup code is correctly named ``marquee``.
+
+Other changes
+--------------
+
+* Some public functions in the Cython-level C-API have more explicit
+ return types.
+
+
+2.3beta1 (2010-09-06)
+=====================
+
+Features added
+--------------
+
+Bugs fixed
+----------
+
+* Crash in newer libxml2 versions when moving elements between
+ documents that had attributes on replaced XInclude nodes.
+
+* ``XMLID()`` function was missing the optional ``parser`` and
+ ``base_url`` parameters.
+
+* Searching for wildcard tags in ``iterparse()`` was broken in Py3.
+
+* ``lxml.html.open_in_browser()`` didn't work in Python 3 due to the
+ use of os.tempnam. It now takes an optional 'encoding' parameter.
+
+Other changes
+--------------
+
+
+2.3alpha2 (2010-07-24)
+======================
+
+Features added
+--------------
+
+Bugs fixed
+----------
+
+* Crash in XSLT when generating text-only result documents with a
+ stylesheet created in a different thread.
+
+Other changes
+--------------
+
+* ``repr()`` of Element objects shows the hex ID with leading 0x
+ (following ElementTree 1.3).
+
+
+2.3alpha1 (2010-06-19)
+======================
+
+Features added
+--------------
+
+* Keyword argument ``namespaces`` in ``lxml.cssselect.CSSSelector()``
+ to pass a prefix-to-namespace mapping for the selector.
+
+* New function ``lxml.etree.register_namespace(prefix, uri)`` that
+ globally registers a namespace prefix for a namespace that newly
+ created Elements in that namespace will use automatically. Follows
+ ElementTree 1.3.
+
+* Support 'unicode' string name as encoding parameter in
+ ``tostring()``, following ElementTree 1.3.
+
+* Support 'c14n' serialisation method in ``ElementTree.write()`` and
+ ``tostring()``, following ElementTree 1.3.
+
+* The ElementPath expression syntax (``el.find*()``) was extended to
+ match the upcoming ElementTree 1.3 that will ship in the standard
+ library of Python 3.2/2.7. This includes extended support for
+ predicates as well as namespace prefixes (as known from XPath).
+
+* During regular XPath evaluation, various ESXLT functions are
+ available within their namespace when using libxslt 1.1.26 or later.
+
+* Support passing a readily configured logger instance into
+ ``PyErrorLog``, instead of a logger name.
+
+* On serialisation, the new ``doctype`` parameter can be used to
+ override the DOCTYPE (internal subset) of the document.
+
+* New parameter ``output_parent`` to ``XSLTExtension.apply_templates()``
+ to append the resulting content directly to an output element.
+
+* ``XSLTExtension.process_children()`` to process the content of the
+ XSLT extension element itself.
+
+* ISO-Schematron support based on the de-facto Schematron reference
+ 'skeleton implementation'.
+
+* XSLT objects now take XPath object as ``__call__`` stylesheet
+ parameters.
+
+* Enable path caching in ElementPath (``el.find*()``) to avoid parsing
+ overhead.
+
+* Setting the value of a namespaced attribute always uses a prefixed
+ namespace instead of the default namespace even if both declare the
+ same namespace URI. This avoids serialisation problems when an
+ attribute from a default namespace is set on an element from a
+ different namespace.
+
+* XSLT extension elements: support for XSLT context nodes other than
+ elements: document root, comments, processing instructions.
+
+* Support for strings (in addition to Elements) in node-sets returned
+ by extension functions.
+
+* Forms that lack an ``action`` attribute default to the base URL of
+ the document on submit.
+
+* XPath attribute result strings have an ``attrname`` property.
+
+* Namespace URIs get validated against RFC 3986 at the API level
+ (required by the XML namespace specification).
+
+* Target parsers show their target object in the ``.target`` property
+ (compatible with ElementTree).
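+
+A short sketch of the ``register_namespace()`` and serialisation features
+listed above (the namespace URI and output file name are illustrative)::
+
+    from lxml import etree
+
+    etree.register_namespace('ex', 'http://example.com/ns')
+    el = etree.Element('{http://example.com/ns}item')
+
+    # serialise to a unicode string, following ElementTree 1.3
+    print(etree.tostring(el, encoding='unicode'))
+
+    # write C14N output
+    etree.ElementTree(el).write('out.xml', method='c14n')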
+
+Bugs fixed
+----------
+
+* API is hardened against invalid proxy instances to prevent crashes
+ due to incorrectly instantiated Element instances.
+
+* Prevent crash when instantiating ``CommentBase`` and friends.
+
+* Export ElementTree compatible XML parser class as
+ ``XMLTreeBuilder``, as it is called in ET 1.2.
+
+* ObjectifiedDataElements in lxml.objectify were not hashable. They
+ now use the hash value of the underlying Python value (string,
+ number, etc.) to which they compare equal.
+
+* Parsing broken fragments in lxml.html could fail if the fragment
+ contained an orphaned closing '</div>' tag.
+
+* Using XSLT extension elements around the root of the output document
+ crashed.
+
+* ``lxml.cssselect`` did not distinguish between ``x[attr="val"]`` and
+ ``x [attr="val"]`` (with a space). The latter now matches the
+ attribute independent of the element.
+
+* Rewriting multiple links inside of HTML text content could end up
+ replacing unrelated content as replacements could impact the
+ reported position of subsequent matches. Modifications are now
+ simplified by letting the ``iterlinks()`` generator in ``lxml.html``
+ return links in reversed order if they appear inside the same text
+ node. Thus, replacements and link-internal modifications no longer
+ change the position of links reported afterwards.
+
+* The ``.value`` attribute of ``textarea`` elements in lxml.html did
+ not represent the complete raw value (including child tags etc.). It
+ now serialises the complete content on read and replaces the
+ complete content by a string on write.
+
+* Target parser didn't call ``.close()`` on the target object if
+ parsing failed. Now it is guaranteed that ``.close()`` will be
+ called after parsing, regardless of the outcome.
+
+Other changes
+-------------
+
+* Official support for Python 3.1.2 and later.
+
+* Static MS Windows builds can now download their dependencies
+ themselves.
+
+* ``Element.attrib`` no longer uses a cyclic reference back to its
+ Element object. It therefore no longer requires the garbage
+ collector to clean up.
+
+* Static builds include libiconv, in addition to libxml2 and libxslt.
+
+
+2.2.8 (2010-09-02)
+==================
+
+Bugs fixed
+----------
+
+* Crash in newer libxml2 versions when moving elements between
+ documents that had attributes on replaced XInclude nodes.
+
+* Import fix for urljoin in Python 3.1+.
+
+
+2.2.7 (2010-07-24)
+==================
+
+Bugs fixed
+----------
+
+* Crash in XSLT when generating text-only result documents with a
+ stylesheet created in a different thread.
+
+
+2.2.6 (2010-03-02)
+==================
+
+Bugs fixed
+----------
+
+* Fixed several Python 3 regressions by building with Cython 0.11.3.
+
+
+2.2.5 (2010-02-28)
+==================
+
+Features added
+--------------
+
+* Support for running XSLT extension elements on the input root node
+ (e.g. in a template matching on "/").
+
+Bugs fixed
+----------
+
+* Crash in XPath evaluation when reading smart strings from a document
+ other than the original context document.
+
+* Support recent versions of html5lib by not requiring its
+ ``XHTMLParser`` in ``htmlparser.py`` anymore.
+
+* Manually instantiating the custom element classes in
+ ``lxml.objectify`` could crash.
+
+* Invalid XML text characters were not rejected by the API when they
+ appeared in unicode strings directly after non-ASCII characters.
+
+* lxml.html.open_http_urllib() did not work in Python 3.
+
+* The functions ``strip_tags()`` and ``strip_elements()`` in
+ ``lxml.etree`` did not remove all occurrences of a tag in all cases.
+
+* Crash in XSLT extension elements when the XSLT context node is not
+ an element.
+
+
+2.2.4 (2009-11-11)
+==================
+
+Bugs fixed
+----------
+
+* Static build of libxml2/libxslt was broken.
+
+
+2.2.3 (2009-10-30)
+==================
+
+Features added
+--------------
+
+Bugs fixed
+----------
+
+* The ``resolve_entities`` option did not work in the incremental feed
+ parser.
+
+* Looking up and deleting attributes without a namespace could hit a
+ namespaced attribute of the same name instead.
+
+* Late errors during calls to ``SubElement()`` (e.g. attribute related
+ ones) could leave a partially initialised element in the tree.
+
+* Modifying trees that contain parsed entity references could result
+ in an infinite loop.
+
+* ObjectifiedElement.__setattr__ created an empty-string child element when
+ the attribute value was rejected as a non-unicode/non-ASCII string.
+
+* Syntax errors in ``lxml.cssselect`` could result in misleading error
+ messages.
+
+* Invalid syntax in CSS expressions could lead to an infinite loop in
+ the parser of ``lxml.cssselect``.
+
+* CSS special character escapes were not properly handled in
+ ``lxml.cssselect``.
+
+* CSS Unicode escapes were not properly decoded in ``lxml.cssselect``.
+
+* Select options in HTML forms that had no explicit ``value``
+ attribute were not handled correctly. The HTML standard dictates
+ that their value is defined by their text content. This is now
+ supported by lxml.html.
+
+* XPath raised a TypeError when finding CDATA sections. This is now
+ fully supported.
+
+* Calling ``help(lxml.objectify)`` didn't work at the prompt.
+
+* The ``ElementMaker`` in lxml.objectify no longer defines the default
+ namespaces when annotation is disabled.
+
+* Feed parser failed to honour the 'recover' option on parse errors.
+
+* Diverting the error logging to Python's logging system was broken.
+
+Other changes
+-------------
+
+
+2.2.2 (2009-06-21)
+==================
+
+Features added
+--------------
+
+* New helper functions ``strip_attributes()``, ``strip_elements()``,
+ ``strip_tags()`` in lxml.etree to remove attributes/subtrees/tags
+ from a subtree.
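+
+A minimal usage sketch of the new helper functions (the input document is
+made up for the example)::
+
+    from lxml import etree
+
+    root = etree.XML('<root><a style="x">one<em>two</em></a><b>drop</b></root>')
+
+    etree.strip_attributes(root, 'style')   # remove all 'style' attributes
+    etree.strip_tags(root, 'em')            # unwrap <em>, keeping its text
+    etree.strip_elements(root, 'b')         # remove <b> subtrees entirely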
+
+Bugs fixed
+----------
+
+* Namespace cleanup on subtree insertions could result in missing
+ namespace declarations (and potentially crashes) if the element
+ defining a namespace was deleted and the namespace was not used by
+ the top element of the inserted subtree but only in deeper subtrees.
+
+* Raising an exception from a parser target callback didn't always
+ terminate the parser.
+
+* Only {true, false, 1, 0} are accepted as the lexical representation for
+ BoolElement; {True, False, T, F, t, f} are no longer accepted, restoring
+ the lxml <= 2.0 behaviour.
+
+Other changes
+-------------
+
+
+2.2.1 (2009-06-02)
+==================
+
+Features added
+--------------
+
+* Injecting default attributes into a document during XML Schema
+ validation (also at parse time).
+
+* Pass ``huge_tree`` parser option to disable parser security
+ restrictions imposed by libxml2 2.7.
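+
+A minimal sketch of the ``huge_tree`` parser option (the input file name is
+illustrative)::
+
+    from lxml import etree
+
+    # lift the parser limits that libxml2 2.7 imposes on very large
+    # or deeply nested documents
+    parser = etree.XMLParser(huge_tree=True)
+    tree = etree.parse('big-input.xml', parser)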
+
+Bugs fixed
+----------
+
+* The script for statically building libxml2 and libxslt didn't work
+ in Py3.
+
+* ``XMLSchema()`` also passes invalid schema documents on to libxml2
+ for parsing (which could lead to a crash before release 2.6.24).
+
+Other changes
+-------------
+
+
+2.2 (2009-03-21)
+================
+
+Features added
+--------------
+
+* Support for ``standalone`` flag in XML declaration through
+ ``tree.docinfo.standalone`` and by passing ``standalone=True/False``
+ on serialisation.
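+
+A small sketch of the ``standalone`` support (the input file name is
+illustrative)::
+
+    from lxml import etree
+
+    tree = etree.parse('input.xml')
+    print(tree.docinfo.standalone)
+
+    # force standalone="yes" in the XML declaration on serialisation
+    print(etree.tostring(tree, xml_declaration=True, standalone=True))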
+
+Bugs fixed
+----------
+
+* Crash when parsing an XML Schema with external imports from a
+ filename.
+
+
+2.2beta4 (2009-02-27)
+=====================
+
+Features added
+--------------
+
+* Support strings and instantiable Element classes as child arguments
+ to the constructor of custom Element classes.
+
+* GZip compression support for serialisation to files and file-like
+ objects.
+
+Bugs fixed
+----------
+
+* Deep-copying an ElementTree copied neither its sibling PIs and
+ comments nor its internal/external DTD subsets.
+
+* Soupparser failed on broken attributes without values.
+
+* Crash in XSLT when overwriting an already defined attribute using
+ ``xsl:attribute``.
+
+* Crash bug in exception handling code under Python 3. This was due
+ to a problem in Cython, not lxml itself.
+
+* ``lxml.html.FormElement._name()`` failed for non top-level forms.
+
+* ``TAG`` special attribute in constructor of custom Element classes
+ was evaluated incorrectly.
+
+Other changes
+-------------
+
+* Official support for Python 3.0.1.
+
+* ``Element.findtext()`` now returns an empty string instead of None
+ for Elements without text content.
+
+
+2.2beta3 (2009-02-17)
+=====================
+
+Features added
+--------------
+
+* ``XSLT.strparam()`` class method to wrap quoted string parameters
+ that require escaping.
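+
+A minimal usage sketch of ``XSLT.strparam()`` (the stylesheet is made up for
+the example)::
+
+    from lxml import etree
+
+    style = etree.XML('''\
+    <xsl:stylesheet version="1.0"
+        xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+      <xsl:param name="who"/>
+      <xsl:template match="/">
+        <greeting><xsl:value-of select="$who"/></greeting>
+      </xsl:template>
+    </xsl:stylesheet>''')
+
+    transform = etree.XSLT(style)
+    result = transform(etree.XML('<doc/>'),
+                       who=etree.XSLT.strparam('some "quoted" text'))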
+
+Bugs fixed
+----------
+
+* Memory leak in XPath evaluators.
+
+* Crash when parsing indented XML in one thread and merging it with
+ other documents parsed in another thread.
+
+* Setting the ``base`` attribute in ``lxml.objectify`` from a unicode
+ string failed.
+
+* Fixes following changes in Python 3.0.1.
+
+* Minor fixes for Python 3.
+
+Other changes
+-------------
+
+* The global error log (which is copied into the exception log) is now
+ local to a thread, which fixes some race conditions.
+
+* More robust error handling on serialisation.
+
+
+2.2beta2 (2009-01-25)
+=====================
+
+Bugs fixed
+----------
+
+* Potential memory leak on exception handling. This was due to a
+ problem in Cython, not lxml itself.
+
+* ``iter_links`` (and related link-rewriting functions) in
+ ``lxml.html`` would interpret CSS like ``url("link")`` incorrectly
+ (treating the quotation marks as part of the link).
+
+* Failing import on systems that have an ``io`` module.
+
+
+2.1.5 (2009-01-06)
+==================
+
+Bugs fixed
+----------
+
+* Potential memory leak on exception handling. This was due to a
+ problem in Cython, not lxml itself.
+
+* Failing import on systems that have an ``io`` module.
+
+
+2.2beta1 (2008-12-12)
+=====================
+
+Features added
+--------------
+
+* Allow ``lxml.html.diff.htmldiff`` to accept Element objects, not
+ just HTML strings.
+
+Bugs fixed
+----------
+
+* Crash when using an XPath evaluator in multiple threads.
+
+* Fixed missing whitespace before ``Link:...`` in ``lxml.html.diff``.
+
+Other changes
+-------------
+
+* Export ``lxml.html.parse``.
+
+
+2.1.4 (2008-12-12)
+==================
+
+Bugs fixed
+----------
+
+* Crash when using an XPath evaluator in multiple threads.
+
+
+2.0.11 (2008-12-12)
+===================
+
+Bugs fixed
+----------
+
+* Crash when using an XPath evaluator in multiple threads.
+
+
+2.2alpha1 (2008-11-23)
+======================
+
+Features added
+--------------
+
+* Support for XSLT result tree fragments in XPath/XSLT extension
+ functions.
+
+* QName objects have new properties ``namespace`` and ``localname``.
+
+* New options for exclusive C14N and C14N without comments.
+
+* Instantiating a custom Element class creates a new Element.
+
+Bugs fixed
+----------
+
+* XSLT didn't inherit the parse options of the input document.
+
+* 0-bytes could slip through the API when used inside of Unicode
+ strings.
+
+* With ``lxml.html.clean.autolink``, links with balanced parentheses
+ that end in a parenthesis are now linked in their entirety (typical
+ for Wikipedia links).
+
+Other changes
+-------------
+
+
+2.1.3 (2008-11-17)
+==================
+
+Features added
+--------------
+
+Bugs fixed
+----------
+
+* Ref-count leaks when lxml enters a try-except statement while an
+ outside exception lives in sys.exc_*(). This was due to a problem in
+ Cython, not lxml itself.
+
+* Parser Unicode decoding errors could get swallowed by other
+ exceptions.
+
+* Name/import errors in some Python modules.
+
+* Internal DTD subsets that did not specify a system or public ID were
+ not serialised and did not appear in the docinfo property of
+ ElementTrees.
+
+* Fix a pre-Py3k warning when parsing from a gzip file in Py2.6.
+
+* Test suite fixes for libxml2 2.7.
+
+* Resolver.resolve_string() did not work for non-ASCII byte strings.
+
+* Resolver.resolve_file() was broken.
+
+* Overriding the parser encoding didn't work for many encodings.
+
+Other changes
+-------------
+
+
+2.0.10 (2008-11-17)
+===================
+
+Bugs fixed
+----------
+
+* Ref-count leaks when lxml enters a try-except statement while an
+ outside exception lives in sys.exc_*(). This was due to a problem in
+ Cython, not lxml itself.
+
+
+2.1.2 (2008-09-05)
+==================
+
+Features added
+--------------
+
+* lxml.etree now tries to find the absolute path name of files when
+ parsing from a file-like object. This helps custom resolvers when
+ resolving relative URLs, as libxml2 can prepend them with the path
+ of the source document.
+
+Bugs fixed
+----------
+
+* Memory problem when passing documents between threads.
+
+* Target parser did not honour the ``recover`` option and raised an
+ exception instead of calling ``.close()`` on the target.
+
+Other changes
+-------------
+
+
+2.0.9 (2008-09-05)
+==================
+
+Bugs fixed
+----------
+
+* Memory problem when passing documents between threads.
+
+* Target parser did not honour the ``recover`` option and raised an
+ exception instead of calling ``.close()`` on the target.
+
+
+2.1.1 (2008-07-24)
+==================
+
+Features added
+--------------
+
+Bugs fixed
+----------
+
+* Crash when parsing XSLT stylesheets in a thread and using them in
+ another.
+
+* Encoding problem when including text with ElementInclude under
+ Python 3.
+
+Other changes
+-------------
+
+
+2.0.8 (2008-07-24)
+==================
+
+Features added
+--------------
+
+* ``lxml.html.rewrite_links()`` strips links to work around documents
+ with whitespace in URL attributes.
+
+Bugs fixed
+----------
+
+* Crash when parsing XSLT stylesheets in a thread and using them in
+ another.
+
+* CSS selector parser dropped remaining expression after a function
+ with parameters.
+
+Other changes
+-------------
+
+
+2.1 (2008-07-09)
+================
+
+Features added
+--------------
+
+* Smart strings can be switched off in XPath (``smart_strings``
+ keyword option).
+
+* ``lxml.html.rewrite_links()`` strips links to work around documents
+ with whitespace in URL attributes.
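+
+For illustration, the ``smart_strings`` option can be switched off per
+XPath expression::
+
+    from lxml import etree
+
+    root = etree.XML('<root><a>text</a></root>')
+    find_text = etree.XPath('//a/text()', smart_strings=False)
+    print(find_text(root))   # plain strings without getparent() support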
+
+Bugs fixed
+----------
+
+* Custom resolvers were not used for XMLSchema includes/imports and
+ XInclude processing.
+
+* CSS selector parser dropped remaining expression after a function
+ with parameters.
+
+Other changes
+-------------
+
+* ``objectify.enableRecursiveStr()`` was removed, use
+ ``objectify.enable_recursive_str()`` instead.
+
+* Speed-up when running XSLTs on documents from other threads.
+
+
+2.0.7 (2008-06-20)
+==================
+
+Features added
+--------------
+
+* Pickling ``ElementTree`` objects in lxml.objectify.
+
+Bugs fixed
+----------
+
+* Descending dot-separated classes in CSS selectors were not resolved
+ correctly.
+
+* ``ElementTree.parse()`` didn't handle target parser result.
+
+* Potential threading problem in XInclude.
+
+* Crash in Element class lookup classes when the __init__() method of
+ the super class is not called from Python subclasses.
+
+Other changes
+-------------
+
+* Non-ASCII characters in attribute values are no longer escaped on
+ serialisation.
+
+
+2.1beta3 (2008-06-19)
+=====================
+
+Features added
+--------------
+
+* Major overhaul of ``tools/xpathgrep.py`` script.
+
+* Pickling ``ElementTree`` objects in lxml.objectify.
+
+* Support for parsing from file-like objects that return unicode
+ strings.
+
+* New function ``etree.cleanup_namespaces(el)`` that removes unused
+ namespace declarations from a (sub)tree (experimental).
+
+* XSLT results support the buffer protocol in Python 3.
+
+* Polymorphic functions in ``lxml.html`` that accept either a tree or
+ a parsable string will return either a UTF-8 encoded byte string, a
+ unicode string or a tree, based on the type of the input.
+ Previously, the result was always a byte string or a tree.
+
+* Support for Python 2.6 and 3.0 beta.
+
+* File name handling now uses a heuristic to convert between byte
+ strings (usually filenames) and unicode strings (usually URLs).
+
+* Parsing from a plain file object frees the GIL under Python 2.x.
+
+* Running ``iterparse()`` on a plain file (or filename) frees the GIL
+ on reading under Python 2.x.
+
+* Conversion functions ``html_to_xhtml()`` and ``xhtml_to_html()`` in
+ lxml.html (experimental).
+
+* Most features in lxml.html work for XHTML namespaced tag names
+ (experimental).
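+
+A small sketch of the new ``etree.cleanup_namespaces()`` function (the
+namespace URI is made up for the example)::
+
+    from lxml import etree
+
+    root = etree.XML('<root xmlns:unused="http://example.com/unused">'
+                     '<child/></root>')
+    etree.cleanup_namespaces(root)   # drops the unused namespace declaration
+    print(etree.tostring(root))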
+
+Bugs fixed
+----------
+
+* ``ElementTree.parse()`` didn't handle target parser result.
+
+* Crash in Element class lookup classes when the __init__() method of
+ the super class is not called from Python subclasses.
+
+* A number of problems related to unicode/byte string conversion of
+ filenames and error messages were fixed.
+
+* Building on MacOS-X now passes the "flat_namespace" option to the C
+ compiler, which reportedly prevents build quirks and crashes on this
+ platform.
+
+* Windows build was broken.
+
+* Rare crash when serialising to a file object with certain encodings.
+
+Other changes
+-------------
+
+* Non-ASCII characters in attribute values are no longer escaped on
+ serialisation.
+
+* Passing non-ASCII byte strings or invalid unicode strings as .tag,
+ namespaces, etc. will result in a ValueError instead of an
+ AssertionError (just like the tag well-formedness check).
+
+* Up to several times faster attribute access (i.e. tree traversal) in
+ lxml.objectify.
+
+
+2.0.6 (2008-05-31)
+==================
+
+Features added
+--------------
+
+Bugs fixed
+----------
+
+* Incorrect evaluation of ``el.find("tag[child]")``.
+
+* Windows build was broken.
+
+* Moving a subtree from a document created in one thread into a
+ document of another thread could crash when the rest of the source
+ document is deleted while the subtree is still in use.
+
+* Rare crash when serialising to a file object with certain encodings.
+
+Other changes
+-------------
+
+* lxml should now build without problems on MacOS-X.
+
+
+2.1beta2 (2008-05-02)
+=====================
+
+Features added
+--------------
+
+* All parse functions in lxml.html take a ``parser`` keyword argument.
+
+* lxml.html has a new parser class ``XHTMLParser`` and a module
+ attribute ``xhtml_parser`` that provide XML parsers that are
+ pre-configured for the lxml.html package.
+
+Bugs fixed
+----------
+
+* Moving a subtree from a document created in one thread into a
+ document of another thread could crash when the rest of the source
+ document is deleted while the subtree is still in use.
+
+* Passing an nsmap when creating an Element will no longer strip
+ redundantly defined namespace URIs. This prevented the definition
+ of more than one prefix for a namespace on the same Element.
+
+Other changes
+-------------
+
+* If the default namespace is redundantly defined with a prefix on the
+ same Element, the prefix will now be preferred for subelements and
+ attributes. This allows users to work around a problem in libxml2
+ where attributes from the default namespace could serialise without
+ a prefix even when they appear on an Element with a different
+ namespace (i.e. they would end up in the wrong namespace).
+
+
+2.0.5 (2008-05-01)
+==================
+
+Features added
+--------------
+
+Bugs fixed
+----------
+
+* Resolving to a filename in custom resolvers didn't work.
+
+* lxml did not honour libxslt's second error state "STOPPED", which
+ let some XSLT errors pass silently.
+
+* Memory leak in Schematron with libxml2 >= 2.6.31.
+
+Other changes
+-------------
+
+
+2.1beta1 (2008-04-15)
+=====================
+
+Features added
+--------------
+
+* Error logging in Schematron (requires libxml2 2.6.32 or later).
+
+* Parser option ``strip_cdata`` for normalising or keeping CDATA
+ sections. Defaults to ``True`` as before, thus replacing CDATA
+ sections by their text content.
+
+* ``CDATA()`` factory to wrap string content as CDATA section.
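+
+A minimal sketch of the ``CDATA()`` factory and the ``strip_cdata`` parser
+option::
+
+    from lxml import etree
+
+    el = etree.Element('data')
+    el.text = etree.CDATA('a < b & c')
+    print(etree.tostring(el))   # serialises as a CDATA section
+
+    # keep CDATA sections instead of replacing them by their text content
+    parser = etree.XMLParser(strip_cdata=False)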
+
+Bugs fixed
+----------
+
+* Resolving to a filename in custom resolvers didn't work.
+
+* lxml did not honour libxslt's second error state "STOPPED", which
+ let some XSLT errors pass silently.
+
+* Memory leak in Schematron with libxml2 >= 2.6.31.
+
+* lxml.etree accepted non well-formed namespace prefix names.
+
+Other changes
+-------------
+
+* Major cleanup in internal ``moveNodeToDocument()`` function, which
+ takes care of namespace cleanup when moving elements between
+ different namespace contexts.
+
+* New Elements created through the ``makeelement()`` method of an HTML
+ parser or through lxml.html now end up in a new HTML document
+ (doctype HTML 4.01 Transitional) instead of a generic XML document.
+ This mostly impacts the serialisation and the availability of a DTD
+ context.
+
+
+2.0.4 (2008-04-13)
+==================
+
+Features added
+--------------
+
+Bugs fixed
+----------
+
+* Hanging thread in conjunction with GTK threading.
+
+* Crash bug in iterparse when moving elements into other documents.
+
+* HTML elements' ``.cssselect()`` method was broken.
+
+* ``ElementTree.find*()`` didn't accept QName objects.
+
+Other changes
+-------------
+
+
+2.1alpha1 (2008-03-27)
+======================
+
+Features added
+--------------
+
+* New event types 'comment' and 'pi' in ``iterparse()``.
+
+* ``XSLTAccessControl`` instances have a property ``options`` that
+ returns a dict of access configuration options.
+
+* Constant instances ``DENY_ALL`` and ``DENY_WRITE`` on
+ ``XSLTAccessControl`` class.
+
+* Extension elements for XSLT (experimental!)
+
+* ``Element.base`` property returns the xml:base or HTML base URL of
+ an Element.
+
+* ``docinfo.URL`` property is writable.
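+
+For illustration, the new ``iterparse()`` event types can be requested like
+this (the input document is made up for the example)::
+
+    from io import BytesIO
+    from lxml import etree
+
+    xml = BytesIO(b'<root><?pi-target data?><!-- a comment --><a/></root>')
+    for event, node in etree.iterparse(xml, events=('end', 'comment', 'pi')):
+        print(event, node)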
+
+Bugs fixed
+----------
+
+* Default encoding for plain text serialisation was different from
+ that of XML serialisation (UTF-8 instead of ASCII).
+
+Other changes
+-------------
+
+* Minor API speed-ups.
+
+* The benchmark suite now uses tail text in the trees, which makes the
+ absolute numbers incomparable to previous results.
+
+* Generating the HTML documentation now requires Pygments_, which is
+ used to enable syntax highlighting for the doctest examples.
+
+.. _Pygments: http://pygments.org/
+
+Most long-time deprecated functions and methods were removed:
+
+- ``etree.clearErrorLog()``, use ``etree.clear_error_log()``
+
+- ``etree.useGlobalPythonLog()``, use
+ ``etree.use_global_python_log()``
+
+- ``etree.ElementClassLookup.setFallback()``, use
+ ``etree.ElementClassLookup.set_fallback()``
+
+- ``etree.getDefaultParser()``, use ``etree.get_default_parser()``
+
+- ``etree.setDefaultParser()``, use ``etree.set_default_parser()``
+
+- ``etree.setElementClassLookup()``, use
+ ``etree.set_element_class_lookup()``
+
+ Note that ``parser.setElementClassLookup()`` has not been removed
+ yet, although ``parser.set_element_class_lookup()`` should be used
+ instead.
+
+- ``xpath_evaluator.registerNamespace()``, use
+ ``xpath_evaluator.register_namespace()``
+
+- ``xpath_evaluator.registerNamespaces()``, use
+ ``xpath_evaluator.register_namespaces()``
+
+- ``objectify.setPytypeAttributeTag``, use
+ ``objectify.set_pytype_attribute_tag``
+
+- ``objectify.setDefaultParser()``, use
+ ``objectify.set_default_parser()``
+
+
+2.0.3 (2008-03-26)
+==================
+
+Features added
+--------------
+
+* soupparser.parse() allows passing keyword arguments on to
+ BeautifulSoup.
+
+* ``fromstring()`` method in ``lxml.html.soupparser``.
+
+Bugs fixed
+----------
+
+* ``lxml.html.diff`` didn't treat empty tags properly (e.g.,
+ ``<br>``).
+
+* Handle entity replacements correctly in target parser.
+
+* Crash when using ``iterparse()`` with XML Schema validation.
+
+* The BeautifulSoup parser (soupparser.py) did not replace entities,
+ which made them turn up in text content.
+
+* Attribute assignment of custom PyTypes in objectify could fail to
+ correctly serialise the value to a string.
+
+Other changes
+-------------
+
+* ``lxml.html.ElementSoup`` was replaced by a new module
+ ``lxml.html.soupparser`` with a more consistent API. The old module
+ remains for compatibility with ElementTree's own ElementSoup module.
+
+* Setting the XSLT_CONFIG and XML2_CONFIG environment variables at
+ build time will let setup.py pick up the ``xml2-config`` and
+ ``xslt-config`` scripts from the supplied path name.
+
+* Passing ``--with-xml2-config=/path/to/xml2-config`` to setup.py will
+ override the ``xml2-config`` script that is used to determine the C
+ compiler options. The same applies for the ``--with-xslt-config``
+ option.
+
+
+2.0.2 (2008-02-22)
+==================
+
+Features added
+--------------
+
+* Support passing ``base_url`` to file parser functions to override
+ the filename of the file(-like) object.
+
+Bugs fixed
+----------
+
+* The prefix for objectify's pytype namespace was missing from the set
+ of default prefixes.
+
+* Memory leak in Schematron (fixed only for libxml2 2.6.31+).
+
+* Error type names in RelaxNG were reported incorrectly.
+
+* Slice deletion bug fixed in objectify.
+
+Other changes
+-------------
+
+* Enabled doctests for some Python modules (especially ``lxml.html``).
+
+* Add a ``method`` argument to ``lxml.html.tostring()``
+ (``method="xml"`` for XHTML output).
+
+* Make it clearer that methods like ``lxml.html.fromstring()`` take a
+ ``base_url`` argument.
+
+
+2.0.1 (2008-02-13)
+==================
+
+Features added
+--------------
+
+* Child iteration in ``lxml.pyclasslookup``.
+
+* Loads of new docstrings reflect the signature of functions and
+ methods to make them visible in API docs and ``help()``
+
+Bugs fixed
+----------
+
+* The module ``lxml.html.builder`` was duplicated as
+ ``lxml.htmlbuilder``
+
+* Form elements would return None for ``form.fields.keys()`` if there
+ was an unnamed input field. Now unnamed input fields are completely
+ ignored.
+
+* Setting an element slice in objectify could insert slice-overlapping
+ elements at the wrong position.
+
+Other changes
+-------------
+
+* The generated API documentation was cleaned up and disburdened from
+ non-public classes etc.
+
+* The previously public module ``lxml.html.setmixin`` was renamed to
+ ``lxml.html._setmixin`` as it is not an official part of lxml. If
+ you want to use it, feel free to copy it over to your own source
+ base.
+
+* Passing ``--with-xslt-config=/path/to/xslt-config`` to setup.py will
+ override the ``xslt-config`` script that is used to determine the C
+ compiler options.
+
+
+2.0 (2008-02-01)
+================
+
+Features added
+--------------
+
+* Passing the ``unicode`` type as ``encoding`` to ``tostring()`` will
+ serialise to unicode. The ``tounicode()`` function is now
+ deprecated.
+
+* ``XMLSchema()`` and ``RelaxNG()`` can parse from StringIO.
+
+* ``makeparser()`` function in ``lxml.objectify`` to create a new
+ parser with the usual objectify setup.
+
+* Plain ASCII XPath string results are no longer forced into unicode
+ objects as in 2.0beta1, but are returned as plain strings as before.
+
+* All XPath string results are 'smart' objects that have a
+ ``getparent()`` method to retrieve their parent Element.
+
+* ``with_tail`` option in serialiser functions.
+
+* More accurate exception messages in validator creation.
+
+* Parse-time XML schema validation (``schema`` parser keyword).
+
+* XPath string results of the ``text()`` function and attribute
+ selection make their Element container accessible through a
+ ``getparent()`` method. As a side-effect, they are now always
+ unicode objects (even ASCII strings).
+
+* ``XSLT`` objects are usable in any thread - at the cost of a deep
+ copy if they were not created in that thread.
+
+* Invalid entity names and character references will be rejected by
+ the ``Entity()`` factory.
+
+* ``entity.text`` returns the textual representation of the entity,
+ e.g. ``&amp;``.
+
+* New properties ``position`` and ``code`` on ParseError exception (as
+ in ET 1.3)
+
+* Rich comparison of ``element.attrib`` proxies.
+
+* ElementTree compatible TreeBuilder class.
+
+* Use default prefixes for some common XML namespaces.
+
+* ``lxml.html.clean.Cleaner`` now allows for a ``host_whitelist``, and
+ two overridable methods: ``allow_embedded_url(el, url)`` and the
+ more general ``allow_element(el)``.
+
+* Extended slicing of Elements as in ``element[1:-1:2]``, both in
+ etree and in objectify
+
+* Resolvers can now provide a ``base_url`` keyword argument when
+ resolving a document as string data.
+
+* When using ``lxml.doctestcompare`` you can give the doctest option
+ ``NOPARSE_MARKUP`` (like ``# doctest: +NOPARSE_MARKUP``) to suppress
+ the special checking for one test.
+
+* Separate ``feed_error_log`` property for the feed parser interface.
+ The normal parser interface and ``iterparse`` continue to use
+ ``error_log``.
+
+* The normal parsers and the feed parser interface are now separated
+ and can be used concurrently on the same parser instance.
+
+* ``fromstringlist()`` and ``tostringlist()`` functions as in
+ ElementTree 1.3
+
+* ``iterparse()`` accepts an ``html`` boolean keyword argument for
+ parsing with the HTML parser (note that this interface may be
+ subject to change)
+
+* Parsers accept an ``encoding`` keyword argument that overrides the encoding
+ of the parsed documents.
+
+* New C-API function ``hasChild()`` to test for children
+
+* ``annotate()`` function in objectify can annotate with Python types and XSI
+ types in one step. Accompanied by ``xsiannotate()`` and ``pyannotate()``.
+
+* ``ET.write()``, ``tostring()`` and ``tounicode()`` now accept a keyword
+ argument ``method`` that can be one of 'xml' (or None), 'html' or 'text' to
+ serialise as XML, HTML or plain text content.
+
+* ``iterfind()`` method on Elements returns an iterator equivalent to
+ ``findall()``
+
+* ``itertext()`` method on Elements
+
+* Setting a QName object as value of the .text property or as an attribute
+ will resolve its prefix in the respective context
+
+* ElementTree-like parser target interface as described in
+ http://effbot.org/elementtree/elementtree-xmlparser.htm
+
+* ElementTree-like feed parser interface on XMLParser and HTMLParser
+ (``feed()`` and ``close()`` methods)
+
+* Reimplemented ``objectify.E`` for better performance and improved
+ integration with objectify. Provides extended type support based on
+ registered PyTypes.
+
+* XSLT objects now support deep copying
+
+* New ``makeSubElement()`` C-API function that allows creating a new
+ subelement straight with text, tail and attributes.
+
+* XPath extension functions can now access the current context node
+ (``context.context_node``) and use a context dictionary
+ (``context.eval_context``) from the context provided in their first
+ parameter
+
+* HTML tag soup parser based on BeautifulSoup in ``lxml.html.ElementSoup``
+
+* New module ``lxml.doctestcompare`` by Ian Bicking for writing simplified
+ doctests based on XML/HTML output. Use by importing ``lxml.usedoctest`` or
+ ``lxml.html.usedoctest`` from within a doctest.
+
+* New module ``lxml.cssselect`` by Ian Bicking for selecting Elements with CSS
+ selectors.
+
+* New package ``lxml.html`` written by Ian Bicking for advanced HTML
+ treatment.
+
+* Namespace class setup is now local to the ``ElementNamespaceClassLookup``
+ instance and no longer global.
+
+* Schematron validation (incomplete in libxml2)
+
+* Additional ``stringify`` argument to ``objectify.PyType()`` takes a
+ conversion function to strings to support setting text values from arbitrary
+ types.
+
+* Entity support through an ``Entity`` factory and element classes. XML
+ parsers now have a ``resolve_entities`` keyword argument that can be set to
+ False to keep entities in the document.
+
+* ``column`` field on error log entries to accompany the ``line`` field
+
+* Error specific messages in XPath parsing and evaluation
+ NOTE: for evaluation errors, you will now get an XPathEvalError instead of
+ an XPathSyntaxError. To catch both, you can except on ``XPathError``
+
+* The regular expression functions in XPath now support passing a node-set
+ instead of a string
+
+* Extended type annotation in objectify: new ``xsiannotate()`` function
+
+* EXSLT RegExp support in standard XPath (not only XSLT)
+
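+For illustration, a minimal sketch of a few of the serialisation and
+XPath additions listed above (the sample document is invented, and the
+exact return types - byte strings vs. unicode - depend on the Python
+version in use)::
+
+    from lxml import etree
+
+    root = etree.XML('<root><a>text</a><b/><c/></root>')
+
+    # the 'method' keyword selects the serialisation format
+    etree.tostring(root, method='text')     # text content only: 'text'
+    etree.tostring(root, method='xml')      # default XML serialisation
+
+    # XPath string results are 'smart' strings with a getparent() method
+    smart_string = root.xpath('//a/text()')[0]
+    smart_string.getparent().tag            # 'a'
+
+    # extended slicing of Elements
+    [el.tag for el in root[0:3:2]]          # every second child: ['a', 'c']
+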
+Bugs fixed
+----------
+
+* Missing import in ``lxml.html.clean``.
+
+* Some Python 2.4-isms prevented lxml from building/running under
+ Python 2.3.
+
+* XPath on ElementTrees could crash when selecting the virtual root
+ node of the ElementTree.
+
+* Compilation ``--without-threading`` was buggy in alpha5/6.
+
+* Memory leak in the ``parse()`` function.
+
+* Minor bugs in XSLT error message formatting.
+
+* Result document memory leak in target parser.
+
+* Target parser failed to report comments.
+
+* In the ``lxml.html`` ``iter_links`` method, links in ``<object>``
+ tags weren't recognized. (Note: plugin-specific link parameters
+ still aren't recognized.) Also, the ``<embed>`` tag, though not
+ standard, is now included in ``lxml.html.defs.special_inline_tags``.
+
+* Using custom resolvers on XSLT stylesheets parsed from a string
+ could request ill-formed URLs.
+
+* With ``lxml.doctestcompare``, using ``<tag xmlns="...">`` in the
+  expected output now makes the comparison namespace-neutral (previously
+  the ellipsis was treated as a literal namespace).
+
+* AttributeError in feed parser on parse errors
+
+* XML feed parser setup problem
+
+* Type annotation for unicode strings in ``DataElement()``
+
+* lxml failed to serialise namespace declarations of elements other than the
+ root node of a tree
+
+* Race condition in XSLT where the resolver context leaked between concurrent
+ XSLT calls
+
+* lxml.etree did not check tag/attribute names
+
+* The XML parser did not report undefined entities as error
+
+* The text in exceptions raised by XML parsers, validators and XPath
+ evaluators now reports the first error that occurred instead of the last
+
+* Passing '' as XPath namespace prefix did not raise an error
+
+* Thread safety in XPath evaluators
+
+Other changes
+-------------
+
+* Exceptions carry only the part of the error log that is related to
+ the operation that caused the error.
+
+* ``XMLSchema()`` and ``RelaxNG()`` now enforce passing the source
+ file/filename through the ``file`` keyword argument.
+
+* The test suite now skips most doctests under Python 2.3.
+
+* ``make clean`` no longer removes the .c files (use ``make
+ realclean`` instead)
+
+* Minor performance tweaks for Element instantiation and subelement
+ creation
+
+* Various places in the XPath, XSLT and iteration APIs now require
+ keyword-only arguments.
+
+* The argument order in ``element.itersiblings()`` was changed to
+ match the order used in all other iteration methods. The second
+ argument ('preceding') is now a keyword-only argument.
+
+* The ``getiterator()`` method on Elements and ElementTrees was
+ reverted to return an iterator as it did in lxml 1.x. The ET API
+ specification allows it to return either a sequence or an iterator,
+ and it traditionally returned a sequence in ET and an iterator in
+ lxml. However, it is now deprecated in favour of the ``iter()``
+ method, which should be used in new code wherever possible.
+
+* The 'pretty printed' serialisation of ElementTree objects now
+ inserts newlines at the root level between processing instructions,
+ comments and the root tag.
+
+* A 'pretty printed' serialisation is now terminated with a newline.
+
+* Second argument to ``lxml.etree.Extension()`` helper is no longer
+ required, third argument is now a keyword-only argument ``ns``.
+
+* ``lxml.html.tostring`` takes an ``encoding`` argument.
+
+* The module source files were renamed to "lxml.*.pyx", such as
+ "lxml.etree.pyx". This was changed for consistency with the way
+ Pyrex commonly handles package imports. The main effect is that
+ classes now know about their fully qualified class name, including
+ the package name of their module.
+
+* Keyword-only arguments in some API functions, especially in the
+ parsers and serialisers.
+
+* Tag name validation in lxml.etree (and lxml.html) now distinguishes
+ between HTML tags and XML tags based on the parser that was used to
+ parse or create them. HTML tags no longer reject any non-ASCII
+ characters in tag names but only spaces and the special characters
+ ``<>&/"'``.
+
+* lxml.etree now emits a warning if you use XPath with libxml2 2.6.27
+ (which can crash on certain XPath errors)
+
+* Type annotation in objectify now preserves the already annotated type by
+ default to prevent losing type information that is already there.
+
+* ``element.getiterator()`` returns a list, use ``element.iter()`` to retrieve
+ an iterator (ElementTree 1.3 compatible behaviour)
+
+* objectify.PyType for None is now called "NoneType"
+
+* ``el.getiterator()`` renamed to ``el.iter()``, following ElementTree 1.3 -
+ original name is still available as alias
+
+* In the public C-API, ``findOrBuildNodeNs()`` was replaced by the more
+ generic ``findOrBuildNodeNsPrefix``
+
+* Major refactoring in XPath/XSLT extension function code
+
+* Network access in parsers disabled by default
+
+
+1.3.6 (2007-10-29)
+==================
+
+Bugs fixed
+----------
+
+* Backported decref crash fix from 2.0
+
+* Well hidden free-while-in-use crash bug in ObjectPath
+
+Other changes
+-------------
+
+* The test suites now run ``gc.collect()`` in the ``tearDown()``
+ methods. While this makes them take a lot longer to run, it also
+ makes it easier to link a specific test to garbage collection
+ problems that would otherwise appear in later tests.
+
+
+1.3.5 (2007-10-22)
+==================
+
+Features added
+--------------
+
+Bugs fixed
+----------
+
+* lxml.etree could crash when adding more than 10000 namespaces to a
+ document
+
+* lxml failed to serialise namespace declarations of elements other
+ than the root node of a tree
+
+
+1.3.4 (2007-08-30)
+==================
+
+Features added
+--------------
+
+* The ``ElementMaker`` in ``lxml.builder`` now accepts the keyword arguments
+ ``namespace`` and ``nsmap`` to set a namespace and nsmap for the Elements it
+ creates.
+
+* The ``docinfo`` on ElementTree objects has new properties ``internalDTD``
+ and ``externalDTD`` that return a DTD object for the internal or external
+ subset of the document respectively.
+
+* Serialising an ElementTree now includes any internal DTD subsets that are
+ part of the document, as well as comments and PIs that are siblings of the
+ root node.
+
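+A short sketch of the new ``ElementMaker`` keyword arguments described
+above (the namespace URI is invented for the example)::
+
+    from lxml import etree
+    from lxml.builder import ElementMaker
+
+    NS = 'http://example.com/ns'    # example namespace URI
+    E = ElementMaker(namespace=NS, nsmap={'ex': NS})
+
+    root = E.root(E.child('content'))
+    etree.tostring(root)
+    # '<ex:root xmlns:ex="http://example.com/ns"><ex:child>content</ex:child></ex:root>'
+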
+Bugs fixed
+----------
+
+* Parsing with the ``no_network`` option could fail
+
+Other changes
+-------------
+
+* lxml now raises a TagNameWarning about tag names containing ':' instead of
+ an Error as 1.3.3 did. The reason is that a number of projects currently
+ misuse the previous lack of tag name validation to generate namespace
+ prefixes without declaring namespaces. Apart from the danger of generating
+ broken XML this way, it also breaks most of the namespace-aware tools in
+ XML, including XPath, XSLT and validation. lxml 1.3.x will continue to
+ support this bug with a Warning, while lxml 2.0 will be strict about
+ well-formed tag names (not only regarding ':').
+
+* Serialising an Element no longer includes its comment and PI siblings (only
+ ElementTree serialisation includes them).
+
+
+1.3.3 (2007-07-26)
+==================
+
+Features added
+--------------
+
+* ElementTree compatible parser ``ETCompatXMLParser`` strips processing
+ instructions and comments while parsing XML
+
+* Parsers now support stripping PIs (keyword argument 'remove_pis')
+
+* ``etree.fromstring()`` now supports parsing both HTML and XML, depending on
+ the parser you pass.
+
+* Support ``base_url`` keyword argument in ``HTML()`` and ``XML()``
+
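+A brief sketch of the parser-dependent ``fromstring()`` behaviour noted
+above (the input strings are made up)::
+
+    from lxml import etree
+
+    parser = etree.HTMLParser()
+    root = etree.fromstring('<p>broken<br>html', parser)
+    root.tag                            # 'html' - repaired HTML tree
+
+    root = etree.fromstring('<root/>')  # default: XML parser
+    root.tag                            # 'root'
+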
+Bugs fixed
+----------
+
+* Parsing from Python Unicode strings failed on some platforms
+
+* ``Element()`` did not raise an exception on tag names containing ':'
+
+* ``Element.getiterator(tag)`` did not accept ``Comment`` and
+ ``ProcessingInstruction`` as tags. It also accepts ``Element`` now.
+
+
+1.3.2 (2007-07-03)
+==================
+
+Features added
+--------------
+
+Bugs fixed
+----------
+
+* "deallocating None" crash bug
+
+
+1.3.1 (2007-07-02)
+==================
+
+Features added
+--------------
+
+* objectify.DataElement now supports setting values from existing data
+ elements (not just plain Python types) and reuses defined namespaces etc.
+
+* E-factory support for lxml.objectify (``objectify.E``)
+
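+A small, indicative sketch of the objectify additions above::
+
+    from lxml import objectify
+
+    root = objectify.E.root(objectify.E.value(42))
+    root.value                          # data element comparing equal to 42
+
+    # DataElement() now accepts an existing data element as value
+    copy = objectify.DataElement(root.value)
+    copy.pyval                          # 42
+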
+Bugs fixed
+----------
+
+* Better way to prevent crashes in Element proxy cleanup code
+
+* objectify.DataElement didn't set up None value correctly
+
+* objectify.DataElement didn't check the value against the provided type hints
+
+* Reference-counting bug in ``Element.attrib.pop()``
+
+
+1.3 (2007-06-24)
+================
+
+Features added
+--------------
+
+* Module ``lxml.pyclasslookup`` module implements an Element class lookup
+ scheme that can access the entire tree in read-only mode to help determining
+ a suitable Element class
+
+* Parsers take a ``remove_comments`` keyword argument that skips over comments
+
+* ``parse()`` function in ``objectify``, corresponding to ``XML()`` etc.
+
+* ``Element.addnext(el)`` and ``Element.addprevious(el)`` methods to support
+ adding processing instructions and comments around the root node
+
+* ``Element.attrib`` was missing ``clear()`` and ``pop()`` methods
+
+* Extended type annotation in objectify: cleaner annotation namespace setup
+ plus new ``deannotate()`` function
+
+* Support for custom Element class instantiation in lxml.sax: passing a
+ ``makeelement`` function to the ElementTreeContentHandler will reuse the
+ lookup context of that function
+
+* '.' represents empty ObjectPath (identity)
+
+* ``Element.values()`` to accompany the existing ``.keys()`` and ``.items()``
+
+* ``collectAttributes()`` C-function to build a list of attribute
+ keys/values/items for a libxml2 node
+
+* ``DTD`` validator class (like ``RelaxNG`` and ``XMLSchema``)
+
+* HTML generator helpers by Fredrik Lundh in ``lxml.htmlbuilder``
+
+* ``ElementMaker`` XML generator by Fredrik Lundh in ``lxml.builder.E``
+
+* Support for pickling ``objectify.ObjectifiedElement`` objects to XML
+
+* ``update()`` method on Element.attrib
+
+* Optimised replacement for libxml2's _xmlReconciliateNs(). This allows
+  lxml to handle namespaces better when moving elements between documents.
+
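+For example, the ``lxml.builder.E`` factory mentioned above can be used
+roughly like this (a minimal sketch)::
+
+    from lxml import etree
+    from lxml.builder import E
+
+    page = E.html(
+        E.head(E.title('Hello')),
+        E.body(E.p('A paragraph built with the E factory.'))
+    )
+    etree.tostring(page)
+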
+Bugs fixed
+----------
+
+* Removing Elements from a tree could make them lose their namespace
+ declarations
+
+* ``ElementInclude`` didn't honour base URL of original document
+
+* Replacing the children slice of an Element would cut off the tails of the
+ original children
+
+* ``Element.getiterator(tag)`` did not accept ``Comment`` and
+ ``ProcessingInstruction`` as tags
+
+* API functions now check incoming strings for XML conformity. Zero bytes or
+ low ASCII characters are no longer accepted (AssertionError).
+
+* XSLT parsing failed to pass resolver context on to imported documents
+
+* An empty string ('') used as a namespace prefix in nsmap could be
+  passed through to libxml2
+
+* Objectify couldn't handle prefixed XSD type names in ``xsi:type``
+
+* More ET compatible behaviour when writing out XML declarations or not
+
+* More robust error handling in ``iterparse()``
+
+* Documents lost their top-level PIs and comments on serialisation
+
+* lxml.sax failed on comments and PIs. Comments are now properly ignored and
+ PIs are copied.
+
+* Possible memory leaks in namespace handling when moving elements between
+ documents
+
+Other changes
+-------------
+
+* major restructuring in the documentation
+
+
+1.2.1 (2007-02-27)
+==================
+
+Bugs fixed
+----------
+
+* Build fixes for MS compiler
+
+* Item assignments to special names like ``element["text"]`` failed
+
+* Renamed ObjectifiedDataElement.__setText() to _setText() to make it easier
+ to access
+
+* The pattern for attribute names in ObjectPath was too restrictive
+
+
+1.2 (2007-02-20)
+================
+
+Features added
+--------------
+
+* Rich comparison of QName objects
+
+* Support for regular expressions in benchmark selection
+
+* get/set emulation (not .attrib!) for attributes on processing instructions
+
+* ElementInclude Python module for ElementTree compatible XInclude processing
+ that honours custom resolvers registered with the source document
+
+* ElementTree.parser property holds the parser used to parse the document
+
+* setup.py has been refactored for greater readability and flexibility
+
+* The --rpath flag to setup.py, which induces automatic linking-in of dynamic
+  library runtime search paths, has been renamed to --auto-rpath. This makes
+  it possible to pass --rpath directly to distutils; previously it was being
+  shadowed.
+
+Bugs fixed
+----------
+
+* Element instantiation now uses locks to prevent race conditions with threads
+
+* ElementTree.write() did not raise an exception when the file was not writable
+
+* Error handling could crash under Python <= 2.4.1 - fixed by disabling thread
+ support in these environments
+
+* Element.find*() did not accept QName objects as path
+
+Other changes
+-------------
+
+* code cleanup: the redundant _NodeBase super class was merged into the
+  _Element class. Note: although the impact should be zero in most cases,
+  this change breaks the compatibility of the public C-API
+
+
+1.1.2 (2006-10-30)
+==================
+
+Features added
+--------------
+
+* Data elements in objectify support repr(), which is now used by dump()
+
+* Source distribution now ships with a patched Pyrex
+
+* New C-API function makeElement() to create new elements with text,
+ tail, attributes and namespaces
+
+* Reuse original parser flags for XInclude
+
+* Simplified support for handling XSLT processing instructions
+
+Bugs fixed
+----------
+
+* Parser resources were not freed before the next parser run
+
+* Open files and XML strings returned by Python resolvers were not
+ closed/freed
+
+* Crash in the IDDict returned by XMLDTDID
+
+* Copying Comments and ProcessingInstructions failed
+
+* Memory leak for external URLs in _XSLTProcessingInstruction.parseXSL()
+
+* Memory leak when garbage collecting tailed root elements
+
+* HTML script/style content was not propagated to .text
+
+* Text included via XInclude between text nodes is now shown correctly in
+  .text and .tail
+
+* 'integer * objectify.StringElement' operation was not supported
+
+
+1.1.1 (2006-09-21)
+==================
+
+Features added
+--------------
+
+* XSLT profiling support (``profile_run`` keyword)
+
+* countchildren() method on objectify.ObjectifiedElement
+
+* Support custom elements for tree nodes in lxml.objectify
+
+Bugs fixed
+----------
+
+* lxml.objectify failed to support long data values (e.g., "123L")
+
+* Error messages from XSLT did not reach ``XSLT.error_log``
+
+* Factories objectify.Element() and objectify.DataElement() were missing
+ ``attrib`` and ``nsmap`` keyword arguments
+
+* Changing the default parser in lxml.objectify did not update the factories
+ Element() and DataElement()
+
+* Let lxml.objectify.Element() always generate tree elements (not data
+ elements)
+
+* Build under Windows failed ('\0' bug in patched Pyrex version)
+
+
+1.1 (2006-09-13)
+================
+
+Features added
+--------------
+
+* Comments and processing instructions return '<!-- comment -->' and
+ '<?pi-target content?>' for repr()
+
+* Parsers are now the preferred (and default) place where element class lookup
+ schemes should be registered. Namespace lookup is no longer supported by
+ default.
+
+* Support for Python 2.5 beta
+
+* Unlock the GIL for deep copying documents and for XPath()
+
+* New ``compact`` keyword argument for parsing read-only documents
+
+* Support for parser options in iterparse()
+
+* The ``namespace`` axis is supported in XPath and returns (prefix, URI)
+ tuples
+
+* The XPath expression "/" now returns an empty list instead of raising an
+ exception
+
+* XML-Object API on top of lxml (lxml.objectify)
+
+* Customizable Element class lookup:
+
+ * different pre-implemented lookup mechanisms
+
+ * support for externally provided lookup functions
+
+* Support for processing instructions (ET-like, not compatible)
+
+* Public C-level API for independent extension modules
+
+* Module level ``iterwalk()`` function as 'iterparse' for trees
+
+* Module level ``iterparse()`` function similar to ElementTree (see
+ documentation for differences)
+
+* Element.nsmap property returns a mapping of all namespace prefixes known at
+ the Element to their namespace URI
+
+* Reentrant threading support in RelaxNG, XMLSchema and XSLT
+
+* Threading support in parsers and serializers:
+
+ * All in-memory operations (tostring, parse(StringIO), etc.) free the GIL
+
+ * File operations (on file names) free the GIL
+
+ * Reading from file-like objects frees the GIL and reacquires it for reading
+
+ * Serialisation to file-like objects is single-threaded (high lock overhead)
+
+* Element iteration over XPath axes:
+
+ * Element.iterdescendants() iterates over the descendants of an element
+
+ * Element.iterancestors() iterates over the ancestors of an element (from
+ parent to parent)
+
+ * Element.itersiblings() iterates over either the following or preceding
+ siblings of an element
+
+ * Element.iterchildren() iterates over the children of an element in either
+ direction
+
+ * All iterators support the ``tag`` keyword argument to restrict the
+ generated elements
+
+* Element.getnext() and Element.getprevious() return the direct siblings of an
+ element
+
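+A brief sketch of the new iteration methods listed above (sample
+document invented)::
+
+    from lxml import etree
+
+    root = etree.XML('<root><a/><b><c/></b><d/></root>')
+    b = root[1]
+
+    [el.tag for el in b.iterancestors()]               # ['root']
+    [el.tag for el in b.itersiblings()]                # following siblings: ['d']
+    [el.tag for el in root.iterdescendants(tag='c')]   # restricted by tag: ['c']
+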
+Bugs fixed
+----------
+
+* filenames with local 8-bit encoding were not supported
+
+* 1.1beta did not compile under Python 2.3
+
+* ignore unknown 'pyval' attribute values in objectify
+
+* objectify.ObjectifiedElement.addattr() failed to accept Elements and Lists
+
+* objectify.ObjectPath.setattr() failed to accept Elements and Lists
+
+* XPathSyntaxError now inherits from XPathError
+
+* Threading race conditions in RelaxNG and XMLSchema
+
+* Crash when mixing elements from XSLT results into other trees, concurrent
+ XSLT is only allowed when the stylesheet was parsed in the main thread
+
+* The EXSLT ``regexp:match`` function now works as defined (except for some
+ differences in the regular expression syntax)
+
+* After setting element.text to '', reading it back returned None instead
+  of the empty string
+
+* ``iterparse()`` could crash on long XML files
+
+* Creating documents no longer copies the parser for later URL resolving. For
+ performance reasons, only a reference is kept. Resolver updates on the
+ parser will now be reflected by documents that were parsed before the
+ change. Although this should rarely become visible, it is a behavioral
+ change from 1.0.
+
+
+1.0.4 (2006-09-09)
+==================
+
+Features added
+--------------
+
+* List-like ``Element.extend()`` method
+
+Bugs fixed
+----------
+
+* Crash in tail handling in ``Element.replace()``
+
+
+1.0.3 (2006-08-08)
+==================
+
+Features added
+--------------
+
+* Element.replace(old, new) method to replace a subelement by another one
+
+Bugs fixed
+----------
+
+* Crash when mixing elements from XSLT results into other trees
+
+* Copying/deepcopying did not work for ElementTree objects
+
+* Setting an attribute to a non-string value did not raise an exception
+
+* Element.remove() deleted the tail text from the removed Element
+
+
+1.0.2 (2006-06-27)
+==================
+
+Features added
+--------------
+
+* Support for setting a custom default Element class as opposed to namespace
+ specific classes (which still override the default class)
+
+Bugs fixed
+----------
+
+* Rare exceptions in Python list functions were not handled
+
+* Parsing accepted unicode strings with XML encoding declaration in certain
+ cases
+
+* Parsing 8-bit encoded strings from StringIO objects raised an exception
+
+* Module function ``initThread()`` was removed - useless (and never worked)
+
+* XSLT and parser exception messages include the error line number
+
+
+1.0.1 (2006-06-09)
+==================
+
+Features added
+--------------
+
+* Repeated calls to Element.attrib now efficiently return the same instance
+
+Bugs fixed
+----------
+
+* Document deallocation could crash in certain garbage collection scenarios
+
+* Extension function calls in XSLT variable declarations could break the
+ stylesheet and crash on repeated calls
+
+* Deep copying Elements could lose namespaces declared in parents
+
+* Deep copying Elements did not copy tail
+
+* Parsing file(-like) objects failed to load external entities
+
+* Parsing 8-bit strings from file(-like) objects raised an exception
+
+* xsl:include failed when the stylesheet was parsed from a file-like object
+
+* lxml.sax.ElementTreeProducer did not call startDocument() / endDocument()
+
+* MSVC compiler complained about long strings (supports only 2048 bytes)
+
+
+1.0 (2006-06-01)
+================
+
+Features added
+--------------
+
+* Element.getiterator() and the findall() methods support finding arbitrary
+ elements from a namespace (pattern ``{namespace}*``)
+
+* Another speedup in tree iteration code
+
+* General speedup of Python Element object creation and deallocation
+
+* Writing C14N no longer serializes in memory (reduced memory footprint)
+
+* PyErrorLog for error logging through the Python ``logging`` module
+
+* ``Element.getroottree()`` returns an ElementTree for the root node of the
+ document that contains the element.
+
+* ElementTree.getpath(element) returns a simple, absolute XPath expression to
+ find the element in the tree structure
+
+* Error logs have a ``last_error`` attribute for convenience
+
+* Comment texts can be changed through the API
+
+* Formatted output via ``pretty_print`` keyword in serialization functions
+
+* XSLT can block access to file system and network via ``XSLTAccessControl``
+
+* ElementTree.write() no longer serializes in memory (reduced memory
+ footprint)
+
+* Speedup of Element.findall(tag) and Element.getiterator(tag)
+
+* Support for writing the XML representation of Elements and ElementTrees to
+ Python unicode strings via ``etree.tounicode()``
+
+* Support for writing XSLT results to Python unicode strings via ``unicode()``
+
+* Parsing a unicode string no longer copies the string (reduced memory
+ footprint)
+
+* Parsing file-like objects reads chunks rather than the whole file (reduced
+ memory footprint)
+
+* Parsing StringIO objects from the start avoids copying the string (reduced
+ memory footprint)
+
+* Read-only 'docinfo' attribute in ElementTree class holds DOCTYPE
+ information, original encoding and XML version as seen by the parser
+
+* etree module can be compiled without libxslt by commenting out the line
+ ``include "xslt.pxi"`` near the end of the etree.pyx source file
+
+* Better error messages in parser exceptions
+
+* Error reporting also works in XSLT
+
+* Support for custom document loaders (URI resolvers) in parsers and XSLT,
+ resolvers are registered at parser level
+
+* Implementation of exslt:regexp for XSLT based on the Python 're' module,
+ enabled by default, can be switched off with 'regexp=False' keyword argument
+
+* Support for exslt extensions (libexslt) and libxslt extra functions
+ (node-set, document, write, output)
+
+* Substantial speedup in XPath.evaluate()
+
+* HTMLParser for parsing (broken) HTML
+
+* XMLDTDID function parses XML into tuple (root node, ID dict) based on xml:id
+ implementation of libxml2 (as opposed to ET compatible XMLID)
+
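+A couple of the features above in a short, indicative sketch::
+
+    from lxml import etree
+
+    root = etree.XML('<root><child><leaf/></child></root>')
+    leaf = root[0][0]
+
+    tree = leaf.getroottree()
+    tree.getpath(leaf)                          # '/root/child/leaf'
+    etree.tounicode(root, pretty_print=True)    # formatted unicode output
+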
+Bugs fixed
+----------
+
+* Memory leak in Element.__setitem__
+
+* Memory leak in Element.attrib.items() and Element.attrib.values()
+
+* Memory leak in XPath extension functions
+
+* Memory leak in unicode related setup code
+
+* Element now raises ValueError on empty tag names
+
+* Namespace fixing after moving elements between documents could fail if the
+ source document was freed too early
+
+* Setting namespace-less tag names on namespaced elements ('{ns}t' -> 't')
+ didn't reset the namespace
+
+* Unknown constants from newer libxml2 versions could raise exceptions in the
+ error handlers
+
+* lxml.etree compiles much faster
+
+* On libxml2 <= 2.6.22, parsing strings with encoding declaration could fail
+ in certain cases
+
+* Document reference in ElementTree objects was not updated when the root
+ element was moved to a different document
+
+* Running absolute XPath expressions on an Element now evaluates against the
+ root tree
+
+* Evaluating absolute XPath expressions (``/*``) on an ElementTree could fail
+
+* Crashes when calling XSLT, RelaxNG, etc. with uninitialized ElementTree
+ objects
+
+* Removed public function ``initThreadLogging()``, replaced by more general
+ ``initThread()`` which fixes a number of setup problems in threads
+
+* Memory leak when using iconv encoders in tostring/write
+
+* Deep copying Elements and ElementTrees maintains the document information
+
+* Serialization functions raise LookupError for unknown encodings
+
+* Memory deallocation crash resulting from deep copying elements
+
+* Some ElementTree methods could crash if the root node was not initialized
+ (neither file nor element passed to the constructor)
+
+* Element/SubElement failed to set attribute namespaces from passed ``attrib``
+ dictionary
+
+* ``tostring()`` adds an XML declaration for non-ASCII encodings
+
+* ``tostring()`` failed to serialize encodings that contain 0-bytes
+
+* ElementTree.xpath() and XPathDocumentEvaluator were not using the
+ ElementTree root node as reference point
+
+* Calling ``document('')`` in XSLT failed to return the stylesheet
+
+
+0.9.2 (2006-05-10)
+==================
+
+Features added
+--------------
+
+* Speedup for Element.makeelement(): the new element reuses the original
+ libxml2 document instead of creating a new empty one
+
+* Speedup for reversed() iteration over element children (Py2.4+ only)
+
+* ElementTree compatible QName class
+
+* RelaxNG and XMLSchema accept any Element, not only ElementTrees
+
+Bugs fixed
+----------
+
+* str(xslt_result) was broken for XSLT output other than UTF-8
+
+* Memory leak if write_c14n fails to write the file after conversion
+
+* Crash in XMLSchema and RelaxNG when passing non-schema documents
+
+* Memory leak in RelaxNG() when RelaxNGParseError is raised
+
+0.9.1 (2006-03-30)
+==================
+
+Features added
+--------------
+
+* lxml.sax.ElementTreeContentHandler checks closing elements and raises
+ SaxError on mismatch
+
+* lxml.sax.ElementTreeContentHandler supports namespace-less SAX events
+ (startElement, endElement) and defaults to empty attributes (keyword
+ argument)
+
+* Speedup for repeatedly accessing element tag names
+
+* Minor API performance improvements
+
+Bugs fixed
+----------
+
+* Memory deallocation bug when using XSLT output method "html"
+
+* sax.py was handling UTF-8 encoded tag names where it shouldn't
+
+* lxml.tests package will no longer be installed (is still in source tar)
+
+0.9 (2006-03-20)
+================
+
+Features added
+--------------
+
+* Error logging API for libxml2 error messages
+
+* Various performance improvements
+
+* Benchmark script for lxml, ElementTree and cElementTree
+
+* Support for registering extension functions through new FunctionNamespace
+ class (see doc/extensions.txt)
+
+* ETXPath class for XPath expressions in ElementTree notation ('//{ns}tag')
+
+* Support for variables in XPath expressions (also in XPath class)
+
+* XPath class for compiled XPath expressions
+
+* XMLID module level function (ElementTree compatible)
+
+* XMLParser API for customized libxml2 parser configuration
+
+* Support for custom Element classes through new Namespace API (see
+ doc/namespace_extensions.txt)
+
+* Common exception base class LxmlError for module exceptions
+
+* real iterator support in iter(Element), Element.getiterator()
+
+* XSLT objects are callable, result trees support str()
+
+* Added MANIFEST.in for easier creation of RPM files.
+
+* 'getparent' method on elements allows navigation to an element's
+ parent element.
+
+* Python core compatible SAX tree builder and SAX event generator. See
+ doc/sax.txt for more information.
+
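+The compiled ``XPath`` class and XPath variables mentioned above,
+sketched briefly::
+
+    from lxml import etree
+
+    find = etree.XPath('//*[local-name() = $name]')
+    root = etree.XML('<root><a/><b/></root>')
+
+    [el.tag for el in find(root, name='a')]     # ['a']
+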
+Bugs fixed
+----------
+
+* Segfaults and memory leaks in various API functions of Element
+
+* Segfault in XSLT.tostring()
+
+* ElementTree objects no longer interfere, Elements can be root of different
+ ElementTrees at the same time
+
+* document('') works in XSLT documents read from files (in-memory documents
+ cannot support this due to libxslt deficiencies)
+
+0.8 (2005-11-03)
+================
+
+Features added
+--------------
+
+* Support for copy.deepcopy() on elements. copy.copy() works also, but
+ does the same thing, and does *not* create a shallow copy, as that
+ makes no sense in the context of libxml2 trees. This means a
+ potential incompatibility with ElementTree, but there's more chance
+ that it works than if copy.copy() isn't supported at all.
+
+* Increased compatibility with (c)ElementTree; .parse() on ElementTree is
+ supported and parsing of gzipped XML files works.
+
+* implemented index() on elements, allowing one to find the index of a
+ SubElement.
+
+Bugs fixed
+----------
+
+* Use xslt-config instead of xml2-config to find out libxml2
+  directories, to take into account the case where libxslt is installed
+  in a different directory than libxml2.
+
+* Eliminate crash condition in iteration when text nodes are changed.
+
+* Passing 'None' to tostring() does not result in a segfault anymore,
+ but an AssertionError.
+
+* Some test fixes for Windows.
+
+* Raise XMLSyntaxError and XPathSyntaxError instead of plain python
+ syntax errors. This should be less confusing.
+
+* Fixed error with uncaught exception in Pyrex code.
+
+* Calling lxml.etree.fromstring('') throws XMLSyntaxError instead of a
+ segfault.
+
+* has_key() works on attrib. 'in' tests also work correctly on attrib.
+
+* INSTALL.txt said 2.2.16 instead of 2.6.16 as the supported libxml2
+  version; this has been corrected.
+
+* Passing a UTF-8 encoded string to the XML() function would fail;
+ fixed.
+
+0.7 (2005-06-15)
+================
+
+Features added
+--------------
+
+* parameters (XPath expressions) can be passed to XSLT using keyword
+ parameters.
+
+* Simple XInclude support. Calling the xinclude() method on a tree
+ will process any XInclude statements in the document.
+
+* XMLSchema support. Use the XMLSchema class or the convenience
+ xmlschema() method on a tree to do XML Schema (XSD) validation.
+
+* Added convenience xslt() method on tree. This is less efficient
+ than the XSLT object, but makes it easier to write quick code.
+
+* Added convenience relaxng() method on tree. This is less efficient
+ than the RelaxNG object, but makes it easier to write quick code.
+
+* Make it possible to use XPathEvaluator with elements as well. The
+ XPathEvaluator in this case will retain the element so multiple
+ XPath queries can be made against one element efficiently. This
+ replaces the second argument to the .evaluate() method that existed
+ previously.
+
+* Allow registerNamespace() to be called on an XPathEvaluator, after
+ creation, to add additional namespaces. Also allow registerNamespaces(),
+ which does the same for a namespace dictionary.
+
+* Add 'prefix' attribute to element to be able to read prefix information.
+ This is entirely read-only.
+
+* It is possible to supply an extra nsmap keyword parameter to
+ the Element() and SubElement() constructors, which supplies a
+ prefix to namespace URI mapping. This will create namespace
+ prefix declarations on these elements and these prefixes will show up
+ in XML serialization.
+
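+A minimal sketch of the ``nsmap`` and ``prefix`` additions above (the
+namespace URI is invented for the example)::
+
+    from lxml import etree
+
+    NS = 'http://example.com/ns'
+    root = etree.Element('{%s}root' % NS, nsmap={'ex': NS})
+    child = etree.SubElement(root, '{%s}child' % NS)
+
+    child.prefix          # 'ex'
+    etree.tostring(root)
+    # '<ex:root xmlns:ex="http://example.com/ns"><ex:child/></ex:root>'
+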
+Bugs fixed
+----------
+
+* Killed yet another memory management related bug: trees created
+ using newDoc would not get a libxml2-level dictionary, which caused
+ problems when deallocating these documents later if they contained a
+ node that came from a document with a dictionary.
+
+* Moving namespaced elements between documents was problematic as
+ references to the original document would remain. This has been fixed
+ by applying xmlReconciliateNs() after each move operation.
+
+* Can pass None to 'dump()' without segfaults.
+
+* tostring() works properly for non-root elements as well.
+
+* Cleaned out the tostring() method so it should handle encoding
+ correctly.
+
+* Cleaned out the ElementTree.write() method so it should handle encoding
+ correctly. Writing directly to a file should also be faster, as there is no
+ need to go through a Python string in that case. Made sure the test cases
+ test both serializing to StringIO as well as serializing to a real file.
+
+0.6 (2005-05-14)
+================
+
+Features added
+--------------
+
+* Changed setup.py so that library_dirs is also guessed. This should
+ help with compilation on the Mac OS X platform, where otherwise the
+ wrong library (shipping with the OS) could be picked up.
+
+* Tweaked setup.py so that it picks up the version from version.txt.
+
+Bugs fixed
+----------
+
+* Do the right thing when handling namespaced attributes.
+
+* fix bug where tostring() moved nodes into new documents. tostring()
+ had very nasty side-effects before this fix, sorry!
+
+0.5.1 (2005-04-09)
+==================
+
+* Python 2.2 compatibility fixes.
+
+* unicode fixes in Element() and Comment() as well as XML(); unicode
+ input wasn't properly being UTF-8 encoded.
+
+0.5 (2005-04-08)
+================
+
+Initial public release.
diff --git a/CREDITS.txt b/CREDITS.txt
new file mode 100644
index 0000000..45f9d5d
--- /dev/null
+++ b/CREDITS.txt
@@ -0,0 +1,83 @@
+=======
+Credits
+=======
+
+Main contributors
+=================
+
+Stefan Behnel
+ main developer and maintainer
+
+Martijn Faassen
+ creator of lxml and initial main developer
+
+Ian Bicking
+ creator and maintainer of lxml.html
+
+Holger Joukl
+ ISO-Schematron support, development on lxml.objectify, bug reports, feedback
+
+Simon Sapin
+ external maintenance and development of the cssselect package
+
+Marc-Antoine Parent
+ XPath extension function help and patches
+
+Olivier Grisel
+ improved (c)ElementTree compatibility patches,
+ website improvements.
+
+Kasimier Buchcik
+ help with specs and libxml2
+
+Florian Wagner
+ help with copy.deepcopy support, bug reporting
+
+Emil Kroymann
+ help with encoding support, bug reporting
+
+Paul Everitt
+ bug reporting, feedback on API design
+
+Victor Ng
+ Discussions on memory management strategies, vlibxml2
+
+Robert Kern
+ feedback on API design
+
+Andreas Pakulat
+ rpath linking support, doc improvements
+
+David Sankel
+ building statically on Windows
+
+Marcin Kasperski
+ PDF documentation generation
+
+Sidnei da Silva
+ official MS Windows builds
+
+Pascal Oberndörfer
+ official Mac-OS builds
+
+... and lots of other people who contributed to lxml by reporting
+bugs, discussing its functionality or blaming the docs for the bugs in
+their code. Thank you all, user feedback and discussions form a very
+important part of an Open Source project!
+
+
+Special thanks goes to:
+=======================
+
+* Daniel Veillard and the libxml2 project for a great XML library.
+
+* Fredrik Lundh for ElementTree, its API, and the competition through
+ cElementTree.
+
+* Greg Ewing (Pyrex) and Robert Bradshaw et al. (Cython) for the
+ binding technology.
+
+* Jonathan Stoppani for hosting the new mailing list on lxml.de.
+
+* the codespeak crew, in particular Philipp von Weitershausen and
+ Holger Krekel for originally hosting lxml on codespeak.net
diff --git a/DD.py b/DD.py
new file mode 100644
index 0000000..47dfec7
--- /dev/null
+++ b/DD.py
@@ -0,0 +1,916 @@
+#! /usr/bin/env python
+# $Id: DD.py,v 1.2 2001/11/05 19:53:33 zeller Exp $
+# Enhanced Delta Debugging class
+# Copyright (c) 1999, 2000, 2001 Andreas Zeller.
+
+# This module (written in Python) implements the base delta debugging
+# algorithms and is at the core of all our experiments. This should
+# easily run on any platform and any Python version since 1.6.
+#
+# To plug this into your system, all you have to do is to create a
+# subclass with a dedicated `test()' method. Basically, you would
+# invoke the DD test case minimization algorithm (= the `ddmin()'
+# method) with a list of characters; the `test()' method would combine
+# them to a document and run the test. This should be easy to realize
+# and give you some good starting results; the file includes a simple
+# sample application.
+#
+# This file is in the public domain; feel free to copy, modify, use
+# and distribute this software as you wish - with one exception.
+# Passau University has filed a patent for the use of delta debugging
+# on program states (A. Zeller: `Isolating cause-effect chains',
+# Saarland University, 2001). The fact that this file is publicly
+# available does not imply that I or anyone else grants you any rights
+# related to this patent.
+#
+# The use of Delta Debugging to isolate failure-inducing code changes
+# (A. Zeller: `Yesterday, my program worked', ESEC/FSE 1999) or to
+# simplify failure-inducing input (R. Hildebrandt, A. Zeller:
+# `Simplifying failure-inducing input', ISSTA 2000) is, as far as I
+# know, not covered by any patent, nor will it ever be. If you use
+# this software in any way, I'd appreciate if you include a citation
+# such as `This software uses the delta debugging algorithm as
+# described in (insert one of the papers above)'.
+#
+# All about Delta Debugging is found at the delta debugging web site,
+#
+# http://www.st.cs.uni-sb.de/dd/
+#
+# Happy debugging,
+#
+# Andreas Zeller
+
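+# A minimal sketch of that pattern (`run_my_test()' and `failing_input'
+# are stand-ins for whatever actually exercises the failing program; see
+# the runnable sample application at the end of this file for a complete
+# example):
+#
+#     class MyCharDD(DD):
+#         def _test(self, c):
+#             document = ''.join(c)        # combine the characters
+#             if run_my_test(document):    # your own test harness
+#                 return self.FAIL         # failure reproduced
+#             return self.PASS
+#
+#     minimal = MyCharDD().ddmin(list(failing_input))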
+
+# Start with some helpers.
+class OutcomeCache(object):
+ # This class holds test outcomes for configurations. This avoids
+ # running the same test twice.
+
+ # The outcome cache is implemented as a tree. Each node points
+ # to the outcome of the remaining list.
+ #
+ # Example: ([1, 2, 3], PASS), ([1, 2], FAIL), ([1, 4, 5], FAIL):
+ #
+ # (2, FAIL)--(3, PASS)
+ # /
+ # (1, None)
+ # \
+ # (4, None)--(5, FAIL)
+
+ def __init__(self):
+ self.tail = {} # Points to outcome of tail
+ self.result = None # Result so far
+
+ def add(self, c, result):
+ """Add (C, RESULT) to the cache. C must be a list of scalars."""
+ cs = c[:]
+ cs.sort()
+
+ p = self
+ for start in c:
+ if start not in p.tail:
+ p.tail[start] = OutcomeCache()
+ p = p.tail[start]
+
+ p.result = result
+
+ def lookup(self, c):
+ """Return RESULT if (C, RESULT) is in the cache; None, otherwise."""
+ p = self
+ for start in c:
+ if start not in p.tail:
+ return None
+ p = p.tail[start]
+
+ return p.result
+
+ def lookup_superset(self, c, start = 0):
+ """Return RESULT if there is some (C', RESULT) in the cache with
+ C' being a superset of C or equal to C. Otherwise, return None."""
+
+ # FIXME: Make this non-recursive!
+ if start >= len(c):
+ if self.result:
+ return self.result
+ elif self.tail != {}:
+ # Select some superset
+ superset = self.tail[list(self.tail.keys())[0]]
+ return superset.lookup_superset(c, start + 1)
+ else:
+ return None
+
+ if c[start] in self.tail:
+ return self.tail[c[start]].lookup_superset(c, start + 1)
+
+ # Let K0 be the largest element in TAIL such that K0 <= C[START]
+ k0 = None
+ for k in self.tail.keys():
+ if (k0 is None or k > k0) and k <= c[start]:
+ k0 = k
+
+ if k0 is not None:
+ return self.tail[k0].lookup_superset(c, start)
+
+ return None
+
+ def lookup_subset(self, c):
+ """Return RESULT if there is some (C', RESULT) in the cache with
+ C' being a subset of C or equal to C. Otherwise, return None."""
+ p = self
+ for start in range(len(c)):
+ if c[start] in p.tail:
+ p = p.tail[c[start]]
+
+ return p.result
+
+
+
+
+# Test the outcome cache
+def oc_test():
+ oc = OutcomeCache()
+
+ assert oc.lookup([1, 2, 3]) is None
+ oc.add([1, 2, 3], 4)
+ assert oc.lookup([1, 2, 3]) == 4
+ assert oc.lookup([1, 2, 3, 4]) is None
+
+ assert oc.lookup([5, 6, 7]) is None
+ oc.add([5, 6, 7], 8)
+ assert oc.lookup([5, 6, 7]) == 8
+
+ assert oc.lookup([]) is None
+ oc.add([], 0)
+ assert oc.lookup([]) == 0
+
+ assert oc.lookup([1, 2]) is None
+ oc.add([1, 2], 3)
+ assert oc.lookup([1, 2]) == 3
+ assert oc.lookup([1, 2, 3]) == 4
+
+ assert oc.lookup_superset([1]) == 3 or oc.lookup_superset([1]) == 4
+ assert oc.lookup_superset([1, 2]) == 3 or oc.lookup_superset([1, 2]) == 4
+ assert oc.lookup_superset([5]) == 8
+ assert oc.lookup_superset([5, 6]) == 8
+ assert oc.lookup_superset([6, 7]) == 8
+ assert oc.lookup_superset([7]) == 8
+ assert oc.lookup_superset([]) is not None
+
+ assert oc.lookup_superset([9]) is None
+ assert oc.lookup_superset([7, 9]) is None
+ assert oc.lookup_superset([-5, 1]) is None
+ assert oc.lookup_superset([1, 2, 3, 9]) is None
+ assert oc.lookup_superset([4, 5, 6, 7]) is None
+
+ assert oc.lookup_subset([]) == 0
+ assert oc.lookup_subset([1, 2, 3]) == 4
+ assert oc.lookup_subset([1, 2, 3, 4]) == 4
+ assert oc.lookup_subset([1, 3]) is None
+ assert oc.lookup_subset([1, 2]) == 3
+
+ assert oc.lookup_subset([-5, 1]) is None
+ assert oc.lookup_subset([-5, 1, 2]) == 3
+ assert oc.lookup_subset([-5]) == 0
+
+
+# Main Delta Debugging algorithm.
+class DD(object):
+ # Delta debugging base class. To use this class for a particular
+ # setting, create a subclass with an overloaded `test()' method.
+ #
+ # Main entry points are:
+ # - `ddmin()' which computes a minimal failure-inducing configuration, and
+ # - `dd()' which computes a minimal failure-inducing difference.
+ #
+ # See also the usage sample at the end of this file.
+ #
+ # For further fine-tuning, you can implement an own `resolve()'
+ # method (tries to add or remove configuration elements in case of
+ # inconsistencies), or implement an own `split()' method, which
+ # allows you to split configurations according to your own
+ # criteria.
+ #
+ # The class includes other previous delta debugging algorithms,
+ # which are obsolete now; they are only included for comparison
+ # purposes.
+
+ # Test outcomes.
+ PASS = "PASS"
+ FAIL = "FAIL"
+ UNRESOLVED = "UNRESOLVED"
+
+ # Resolving directions.
+ ADD = "ADD" # Add deltas to resolve
+ REMOVE = "REMOVE" # Remove deltas to resolve
+
+ # Debugging output (set to 1 to enable)
+ debug_test = 0
+ debug_dd = 0
+ debug_split = 0
+ debug_resolve = 0
+
+ def __init__(self):
+ self.__resolving = 0
+ self.__last_reported_length = 0
+ self.monotony = 0
+ self.outcome_cache = OutcomeCache()
+ self.cache_outcomes = 1
+ self.minimize = 1
+ self.maximize = 1
+ self.assume_axioms_hold = 1
+
+ # Helpers
+ def __listminus(self, c1, c2):
+ """Return a list of all elements of C1 that are not in C2."""
+ s2 = {}
+ for delta in c2:
+ s2[delta] = 1
+
+ c = []
+ for delta in c1:
+ if delta not in s2:
+ c.append(delta)
+
+ return c
+
+ def __listintersect(self, c1, c2):
+ """Return the common elements of C1 and C2."""
+ s2 = {}
+ for delta in c2:
+ s2[delta] = 1
+
+ c = []
+ for delta in c1:
+ if delta in s2:
+ c.append(delta)
+
+ return c
+
+ def __listunion(self, c1, c2):
+ """Return the union of C1 and C2."""
+ s1 = {}
+ for delta in c1:
+ s1[delta] = 1
+
+ c = c1[:]
+ for delta in c2:
+ if delta not in s1:
+ c.append(delta)
+
+ return c
+
+ def __listsubseteq(self, c1, c2):
+ """Return 1 if C1 is a subset or equal to C2."""
+ s2 = {}
+ for delta in c2:
+ s2[delta] = 1
+
+ for delta in c1:
+ if delta not in s2:
+ return 0
+
+ return 1
+
+ # Output
+ def coerce(self, c):
+ """Return the configuration C as a compact string"""
+ # Default: use printable representation
+ return repr(c)
+
+ def pretty(self, c):
+ """Like coerce(), but sort beforehand"""
+ sorted_c = c[:]
+ sorted_c.sort()
+ return self.coerce(sorted_c)
+
+ # Testing
+ def test(self, c):
+ """Test the configuration C. Return PASS, FAIL, or UNRESOLVED"""
+ c.sort()
+
+ # If we had this test before, return its result
+ if self.cache_outcomes:
+ cached_result = self.outcome_cache.lookup(c)
+ if cached_result is not None:
+ return cached_result
+
+ if self.monotony:
+ # Check whether we had a passing superset of this test before
+ cached_result = self.outcome_cache.lookup_superset(c)
+ if cached_result == self.PASS:
+ return self.PASS
+
+ cached_result = self.outcome_cache.lookup_subset(c)
+ if cached_result == self.FAIL:
+ return self.FAIL
+
+ if self.debug_test:
+ print('')
+ print("test(%s)..." % (self.coerce(c),))
+
+ outcome = self._test(c)
+
+ if self.debug_test:
+ print("test(%s) = %r" % (self.coerce(c), outcome))
+
+ if self.cache_outcomes:
+ self.outcome_cache.add(c, outcome)
+
+ return outcome
+
+ def _test(self, c):
+ """Stub to overload in subclasses"""
+ return self.UNRESOLVED # Placeholder
+
+
+ # Splitting
+ def split(self, c, n):
+ """Split C into [C_1, C_2, ..., C_n]."""
+ if self.debug_split:
+ print("split(%s, %r)..." % (self.coerce(c), n))
+
+ outcome = self._split(c, n)
+
+ if self.debug_split:
+ print("split(%s, %r) = %r" % (self.coerce(c), n, outcome))
+
+ return outcome
+
+ def _split(self, c, n):
+ """Stub to overload in subclasses"""
+ subsets = []
+ start = 0
+ for i in range(n):
+ subset = c[start:start + (len(c) - start) // (n - i)]
+ subsets.append(subset)
+ start = start + len(subset)
+ return subsets
+
+
+ # Resolving
+ def resolve(self, csub, c, direction):
+ """If direction == ADD, resolve inconsistency by adding deltas
+ to CSUB. Otherwise, resolve by removing deltas from CSUB."""
+
+ if self.debug_resolve:
+ print("resolve(%r, %s, %r)..." % (csub, self.coerce(c), direction))
+
+ outcome = self._resolve(csub, c, direction)
+
+ if self.debug_resolve:
+ print("resolve(%r, %s, %r) = %r" % (csub, self.coerce(c), direction, outcome))
+
+ return outcome
+
+
+ def _resolve(self, csub, c, direction):
+ """Stub to overload in subclasses."""
+ # By default, no way to resolve
+ return None
+
+
+ # Test with fixes
+ def test_and_resolve(self, csub, r, c, direction):
+ """Repeat testing CSUB + R while unresolved."""
+
+ initial_csub = csub[:]
+ c2 = self.__listunion(r, c)
+
+ csubr = self.__listunion(csub, r)
+ t = self.test(csubr)
+
+ # necessary to use more resolving mechanisms which can reverse each
+ # other, can (but needn't) be used in subclasses
+ self._resolve_type = 0
+
+ while t == self.UNRESOLVED:
+ self.__resolving = 1
+ csubr = self.resolve(csubr, c, direction)
+
+ if csubr is None:
+ # Nothing left to resolve
+ break
+
+ if len(csubr) >= len(c2):
+ # Added everything: csub == c2. ("Upper" Baseline)
+ # This has already been tested.
+ csubr = None
+ break
+
+ if len(csubr) <= len(r):
+ # Removed everything: csub == r. (Baseline)
+ # This has already been tested.
+ csubr = None
+ break
+
+ t = self.test(csubr)
+
+ self.__resolving = 0
+ if csubr is None:
+ return self.UNRESOLVED, initial_csub
+
+ # assert t == self.PASS or t == self.FAIL
+ csub = self.__listminus(csubr, r)
+ return t, csub
+
+ # Inquiries
+ def resolving(self):
+ """Return 1 while resolving."""
+ return self.__resolving
+
+
+ # Logging
+ def report_progress(self, c, title):
+ if len(c) != self.__last_reported_length:
+ print('')
+ print("%s: %d deltas left: %s" % (title, len(c), self.coerce(c)))
+ self.__last_reported_length = len(c)
+
+
+ # Delta Debugging (old ESEC/FSE version)
+ def old_dd(self, c, r = [], n = 2):
+ """Return the failure-inducing subset of C"""
+
+        assert self.test([]) == self.PASS
+        assert self.test(c) == self.FAIL
+
+ if self.debug_dd:
+ print("dd(%s, %r, %r)..." % (self.pretty(c), r, n))
+
+ outcome = self._old_dd(c, r, n)
+
+ if self.debug_dd:
+ print("dd(%s, %r, %r) = %r" % (self.pretty(c), r, n, outcome))
+
+ return outcome
+
+ def _old_dd(self, c, r, n):
+ """Stub to overload in subclasses"""
+
+ if not r:
+ assert self.test([]) == self.PASS
+ assert self.test(c) == self.FAIL
+ else:
+ assert self.test(r) != self.FAIL
+ assert self.test(c + r) != self.PASS
+
+ assert self.__listintersect(c, r) == []
+
+ if len(c) == 1:
+ # Nothing to split
+ return c
+
+ run = 1
+ next_c = c[:]
+ next_r = r[:]
+
+ # We replace the tail recursion from the paper by a loop
+ while 1:
+ self.report_progress(c, "dd")
+
+ cs = self.split(c, n)
+
+ print('')
+ print("dd (run #%r): trying %s" % (run, ' + '.join(map(str, cs))))
+ print('')
+
+ # Check subsets
+ ts = []
+ for i in range(n):
+ if self.debug_dd:
+ print("dd: trying cs[%d] = %s" % (i, self.pretty(cs[i])))
+
+ t, cs[i] = self.test_and_resolve(cs[i], r, c, self.REMOVE)
+ ts.append(t)
+ if t == self.FAIL:
+ # Found
+ if self.debug_dd:
+ print("dd: found %d deltas: %s" % (len(cs[i]), self.pretty(cs[i])))
+ return self.dd(cs[i], r)
+
+ # Check complements
+ cbars = []
+ tbars = []
+
+ for i in range(n):
+ cbar = self.__listminus(c, cs[i] + r)
+ tbar, cbar = self.test_and_resolve(cbar, r, c, self.ADD)
+
+
+ doubled = self.__listintersect(cbar, cs[i])
+ if doubled:
+ cs[i] = self.__listminus(cs[i], doubled)
+
+
+ cbars.append(cbar)
+ tbars.append(tbar)
+
+ if ts[i] == self.PASS and tbars[i] == self.PASS:
+ # Interference
+ if self.debug_dd:
+ print("dd: interference of %s and %s" % (self.pretty(cs[i]), self.pretty(cbars[i])))
+
+ d = self.dd(cs[i][:], cbars[i] + r)
+ dbar = self.dd(cbars[i][:], cs[i] + r)
+ return d + dbar
+
+ if ts[i] == self.UNRESOLVED and tbars[i] == self.PASS:
+ # Preference
+ if self.debug_dd:
+ print("dd: preferring %d deltas: %s" % (len(cs[i]), self.pretty(cs[i])))
+
+ return self.dd(cs[i][:], cbars[i] + r)
+
+ if ts[i] == self.PASS or tbars[i] == self.FAIL:
+ if self.debug_dd:
+ excluded = self.__listminus(next_c, cbars[i])
+ print("dd: excluding %d deltas: %s" % (len(excluded), self.pretty(excluded)))
+
+ if ts[i] == self.PASS:
+ next_r = self.__listunion(next_r, cs[i])
+ next_c = self.__listintersect(next_c, cbars[i])
+ self.report_progress(next_c, "dd")
+
+ next_n = min(len(next_c), n * 2)
+
+ if next_n == n and next_c[:] == c[:] and next_r[:] == r[:]:
+ # Nothing left
+ if self.debug_dd:
+ print("dd: nothing left")
+ return next_c
+
+ # Try again
+ if self.debug_dd:
+ print("dd: try again")
+
+ c = next_c
+ r = next_r
+ n = next_n
+ run = run + 1
+
+
+ def test_mix(self, csub, c, direction):
+ if self.minimize:
+ (t, csub) = self.test_and_resolve(csub, [], c, direction)
+ if t == self.FAIL:
+ return t, csub
+
+ if self.maximize:
+ csubbar = self.__listminus(self.CC, csub)
+ cbar = self.__listminus(self.CC, c)
+ if direction == self.ADD:
+ directionbar = self.REMOVE
+ else:
+ directionbar = self.ADD
+
+ (tbar, csubbar) = self.test_and_resolve(csubbar, [], cbar,
+ directionbar)
+
+ csub = self.__listminus(self.CC, csubbar)
+
+ if tbar == self.PASS:
+ t = self.FAIL
+ elif tbar == self.FAIL:
+ t = self.PASS
+ else:
+ t = self.UNRESOLVED
+
+ return t, csub
+
+
+ # Delta Debugging (new ISSTA version)
+ def ddgen(self, c, minimize, maximize):
+ """Return a 1-minimal failing subset of C"""
+
+ self.minimize = minimize
+ self.maximize = maximize
+
+ n = 2
+ self.CC = c
+
+ if self.debug_dd:
+ print("dd(%s, %r)..." % (self.pretty(c), n))
+
+ outcome = self._dd(c, n)
+
+ if self.debug_dd:
+ print("dd(%s, %r) = %r" % (self.pretty(c), n, outcome))
+
+ return outcome
+
+ def _dd(self, c, n):
+ """Stub to overload in subclasses"""
+
+ assert self.test([]) == self.PASS
+
+ run = 1
+ cbar_offset = 0
+
+ # We replace the tail recursion from the paper by a loop
+ while 1:
+ tc = self.test(c)
+ assert tc == self.FAIL or tc == self.UNRESOLVED
+
+ if n > len(c):
+ # No further minimizing
+ print("dd: done")
+ return c
+
+ self.report_progress(c, "dd")
+
+ cs = self.split(c, n)
+
+ print('')
+ print("dd (run #%d): trying %s" % (run, ' + '.join(map(str, cs))))
+ print('')
+
+ c_failed = 0
+ cbar_failed = 0
+
+ next_c = c[:]
+ next_n = n
+
+ # Check subsets
+ for i in range(n):
+ if self.debug_dd:
+ print("dd: trying %s" % (self.pretty(cs[i]),))
+
+ (t, cs[i]) = self.test_mix(cs[i], c, self.REMOVE)
+
+ if t == self.FAIL:
+ # Found
+ if self.debug_dd:
+ print("dd: found %d deltas: %s" % (len(cs[i]), self.pretty(cs[i])))
+
+ c_failed = 1
+ next_c = cs[i]
+ next_n = 2
+ cbar_offset = 0
+ self.report_progress(next_c, "dd")
+ break
+
+ if not c_failed:
+ # Check complements
+ cbars = n * [self.UNRESOLVED]
+
+ # print "cbar_offset =", cbar_offset
+
+ for j in range(n):
+ i = int((j + cbar_offset) % n)
+ cbars[i] = self.__listminus(c, cs[i])
+ t, cbars[i] = self.test_mix(cbars[i], c, self.ADD)
+
+ doubled = self.__listintersect(cbars[i], cs[i])
+ if doubled:
+ cs[i] = self.__listminus(cs[i], doubled)
+
+ if t == self.FAIL:
+ if self.debug_dd:
+ print("dd: reduced to %d deltas: %s" % (len(cbars[i]), self.pretty(cbars[i])))
+
+ cbar_failed = 1
+ next_c = self.__listintersect(next_c, cbars[i])
+ next_n = next_n - 1
+ self.report_progress(next_c, "dd")
+
+ # In next run, start removing the following subset
+ cbar_offset = i
+ break
+
+ if not c_failed and not cbar_failed:
+ if n >= len(c):
+ # No further minimizing
+ print("dd: done")
+ return c
+
+ next_n = min(len(c), n * 2)
+ print("dd: increase granularity to %d" % next_n)
+                cbar_offset = (cbar_offset * next_n) // n
+
+ c = next_c
+ n = next_n
+ run = run + 1
+
+ def ddmin(self, c):
+ return self.ddgen(c, 1, 0)
+
+ def ddmax(self, c):
+ return self.ddgen(c, 0, 1)
+
+ def ddmix(self, c):
+ return self.ddgen(c, 1, 1)
+
+
+ # General delta debugging (new TSE version)
+ def dddiff(self, c):
+ n = 2
+
+ if self.debug_dd:
+ print("dddiff(%s, %d)..." % (self.pretty(c), n))
+
+ outcome = self._dddiff([], c, n)
+
+ if self.debug_dd:
+ print("dddiff(%s, %d) = %r" % (self.pretty(c), n, outcome))
+
+ return outcome
+
+ def _dddiff(self, c1, c2, n):
+ run = 1
+ cbar_offset = 0
+
+ # We replace the tail recursion from the paper by a loop
+ while 1:
+ if self.debug_dd:
+ print("dd: c1 = %s" % (self.pretty(c1),))
+ print("dd: c2 = %s" % (self.pretty(c2),))
+
+ if self.assume_axioms_hold:
+ t1 = self.PASS
+ t2 = self.FAIL
+ else:
+ t1 = self.test(c1)
+ t2 = self.test(c2)
+
+ assert t1 == self.PASS
+ assert t2 == self.FAIL
+ assert self.__listsubseteq(c1, c2)
+
+ c = self.__listminus(c2, c1)
+
+ if self.debug_dd:
+ print("dd: c2 - c1 = %s" % (self.pretty(c),))
+
+ if n > len(c):
+ # No further minimizing
+ print("dd: done")
+ return c, c1, c2
+
+ self.report_progress(c, "dd")
+
+ cs = self.split(c, n)
+
+ print('')
+ print("dd (run #%d): trying %s" % (run, ' + '.join(map(str, cs))))
+ print('')
+
+ progress = 0
+
+ next_c1 = c1[:]
+ next_c2 = c2[:]
+ next_n = n
+
+ # Check subsets
+ for j in range(n):
+ i = int((j + cbar_offset) % n)
+
+ if self.debug_dd:
+ print("dd: trying %s" % (self.pretty(cs[i]),))
+
+ (t, csub) = self.test_and_resolve(cs[i], c1, c, self.REMOVE)
+ csub = self.__listunion(c1, csub)
+
+ if t == self.FAIL and t1 == self.PASS:
+ # Found
+ progress = 1
+ next_c2 = csub
+ next_n = 2
+ cbar_offset = 0
+
+ if self.debug_dd:
+ print("dd: reduce c2 to %d deltas: %s" % (len(next_c2), self.pretty(next_c2)))
+ break
+
+ if t == self.PASS and t2 == self.FAIL:
+ # Reduce to complement
+ progress = 1
+ next_c1 = csub
+ next_n = max(next_n - 1, 2)
+ cbar_offset = i
+
+ if self.debug_dd:
+                        print("dd: increase c1 to %d deltas: %s" % (len(next_c1), self.pretty(next_c1)))
+ break
+
+
+ csub = self.__listminus(c, cs[i])
+ (t, csub) = self.test_and_resolve(csub, c1, c, self.ADD)
+ csub = self.__listunion(c1, csub)
+
+ if t == self.PASS and t2 == self.FAIL:
+ # Found
+ progress = 1
+ next_c1 = csub
+ next_n = 2
+ cbar_offset = 0
+
+ if self.debug_dd:
+ print("dd: increase c1 to %d deltas: %s" % (len(next_c1), self.pretty(next_c1)))
+ break
+
+ if t == self.FAIL and t1 == self.PASS:
+ # Increase
+ progress = 1
+ next_c2 = csub
+ next_n = max(next_n - 1, 2)
+ cbar_offset = i
+
+ if self.debug_dd:
+ print("dd: reduce c2 to %d deltas: %s" % (len(next_c2), self.pretty(next_c2)))
+ break
+
+ if progress:
+ self.report_progress(self.__listminus(next_c2, next_c1), "dd")
+ else:
+ if n >= len(c):
+ # No further minimizing
+ print("dd: done")
+ return c, c1, c2
+
+ next_n = min(len(c), n * 2)
+ print("dd: increase granularity to %d" % next_n)
+ cbar_offset = (cbar_offset * next_n) // n
+
+ c1 = next_c1
+ c2 = next_c2
+ n = next_n
+ run = run + 1
+
+ def dd(self, c):
+ return self.dddiff(c) # Backwards compatibility
+
+
+
+
+
+if __name__ == '__main__':
+ # Test the outcome cache
+ oc_test()
+
+ # Define our own DD class, with its own test method
+ class MyDD(DD):
+ def _test_a(self, c):
+ "Test the configuration C. Return PASS, FAIL, or UNRESOLVED."
+
+ # Just a sample
+ # if 2 in c and not 3 in c:
+ # return self.UNRESOLVED
+ # if 3 in c and not 7 in c:
+ # return self.UNRESOLVED
+ if 7 in c and not 2 in c:
+ return self.UNRESOLVED
+ if 5 in c and 8 in c:
+ return self.FAIL
+ return self.PASS
+
+ def _test_b(self, c):
+ if not c:
+ return self.PASS
+ if 1 in c and 2 in c and 3 in c and 4 in c and \
+ 5 in c and 6 in c and 7 in c and 8 in c:
+ return self.FAIL
+ return self.UNRESOLVED
+
+ def _test_c(self, c):
+ if 1 in c and 2 in c and 3 in c and 4 in c and \
+ 6 in c and 8 in c:
+ if 5 in c and 7 in c:
+ return self.UNRESOLVED
+ else:
+ return self.FAIL
+ if 1 in c or 2 in c or 3 in c or 4 in c or \
+ 6 in c or 8 in c:
+ return self.UNRESOLVED
+ return self.PASS
+
+ def __init__(self):
+ self._test = self._test_c
+ DD.__init__(self)
+
+
+ print("WYNOT - a tool for delta debugging.")
+ mydd = MyDD()
+ # mydd.debug_test = 1 # Enable debugging output
+ # mydd.debug_dd = 1 # Enable debugging output
+ # mydd.debug_split = 1 # Enable debugging output
+ # mydd.debug_resolve = 1 # Enable debugging output
+
+ # mydd.cache_outcomes = 0
+ # mydd.monotony = 0
+
+ print("Minimizing failure-inducing input...")
+ c = mydd.ddmin([1, 2, 3, 4, 5, 6, 7, 8]) # Invoke DDMIN
+ print("The 1-minimal failure-inducing input is %s" % (c,))
+ print("Removing any element will make the failure go away.")
+ print('')
+
+ print("Computing the failure-inducing difference...")
+ (c, c1, c2) = mydd.dd([1, 2, 3, 4, 5, 6, 7, 8]) # Invoke DD
+ print("The 1-minimal failure-inducing difference is %s" % (c,))
+ print("%s passes, %s fails" % (c1, c2))
+
+
+
+# Local Variables:
+# mode: python
+# End:
diff --git a/IDEAS.txt b/IDEAS.txt
new file mode 100644
index 0000000..f93a0f3
--- /dev/null
+++ b/IDEAS.txt
@@ -0,0 +1,41 @@
+Things to try out when life permits
+===================================
+
+* zlib-based parsing/serialising of compressed in-memory data
+
+ * requires a libxml2 I/O OutputBuffer with appropriate I/O functions
+ that call into the zlib compression routines
+
+* lzma-based parsing/serialising of compressed in-memory data
+
+ * requires a libxml2 I/O OutputBuffer with appropriate I/O functions
+ that call into the lzma compression routines
+
+ * advantage over zlib: probably faster and better compression
+
+ * maybe embed the lzma C sources in the distro
+ http://www.7-zip.org/sdk.html
+
+* parse-time validation against a user provided DTD
+
+ * currently only works for XML Schema
+
+* support subclassing XSLTAccessControl to provide custom per-URL
+ access check methods
+
+ * maybe custom resolvers are enough, or can be combined with this?
+
+* reimplement iterparse() using the libxml2 xmlReader API
+
+ * Advantage: the implementation can be made safer than the current
+ SAX implementation, as the parser would not interact with the
+ Python-level tree.
+
+ * Disadvantage: the tree has to be built manually. In the current
+ SAX based implementation, libxml2 does it for us.
+
+* provide an HTMLParser wrapper that handles broken encodings in broken
+ HTML better, e.g. using BeautifulSoup's "unicode dammit" analyser
+
+* expose namespace prefixes through the QName class
+
diff --git a/INSTALL.txt b/INSTALL.txt
new file mode 100644
index 0000000..94d6a3e
--- /dev/null
+++ b/INSTALL.txt
@@ -0,0 +1,219 @@
+Installing lxml
+===============
+
+.. contents:: :depth: 1
+..
+ 1 Where to get it
+ 2 Requirements
+ 3 Installation
+ 4 Building lxml from dev sources
+ 5 Using lxml with python-libxml2
+ 6 Source builds on MS Windows
+ 7 Source builds on MacOS-X
+
+
+Where to get it
+---------------
+
+lxml is generally distributed through PyPI_.
+
+.. _PyPI: http://pypi.python.org/pypi/lxml
+
+Most **Linux** platforms come with some version of lxml readily
+packaged, usually named ``python-lxml`` for the Python 2.x version
+and ``python3-lxml`` for Python 3.x. If you can use that version,
+the quickest way to install lxml is to use the system package
+manager, e.g. ``apt-get`` on Debian/Ubuntu::
+
+ sudo apt-get install python3-lxml
+
+For **MacOS-X**, a `macport <http://macports.org/>`_ of lxml is available.
+Try something like
+
+::
+
+ sudo port install py27-lxml
+
+To install a newer version or to install lxml on other systems,
+see below.
+
+
+Requirements
+------------
+
+You need Python 2.7 or 3.4+.
+
+Unless you are using a static binary distribution (e.g. from a
+Windows binary installer), lxml requires libxml2 and libxslt to
+be installed, in particular:
+
+* `libxml2 <http://xmlsoft.org/>`_ version 2.9.2 or later.
+
+* `libxslt <http://xmlsoft.org/XSLT/>`_ version 1.1.27 or later.
+
+ * We recommend libxslt 1.1.28 or later.
+
+Newer versions generally contain fewer bugs and are therefore
+recommended. XML Schema support in libxml2 is also still being worked
+on, so newer versions will give you better compliance with the W3C spec.
+
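+If you are unsure which library versions an installed lxml copy
+actually uses at runtime, you can query the version constants of
+``lxml.etree`` from a Python prompt (the numbers shown here are
+only an example)::
+
+ >>> from lxml import etree
+ >>> etree.LXML_VERSION # lxml itself
+ (3, 4, 2, 0)
+ >>> etree.LIBXML_VERSION # libxml2 found at runtime
+ (2, 9, 2)
+ >>> etree.LIBXSLT_VERSION # libxslt found at runtime
+ (1, 1, 28)
+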
+To install the required development packages of these dependencies
+on Linux systems, use your distribution specific installation tool,
+e.g. apt-get on Debian/Ubuntu::
+
+ sudo apt-get install libxml2-dev libxslt-dev python-dev
+
+For Debian based systems, it should be enough to install the known
+build dependencies of the provided lxml package, e.g.
+
+::
+
+ sudo apt-get build-dep python3-lxml
+
+
+Installation
+------------
+
+If your system does not provide binary packages or you want to install
+a newer version, the best way is to get the pip_ package management tool
+(or use a `virtualenv <https://pypi.python.org/pypi/virtualenv>`_) and
+run the following::
+
+ pip install lxml
+
+If you are not using pip in a virtualenv and want to install lxml globally
+instead, you have to run the above command as admin, e.g. on Linux::
+
+ sudo pip install lxml
+
+To install a specific version, either download the distribution
+manually and let pip install that, or pass the desired version
+to pip::
+
+ pip install lxml==3.4.2
+
+.. _pip: http://pypi.python.org/pypi/pip
+
+To speed up the build in test environments, e.g. on a continuous
+integration server, disable the C compiler optimisations by setting
+the ``CFLAGS`` environment variable::
+
+ CFLAGS="-O0" pip install lxml
+
+(The option reads "minus Oh Zero", i.e. zero optimisations.)
+
+MS Windows
+..........
+
+For MS Windows, recent lxml releases feature community donated
+binary distributions, although you might still want to take a look
+at the related `FAQ entry <FAQ.html#where-are-the-binary-builds>`_.
+If you fail to build lxml on your MS Windows system from the signed
+and tested sources that we release, consider using the binary builds
+from PyPI or the `unofficial Windows binaries
+<http://www.lfd.uci.edu/~gohlke/pythonlibs/#lxml>`_
+that Christoph Gohlke generously provides.
+
+Linux
+.....
+
+On Linux (and most other well-behaved operating systems), ``pip`` will
+manage to build the source distribution as long as libxml2 and libxslt
+are properly installed, including development packages, i.e. header files,
+etc. See the requirements section above and use your system package
+management tool to look for packages like ``libxml2-dev`` or
+``libxslt-devel``. If the build fails, make sure they are installed.
+
+Alternatively, setting ``STATIC_DEPS=true`` will download and build
+both libraries automatically in their latest version, e.g.
+``STATIC_DEPS=true pip install lxml``.
+
+MacOS-X
+.......
+
+On MacOS-X, use the following to build the source distribution,
+and make sure you have a working Internet connection, as this will
+download libxml2 and libxslt in order to build them::
+
+ STATIC_DEPS=true sudo pip install lxml
+
+
+Building lxml from dev sources
+------------------------------
+
+If you want to build lxml from the GitHub repository, you should read
+`how to build lxml from source`_ (or the file ``doc/build.txt`` in the
+source tree). Building from developer sources or from modified
+distribution sources requires Cython_ to translate the lxml sources
+into C code. The source distribution ships with pre-generated C
+source files, so you do not need Cython installed to build from
+release sources.
+
+.. _Cython: http://www.cython.org
+.. _`how to build lxml from source`: build.html
+
+If you have read these instructions and still cannot manage to install lxml,
+you can check the archives of the `mailing list`_ to see if your problem is
+known or otherwise send a mail to the list.
+
+.. _`mailing list`: http://lxml.de/mailinglist/
+
+
+Using lxml with python-libxml2
+------------------------------
+
+If you want to use lxml together with the official libxml2 Python
+bindings (maybe because one of your dependencies uses it), you must
+build lxml statically. Otherwise, the two packages will interfere in
+places where the libxml2 library requires global configuration, which
+can have any kind of effect from disappearing functionality to crashes
+in either of the two.
+
+To get a static build, either pass the ``--static-deps`` option to the
+setup.py script, or run ``pip`` with the ``STATIC_DEPS`` or
+``STATICBUILD`` environment variable set to true, i.e.
+
+::
+
+ STATIC_DEPS=true pip install lxml
+
+The ``STATICBUILD`` environment variable is handled equivalently to
+the ``STATIC_DEPS`` variable, but is used by some other extension
+packages, too.
+
+
+Source builds on MS Windows
+---------------------------
+
+Most MS Windows systems lack the necessary tools to build software,
+starting with a C compiler already. Microsoft leaves it to users to
+install and configure them, which is usually not trivial and means
+that distributors cannot rely on these dependencies being available
+on a given system. In a way, you get what you've paid for and make
+others pay for it.
+
+Due to the additional lack of package management of this platform,
+it is best to link the library dependencies statically if you decide
+to build from sources, rather than using a binary installer. For
+that, lxml can use the `binary distribution of libxml2 and libxslt
+<http://www.zlatkovic.com/libxml.en.html>`_, which it downloads
+automatically during the static build. It needs both libxml2 and
+libxslt, as well as iconv and zlib, which are available from the
+same download site. Further build instructions are in the
+`source build documentation <build.html>`_.
+
+
+Source builds on MacOS-X
+------------------------
+
+If you are not using macports or want to use a more recent lxml
+release, you have to build it yourself. The pre-installed system
+libraries of libxml2 and libxslt are less outdated in recent MacOS-X
+versions than they used to be, so lxml should work with them out of
+the box. Still, it is recommended to use a static build with the most
+recent library versions.
+
+Luckily, lxml's ``setup.py`` script has built-in support for building
+and integrating these libraries statically during the build. Please
+read the
+`MacOS-X build instructions <build.html#building-lxml-on-macos-x>`_.
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000..a76d0ed
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,29 @@
+Copyright (c) 2004 Infrae. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ 3. Neither the name of Infrae nor the names of its contributors may
+ be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INFRAE OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/LICENSES.txt b/LICENSES.txt
new file mode 100644
index 0000000..9f97c18
--- /dev/null
+++ b/LICENSES.txt
@@ -0,0 +1,29 @@
+lxml is copyright Infrae and distributed under the BSD license (see
+doc/licenses/BSD.txt), with the following exceptions:
+
+Some code, such as selftest.py, selftest2.py and
+src/lxml/_elementpath.py, is derived from ElementTree and
+cElementTree. See doc/licenses/elementtree.txt for the license text.
+
+lxml.cssselect and lxml.html are copyright Ian Bicking and distributed
+under the BSD license (see doc/licenses/BSD.txt).
+
+test.py, the test-runner script, is GPL and copyright Shuttleworth
+Foundation. See doc/licenses/GPL.txt. It is believed the unchanged
+inclusion of test.py to run the unit test suite falls under the
+"aggregation" clause of the GPL and thus does not affect the license
+of the rest of the package.
+
+The isoschematron implementation uses several XSL and RelaxNG resources:
+ * The (XML syntax) RelaxNG schema for schematron, copyright International
+ Organization for Standardization (see
+ src/lxml/isoschematron/resources/rng/iso-schematron.rng for the license
+ text)
+ * The skeleton iso-schematron-xslt1 pure-xslt schematron implementation
+ xsl stylesheets, copyright Rick Jelliffe and Academia Sinica Computing
+ Center, Taiwan (see the xsl files here for the license text:
+ src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/)
+ * The xsd/rng schema schematron extraction xsl transformations are unlicensed
+ and copyright the respective authors as noted (see
+ src/lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl and
+ src/lxml/isoschematron/resources/xsl/XSD2Schtrn.xsl)
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..f05c257
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,19 @@
+exclude *.py
+include setup.py setupinfo.py versioninfo.py buildlibxml.py
+include test.py
+include update-error-constants.py
+include MANIFEST.in Makefile requirements.txt
+include CHANGES.txt CREDITS.txt INSTALL.txt LICENSES.txt README.rst TODO.txt
+include tools/*.py tools/manylinux/*.sh
+include src/lxml/*.c src/lxml/html/*.c
+include doc/html/*.png
+recursive-include src *.pyx *.pxd *.pxi *.py
+recursive-include src/lxml lxml.etree.h lxml.etree_api.h etree.h etree_api.h etree_defs.h lxml_endian.h
+recursive-include src/lxml/isoschematron *.rng *.xsl *.txt
+recursive-include src/lxml/tests *.rng *.rnc *.xslt *.xml *.dtd *.xsd *.sch *.html *.txt
+recursive-include src/lxml/html/tests *.data *.txt
+recursive-include samples *.xml
+recursive-include benchmark *.py
+recursive-include doc *.py *.txt *.html *.css *.xml *.mgp pubkey.asc Makefile
+recursive-include doc/s5/ui *.gif *.htc *.png *.js
+recursive-include doc/s5/ep2008 *.py *.png *.rng
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..a8c9de8
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,178 @@
+PYTHON?=python
+PYTHON3?=python3
+TESTFLAGS=-p -v
+TESTOPTS=
+SETUPFLAGS=
+LXMLVERSION:=$(shell sed -ne '/__version__/s|.*__version__\s*=\s*"\([^"]*\)".*|\1|p' src/lxml/__init__.py)
+
+PARALLEL?=$(shell $(PYTHON) -c 'import sys; print("-j7" if sys.version_info >= (3, 5) else "")' )
+PARALLEL3?=$(shell $(PYTHON3) -c 'import sys; print("-j7" if sys.version_info >= (3, 5) else "")' )
+PYTHON_WITH_CYTHON?=$(shell $(PYTHON) -c 'import Cython.Build.Dependencies' >/dev/null 2>/dev/null && echo " --with-cython" || true)
+PY3_WITH_CYTHON?=$(shell $(PYTHON3) -c 'import Cython.Build.Dependencies' >/dev/null 2>/dev/null && echo " --with-cython" || true)
+CYTHON_WITH_COVERAGE?=$(shell $(PYTHON) -c 'import Cython.Coverage; import sys; assert not hasattr(sys, "pypy_version_info")' >/dev/null 2>/dev/null && echo " --coverage" || true)
+CYTHON3_WITH_COVERAGE?=$(shell $(PYTHON3) -c 'import Cython.Coverage; import sys; assert not hasattr(sys, "pypy_version_info")' >/dev/null 2>/dev/null && echo " --coverage" || true)
+
+MANYLINUX_LIBXML2_VERSION=2.9.10
+MANYLINUX_LIBXSLT_VERSION=1.1.34
+MANYLINUX_CFLAGS=-O3 -g1 -pipe -fPIC -flto
+MANYLINUX_LDFLAGS=-flto
+MANYLINUX_IMAGE_X86_64=quay.io/pypa/manylinux1_x86_64
+MANYLINUX_IMAGE_686=quay.io/pypa/manylinux1_i686
+MANYLINUX_IMAGE_AARCH64=quay.io/pypa/manylinux2014_aarch64
+
+AARCH64_ENV=-e AR="/opt/rh/devtoolset-9/root/usr/bin/gcc-ar" \
+ -e NM="/opt/rh/devtoolset-9/root/usr/bin/gcc-nm" \
+ -e RANLIB="/opt/rh/devtoolset-9/root/usr/bin/gcc-ranlib"
+
+.PHONY: all inplace inplace3 rebuild-sdist sdist build require-cython wheel_manylinux wheel
+
+all: inplace
+
+# Build in-place
+inplace:
+ $(PYTHON) setup.py $(SETUPFLAGS) build_ext -i $(PYTHON_WITH_CYTHON) --warnings $(subst --,--with-,$(CYTHON_WITH_COVERAGE)) $(PARALLEL)
+
+inplace3:
+ $(PYTHON3) setup.py $(SETUPFLAGS) build_ext -i $(PY3_WITH_CYTHON) --warnings $(subst --,--with-,$(CYTHON3_WITH_COVERAGE)) $(PARALLEL3)
+
+rebuild-sdist: require-cython
+ rm -f dist/lxml-$(LXMLVERSION).tar.gz
+ find src -name '*.c' -exec rm -f {} \;
+ $(MAKE) dist/lxml-$(LXMLVERSION).tar.gz
+
+dist/lxml-$(LXMLVERSION).tar.gz:
+ $(PYTHON) setup.py $(SETUPFLAGS) sdist $(PYTHON_WITH_CYTHON)
+
+sdist: dist/lxml-$(LXMLVERSION).tar.gz
+
+build:
+ $(PYTHON) setup.py $(SETUPFLAGS) build $(PYTHON_WITH_CYTHON)
+
+require-cython:
+ @[ -n "$(PYTHON_WITH_CYTHON)" ] || { \
+ echo "NOTE: missing Cython - please use this command to install it: $(PYTHON) -m pip install Cython"; false; }
+
+qemu-user-static:
+ docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
+
+wheel_manylinux: wheel_manylinux64 wheel_manylinux32 wheel_manylinuxaarch64
+wheel_manylinuxaarch64: qemu-user-static
+
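+# Build manylinux wheels inside the official PyPA docker images. The target
+# name selects the architecture; the libxml2/libxslt versions and compiler
+# flags defined above are passed into the container environment.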
+wheel_manylinux32 wheel_manylinux64 wheel_manylinuxaarch64: dist/lxml-$(LXMLVERSION).tar.gz
+ time docker run --rm -t \
+ -v $(shell pwd):/io \
+ $(if $(patsubst %aarch64,,$@),,$(AARCH64_ENV)) \
+ -e CFLAGS="$(MANYLINUX_CFLAGS) $(if $(patsubst %aarch64,,$@),-march=core2,-march=armv8-a -mtune=cortex-a72)" \
+ -e LDFLAGS="$(MANYLINUX_LDFLAGS)" \
+ -e LIBXML2_VERSION="$(MANYLINUX_LIBXML2_VERSION)" \
+ -e LIBXSLT_VERSION="$(MANYLINUX_LIBXSLT_VERSION)" \
+ -e WHEELHOUSE=wheelhouse_$(subst wheel_,,$@) \
+ $(if $(filter $@,wheel_manylinuxaarch64),$(MANYLINUX_IMAGE_AARCH64),$(if $(patsubst %32,,$@),$(MANYLINUX_IMAGE_X86_64),$(MANYLINUX_IMAGE_686))) \
+ bash /io/tools/manylinux/build-wheels.sh /io/$<
+
+wheel:
+ $(PYTHON) setup.py $(SETUPFLAGS) bdist_wheel $(PYTHON_WITH_CYTHON)
+
+wheel_static:
+ $(PYTHON) setup.py $(SETUPFLAGS) bdist_wheel $(PYTHON_WITH_CYTHON) --static-deps
+
+test_build: build
+ $(PYTHON) test.py $(TESTFLAGS) $(TESTOPTS)
+
+test_inplace: inplace
+ $(PYTHON) test.py $(TESTFLAGS) $(TESTOPTS) $(CYTHON_WITH_COVERAGE)
+
+test_inplace3: inplace3
+ $(PYTHON3) test.py $(TESTFLAGS) $(TESTOPTS) $(CYTHON3_WITH_COVERAGE)
+
+valgrind_test_inplace: inplace
+ valgrind --tool=memcheck --leak-check=full --num-callers=30 --suppressions=valgrind-python.supp \
+ $(PYTHON) test.py
+
+gdb_test_inplace: inplace
+ @echo "file $(PYTHON)\nrun test.py" > .gdb.command
+ gdb -x .gdb.command -d src -d src/lxml
+
+bench_inplace: inplace
+ $(PYTHON) benchmark/bench_etree.py -i
+ $(PYTHON) benchmark/bench_xpath.py -i
+ $(PYTHON) benchmark/bench_xslt.py -i
+ $(PYTHON) benchmark/bench_objectify.py -i
+
+ftest_build: build
+ $(PYTHON) test.py -f $(TESTFLAGS) $(TESTOPTS)
+
+ftest_inplace: inplace
+ $(PYTHON) test.py -f $(TESTFLAGS) $(TESTOPTS)
+
+apidoc: apidocclean
+ @[ -x "`which sphinx-apidoc`" ] \
+ && (echo "Generating API docs ..." && \
+ PYTHONPATH=src:$(PYTHONPATH) sphinx-apidoc -e -P -T -o doc/api src/lxml \
+ "*includes" "*tests" "*pyclasslookup.py" "*usedoctest.py" "*html/_html5builder.py" \
+ "*.so" "*.pyd") \
+ || (echo "not generating Sphinx autodoc API rst files")
+
+apihtml: apidoc inplace3
+ @[ -x "`which sphinx-build`" ] \
+ && (echo "Generating API docs ..." && \
+ make -C doc/api html) \
+ || (echo "not generating Sphinx autodoc API documentation")
+
+website: inplace3 docclean
+ PYTHONPATH=src:$(PYTHONPATH) $(PYTHON3) doc/mkhtml.py doc/html . ${LXMLVERSION}
+
+html: apihtml website s5
+
+s5:
+ $(MAKE) -C doc/s5 slides
+
+apipdf: apidoc inplace3
+ rm -fr doc/api/_build
+ @[ -x "`which sphinx-build`" ] \
+ && (echo "Generating API PDF docs ..." && \
+ make -C doc/api latexpdf) \
+ || (echo "not generating Sphinx autodoc API PDF documentation")
+
+pdf: apipdf pdfclean
+ $(PYTHON) doc/mklatex.py doc/pdf . ${LXMLVERSION}
+ (cd doc/pdf && pdflatex lxmldoc.tex \
+ && pdflatex lxmldoc.tex \
+ && pdflatex lxmldoc.tex)
+ @cp doc/pdf/lxmldoc.pdf doc/pdf/lxmldoc-${LXMLVERSION}.pdf
+ @echo "PDF available as doc/pdf/lxmldoc-${LXMLVERSION}.pdf"
+
+# Several pdflatex runs are needed to build the correct table of contents.
+
+test: test_inplace
+
+test3: test_inplace3
+
+valtest: valgrind_test_inplace
+
+gdbtest: gdb_test_inplace
+
+bench: bench_inplace
+
+ftest: ftest_inplace
+
+clean:
+ find . \( -name '*.o' -o -name '*.so' -o -name '*.py[cod]' -o -name '*.dll' \) -exec rm -f {} \;
+ rm -rf build
+
+docclean:
+ $(MAKE) -C doc/s5 clean
+ rm -f doc/html/*.html
+
+pdfclean:
+ rm -fr doc/pdf
+
+apidocclean:
+ rm -fr doc/html/api
+ rm -f doc/api/lxml*.rst
+ rm -fr doc/api/_build
+
+realclean: clean docclean apidocclean
+ find src -name '*.c' -exec rm -f {} \;
+ rm -f TAGS
+ $(PYTHON) setup.py clean -a --without-cython
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..3ad1ba1
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,95 @@
+What is lxml?
+=============
+
+lxml is the most feature-rich and easy-to-use library for processing XML and HTML in the Python language.
+It's also very fast and memory friendly, just so you know.
+
+For an introduction and further documentation, see `doc/main.txt`_.
+
+For installation information, see `INSTALL.txt`_.
+
+For the issue tracker, see https://bugs.launchpad.net/lxml
+
+Support the project
+-------------------
+
+lxml has been downloaded from the `Python Package Index`_
+millions of times and is also available directly in many package
+distributions, e.g. for Linux or macOS.
+
+.. _`Python Package Index`: https://pypi.python.org/pypi/lxml
+
+Most people who use lxml do so because they like using it.
+You can show us that you like it by blogging about your experience
+with it and linking to the project website.
+
+If you are using lxml for your work and feel like giving a bit of
+your own benefit back to support the project, consider sending us
+money through GitHub Sponsors, Tidelift or PayPal that we can use
+to buy us free time for the maintenance of this great library, to
+fix bugs in the software, review and integrate code contributions,
+to improve its features and documentation, or to just take a deep
+breath and have a cup of tea every once in a while.
+Please read the Legal Notice below, at the bottom of this page.
+Thank you for your support.
+
+.. class:: center
+
+ Support lxml through `GitHub Sponsors <https://github.com/users/scoder/sponsorship>`_
+
+ via a `Tidelift subscription <https://tidelift.com/subscription/pkg/pypi-lxml>`_
+
+ or via PayPal:
+
+ |Donate|_
+
+.. _`Donate`: https://www.paypal.com/cgi-bin/webscr?cmd=_s-xclick&hosted_button_id=R56JE3VCPDA9N
+
+Please `contact Stefan Behnel <http://consulting.behnel.de/>`_
+for other ways to support the lxml project,
+as well as commercial consulting, customisations and trainings on lxml and
+fast Python XML processing.
+
+.. |Donate| image:: https://lxml.de/paypal_btn_donateCC_LG.png
+ :width: 160
+ :height: 47
+ :alt: Donate to the lxml project
+
+.. _`doc/main.txt`: https://github.com/lxml/lxml/blob/master/doc/main.txt
+.. _`INSTALL.txt`: http://lxml.de/installation.html
+
+`Travis-CI <https://travis-ci.org/>`_ and `AppVeyor <https://www.appveyor.com/>`_
+support the lxml project with their build and CI servers.
+Jetbrains supports the lxml project by donating free licenses of their
+`PyCharm IDE <https://www.jetbrains.com/pycharm/>`_.
+Another supporter of the lxml project is
+`COLOGNE Webdesign <https://www.colognewebdesign.de/>`_.
+
+
+Project income report
+---------------------
+
+* Total project income in 2019: EUR 717.52 (59.79 € / month)
+
+ - Tidelift: EUR 360.30
+ - Paypal: EUR 157.22
+ - other: EUR 200.00
+
+
+Legal Notice for Donations
+--------------------------
+
+Any donation that you make to the lxml project is voluntary and
+is not a fee for any services, goods, or advantages. By making
+a donation to the lxml project, you acknowledge that we have the
+right to use the money you donate in any lawful way and for any
+lawful purpose we see fit and we are not obligated to disclose
+the way and purpose to any party unless required by applicable
+law. Although lxml is free software, to the best of our knowledge
+the lxml project does not have any tax exempt status. The lxml
+project is neither a registered non-profit corporation nor a
+registered charity in any country. Your donation may or may not
+be tax-deductible; please consult your tax advisor in this matter.
+We will not publish or disclose your name and/or e-mail address
+without your consent, unless required by applicable law. Your
+donation is non-refundable.
diff --git a/TODO.txt b/TODO.txt
new file mode 100644
index 0000000..d51ef69
--- /dev/null
+++ b/TODO.txt
@@ -0,0 +1,58 @@
+===============
+ToDo's for lxml
+===============
+
+lxml
+====
+
+In general
+----------
+
+* more testing on multi-threading
+
+* better exception messages for XPath and schemas based on error log,
+ e.g. missing namespace mappings in XPath
+
+* when building statically, compile everything into one shared library
+ instead of one for lxml.etree and one for lxml.objectify to prevent
+ the redundant static linking of the library dependencies.
+
+* more testing on input/output of encoded filenames, including custom
+ resolvers, relative XSLT imports, ...
+
+* always use '<string>' as URL when tree was parsed from string? (can libxml2
+ handle this?)
+
+* follow PEP 8 in API naming (avoidCamelCase in_favour_of_underscores)
+
+* use per-call or per-thread error logs in XSLT/XPath/etc. to keep the
+ messages separate, especially in exceptions
+
+* add 'nsmap' parameter to cleanup_namespaces()
+
+* fix tail text handling in addnext()/addprevious()
+
+* make Element nsmap editable to allow defining new namespaces (LP#555602)
+
+
+Entities
+--------
+
+* clean support for entities (is the Entity element class enough?)
+
+
+Objectify
+---------
+
+* emulate setting special __attributes__ on ObjectifiedElement's as Python
+ attributes, not XML children
+
+
+Incremental parsing
+-------------------
+
+* create all iterparse events only on start events and
+ store the end events in the stack
+
+* rewrite SAX event creation in a more C-ish way to avoid having to
+ acquire the GIL on each event
diff --git a/appveyor.yml b/appveyor.yml
new file mode 100644
index 0000000..b8d7a72
--- /dev/null
+++ b/appveyor.yml
@@ -0,0 +1,44 @@
+version: 1.0.{build}
+
+environment:
+ matrix:
+ - python: 39
+ - python: 39-x64
+ - python: 27
+ - python: 27-x64
+ - python: 38
+ - python: 38-x64
+ - python: 37
+ - python: 37-x64
+ - python: 36
+ - python: 36-x64
+ - python: 35
+ - python: 35-x64
+ - python: 39
+ arch: arm64
+ env: STATIC_DEPS=true
+ - python: 38
+ arch: arm64
+ env: STATIC_DEPS=true
+
+install:
+ - SET PATH=C:\\Python%PYTHON%;c:\\Python%PYTHON%\\scripts;%PATH%
+ - ps: |
+ $env:PYTHON = "C:\\Python$($env:PYTHON)"
+ if (-not (Test-Path $env:PYTHON)) {
+ curl -o install_python.ps1 https://raw.githubusercontent.com/matthew-brett/multibuild/11a389d78892cf90addac8f69433d5e22bfa422a/install_python.ps1
+ .\\install_python.ps1
+ }
+ # remove the above when appveyor has proper Python 3.8 support
+ - python -m pip.__main__ install -U pip wheel setuptools
+ - pip install -r requirements.txt
+
+build: off
+build_script:
+ - python -u setup.py bdist_wheel --static-deps
+ - python -u setup.py build_ext --inplace --static-deps
+ - python -u test.py -vv -p
+
+test: off
+test_script:
+ - ps: Get-ChildItem dist\*.whl | % { Push-AppveyorArtifact $_.FullName -FileName $_.Name }
diff --git a/benchmark/bench_etree.py b/benchmark/bench_etree.py
new file mode 100644
index 0000000..0f66db8
--- /dev/null
+++ b/benchmark/bench_etree.py
@@ -0,0 +1,452 @@
+import copy
+from itertools import *
+
+import benchbase
+from benchbase import (with_attributes, with_text, onlylib,
+ serialized, children, nochange, BytesIO)
+
+TEXT = "some ASCII text"
+UTEXT = u"some klingon: \F8D2"
+
+############################################################
+# Benchmarks
+############################################################
+
+class BenchMark(benchbase.TreeBenchMark):
+ @nochange
+ def bench_iter_children(self, root):
+ for child in root:
+ pass
+
+ @nochange
+ def bench_iter_children_reversed(self, root):
+ for child in reversed(root):
+ pass
+
+ @nochange
+ def bench_first_child(self, root):
+ for i in self.repeat1000:
+ child = root[0]
+
+ @nochange
+ def bench_last_child(self, root):
+ for i in self.repeat1000:
+ child = root[-1]
+
+ @nochange
+ def bench_middle_child(self, root):
+ pos = len(root) // 2
+ for i in self.repeat1000:
+ child = root[pos]
+
+ @nochange
+ @with_attributes(False)
+ @with_text(text=True)
+ def bench_tostring_text_ascii(self, root):
+ self.etree.tostring(root, method="text")
+
+ @nochange
+ @with_attributes(False)
+ @with_text(text=True, utext=True)
+ def bench_tostring_text_unicode(self, root):
+ self.etree.tostring(root, method="text", encoding='unicode')
+
+ @nochange
+ @with_attributes(False)
+ @with_text(text=True, utext=True)
+ def bench_tostring_text_utf16(self, root):
+ self.etree.tostring(root, method="text", encoding='UTF-16')
+
+ @nochange
+ @with_attributes(False)
+ @with_text(text=True, utext=True)
+ @onlylib('lxe')
+ @children
+ def bench_tostring_text_utf8_with_tail(self, children):
+ for child in children:
+ self.etree.tostring(child, method="text",
+ encoding='UTF-8', with_tail=True)
+
+ @nochange
+ @with_attributes(True, False)
+ @with_text(text=True, utext=True)
+ def bench_tostring_utf8(self, root):
+ self.etree.tostring(root, encoding='UTF-8')
+
+ @nochange
+ @with_attributes(True, False)
+ @with_text(text=True, utext=True)
+ def bench_tostring_utf16(self, root):
+ self.etree.tostring(root, encoding='UTF-16')
+
+ @nochange
+ @with_attributes(True, False)
+ @with_text(text=True, utext=True)
+ def bench_tostring_utf8_unicode_XML(self, root):
+ xml = self.etree.tostring(root, encoding='UTF-8').decode('UTF-8')
+ self.etree.XML(xml)
+
+ @nochange
+ @with_attributes(True, False)
+ @with_text(text=True, utext=True)
+ def bench_write_utf8_parse_bytesIO(self, root):
+ f = BytesIO()
+ self.etree.ElementTree(root).write(f, encoding='UTF-8')
+ f.seek(0)
+ self.etree.parse(f)
+
+ @with_attributes(True, False)
+ @with_text(text=True, utext=True)
+ @serialized
+ def bench_parse_bytesIO(self, root_xml):
+ f = BytesIO(root_xml)
+ self.etree.parse(f)
+
+ @with_attributes(True, False)
+ @with_text(text=True, utext=True)
+ @serialized
+ def bench_XML(self, root_xml):
+ self.etree.XML(root_xml)
+
+ @with_attributes(True, False)
+ @with_text(text=True, utext=True)
+ @serialized
+ def bench_iterparse_bytesIO(self, root_xml):
+ f = BytesIO(root_xml)
+ for event, element in self.etree.iterparse(f):
+ pass
+
+ @with_attributes(True, False)
+ @with_text(text=True, utext=True)
+ @serialized
+ def bench_iterparse_bytesIO_clear(self, root_xml):
+ f = BytesIO(root_xml)
+ for event, element in self.etree.iterparse(f):
+ element.clear()
+
+ def bench_append_from_document(self, root1, root2):
+ # == "1,2 2,3 1,3 3,1 3,2 2,1" # trees 1 and 2, or 2 and 3, or ...
+ for el in root2:
+ root1.append(el)
+
+ def bench_insert_from_document(self, root1, root2):
+ pos = len(root1)//2
+ for el in root2:
+ root1.insert(pos, el)
+ pos = pos + 1
+
+ def bench_rotate_children(self, root):
+ # == "1 2 3" # runs on any single tree independently
+ for i in range(100):
+ el = root[0]
+ del root[0]
+ root.append(el)
+
+ def bench_reorder(self, root):
+ for i in range(1,len(root)//2):
+ el = root[0]
+ del root[0]
+ root[-i:-i] = [ el ]
+
+ def bench_reorder_slice(self, root):
+ for i in range(1,len(root)//2):
+ els = root[0:1]
+ del root[0]
+ root[-i:-i] = els
+
+ def bench_clear(self, root):
+ root.clear()
+
+ @nochange
+ @children
+ def bench_has_children(self, children):
+ for child in children:
+ if child and child and child and child and child:
+ pass
+
+ @nochange
+ @children
+ def bench_len(self, children):
+ for child in children:
+ list(map(len, repeat(child, 20))) # force evaluation: map() is lazy on Python 3
+
+ @children
+ def bench_create_subelements(self, children):
+ SubElement = self.etree.SubElement
+ for child in children:
+ SubElement(child, '{test}test')
+
+ def bench_append_elements(self, root):
+ Element = self.etree.Element
+ for child in root:
+ el = Element('{test}test')
+ child.append(el)
+
+ @nochange
+ @children
+ def bench_makeelement(self, children):
+ empty_attrib = {}
+ for child in children:
+ child.makeelement('{test}test', empty_attrib)
+
+ @nochange
+ @children
+ def bench_create_elements(self, children):
+ Element = self.etree.Element
+ for child in children:
+ Element('{test}test')
+
+ @children
+ def bench_replace_children_element(self, children):
+ Element = self.etree.Element
+ for child in children:
+ el = Element('{test}test')
+ child[:] = [el]
+
+ @children
+ def bench_replace_children(self, children):
+ els = [ self.etree.Element("newchild") ]
+ for child in children:
+ child[:] = els
+
+ def bench_remove_children(self, root):
+ for child in root:
+ root.remove(child)
+
+ def bench_remove_children_reversed(self, root):
+ for child in reversed(root):
+ root.remove(child)
+
+ @children
+ def bench_set_attributes(self, children):
+ for child in children:
+ child.set('a', 'bla')
+
+ @with_attributes(True)
+ @children
+ @nochange
+ def bench_get_attributes(self, children):
+ for child in children:
+ child.get('bla1')
+ child.get('{attr}test1')
+
+ @children
+ def bench_setget_attributes(self, children):
+ for child in children:
+ child.set('a', 'bla')
+ for child in children:
+ child.get('a')
+
+ @nochange
+ def bench_root_getchildren(self, root):
+ root.getchildren()
+
+ @nochange
+ def bench_root_list_children(self, root):
+ list(root)
+
+ @nochange
+ @children
+ def bench_getchildren(self, children):
+ for child in children:
+ child.getchildren()
+
+ @nochange
+ @children
+ def bench_get_children_slice(self, children):
+ for child in children:
+ child[:]
+
+ @nochange
+ @children
+ def bench_get_children_slice_2x(self, children):
+ for child in children:
+ child[:]
+ child[:]
+
+ @nochange
+ @children
+ @with_attributes(True, False)
+ @with_text(utext=True, text=True, no_text=True)
+ def bench_deepcopy(self, children):
+ for child in children:
+ copy.deepcopy(child)
+
+ @nochange
+ @with_attributes(True, False)
+ @with_text(utext=True, text=True, no_text=True)
+ def bench_deepcopy_all(self, root):
+ copy.deepcopy(root)
+
+ @nochange
+ @children
+ def bench_tag(self, children):
+ for child in children:
+ child.tag
+
+ @nochange
+ @children
+ def bench_tag_repeat(self, children):
+ for child in children:
+ for i in self.repeat100:
+ child.tag
+
+ @nochange
+ @with_text(utext=True, text=True, no_text=True)
+ @children
+ def bench_text(self, children):
+ for child in children:
+ child.text
+
+ @nochange
+ @with_text(utext=True, text=True, no_text=True)
+ @children
+ def bench_text_repeat(self, children):
+ for child in children:
+ for i in self.repeat500:
+ child.text
+
+ @children
+ def bench_set_text(self, children):
+ text = TEXT
+ for child in children:
+ child.text = text
+
+ @children
+ def bench_set_utext(self, children):
+ text = UTEXT
+ for child in children:
+ child.text = text
+
+ @nochange
+ @onlylib('lxe')
+ def bench_index(self, root):
+ for child in root:
+ root.index(child)
+
+ @nochange
+ @onlylib('lxe')
+ def bench_index_slice(self, root):
+ for child in root[5:100]:
+ root.index(child, 5, 100)
+
+ @nochange
+ @onlylib('lxe')
+ def bench_index_slice_neg(self, root):
+ for child in root[-100:-5]:
+ root.index(child, start=-100, stop=-5)
+
+ @nochange
+ def bench_iter_all(self, root):
+ list(root.iter())
+
+ @nochange
+ def bench_iter_one_at_a_time(self, root):
+ list(islice(root.iter(), 2**30, None))
+
+ @nochange
+ def bench_iter_islice(self, root):
+ list(islice(root.iter(), 10, 110))
+
+ @nochange
+ def bench_iter_tag(self, root):
+ list(islice(root.iter(self.SEARCH_TAG), 3, 10))
+
+ @nochange
+ def bench_iter_tag_all(self, root):
+ list(root.iter(self.SEARCH_TAG))
+
+ @nochange
+ def bench_iter_tag_one_at_a_time(self, root):
+ list(islice(root.iter(self.SEARCH_TAG), 2**30, None))
+
+ @nochange
+ def bench_iter_tag_none(self, root):
+ list(root.iter("{ThisShould}NeverExist"))
+
+ @nochange
+ def bench_iter_tag_text(self, root):
+ [ e.text for e in root.iter(self.SEARCH_TAG) ]
+
+ @nochange
+ def bench_findall(self, root):
+ root.findall(".//*")
+
+ @nochange
+ def bench_findall_child(self, root):
+ root.findall(".//*/" + self.SEARCH_TAG)
+
+ @nochange
+ def bench_findall_tag(self, root):
+ root.findall(".//" + self.SEARCH_TAG)
+
+ @nochange
+ def bench_findall_path(self, root):
+ root.findall(".//*[%s]/./%s/./*" % (self.SEARCH_TAG, self.SEARCH_TAG))
+
+ @nochange
+ @onlylib('lxe')
+ def bench_xpath_path(self, root):
+ ns, tag = self.SEARCH_TAG[1:].split('}')
+ root.xpath(".//*[p:%s]/./p:%s/./*" % (tag,tag),
+ namespaces = {'p':ns})
+
+ @nochange
+ def bench_iterfind(self, root):
+ list(root.iterfind(".//*"))
+
+ @nochange
+ def bench_iterfind_tag(self, root):
+ list(root.iterfind(".//" + self.SEARCH_TAG))
+
+ @nochange
+ def bench_iterfind_islice(self, root):
+ list(islice(root.iterfind(".//*"), 10, 110))
+
+ _bench_xpath_single_xpath = None
+
+ @nochange
+ @onlylib('lxe')
+ def bench_xpath_single(self, root):
+ xpath = self._bench_xpath_single_xpath
+ if xpath is None:
+ ns, tag = self.SEARCH_TAG[1:].split('}')
+ xpath = self._bench_xpath_single_xpath = self.etree.XPath(
+ './/p:%s[1]' % tag, namespaces={'p': ns})
+ xpath(root)
+
+ @nochange
+ def bench_find_single(self, root):
+ root.find(".//%s" % self.SEARCH_TAG)
+
+ @nochange
+ def bench_iter_single(self, root):
+ next(root.iter(self.SEARCH_TAG))
+
+ _bench_xpath_two_xpath = None
+
+ @nochange
+ @onlylib('lxe')
+ def bench_xpath_two(self, root):
+ xpath = self._bench_xpath_two_xpath
+ if xpath is None:
+ ns, tag = self.SEARCH_TAG[1:].split('}')
+ xpath = self._bench_xpath_two_xpath = self.etree.XPath(
+ './/p:%s[position() < 3]' % tag, namespaces={'p': ns})
+ xpath(root)
+
+ @nochange
+ def bench_iterfind_two(self, root):
+ it = root.iterfind(".//%s" % self.SEARCH_TAG)
+ next(it)
+ next(it)
+
+ @nochange
+ def bench_iter_two(self, root):
+ it = root.iter(self.SEARCH_TAG)
+ next(it)
+ next(it)
+
+
+if __name__ == '__main__':
+ benchbase.main(BenchMark)
diff --git a/benchmark/bench_objectify.py b/benchmark/bench_objectify.py
new file mode 100644
index 0000000..9b71267
--- /dev/null
+++ b/benchmark/bench_objectify.py
@@ -0,0 +1,122 @@
+from itertools import *
+
+import benchbase
+from benchbase import (with_text, children, nochange)
+
+############################################################
+# Benchmarks
+############################################################
+
+class BenchMark(benchbase.TreeBenchMark):
+ repeat100 = range(100)
+ repeat1000 = range(1000)
+ repeat3000 = range(3000)
+
+ def __init__(self, lib):
+ from lxml import etree, objectify
+ self.objectify = objectify
+ parser = etree.XMLParser(remove_blank_text=True)
+ lookup = objectify.ObjectifyElementClassLookup()
+ parser.setElementClassLookup(lookup)
+ super(BenchMark, self).__init__(etree, parser)
+
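+ # The "1 2 4" docstrings below are read by benchbase.benchmarks() and
+ # select the tree setups (_setup_tree1/2/4) that each benchmark runs on.
+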
+ @nochange
+ def bench_attribute(self, root):
+ "1 2 4"
+ for i in self.repeat3000:
+ root.zzzzz
+
+ def bench_attribute_assign_int(self, root):
+ "1 2 4"
+ for i in self.repeat3000:
+ root.XYZ = 5
+
+ def bench_attribute_assign_string(self, root):
+ "1 2 4"
+ for i in self.repeat3000:
+ root.XYZ = "5"
+
+ @nochange
+ def bench_attribute_cached(self, root):
+ "1 2 4"
+ cache = root.zzzzz
+ for i in self.repeat3000:
+ root.zzzzz
+
+ @nochange
+ def bench_attributes_deep(self, root):
+ "1 2 4"
+ for i in self.repeat3000:
+ root.zzzzz['{cdefg}a00001']
+
+ @nochange
+ def bench_attributes_deep_cached(self, root):
+ "1 2 4"
+ cache1 = root.zzzzz
+ cache2 = cache1['{cdefg}a00001']
+ for i in self.repeat3000:
+ root.zzzzz['{cdefg}a00001']
+
+ @nochange
+ def bench_objectpath(self, root):
+ "1 2 4"
+ path = self.objectify.ObjectPath(".zzzzz")
+ for i in self.repeat3000:
+ path(root)
+
+ @nochange
+ def bench_objectpath_deep(self, root):
+ "1 2 4"
+ path = self.objectify.ObjectPath(".zzzzz.{cdefg}a00001")
+ for i in self.repeat3000:
+ path(root)
+
+ @nochange
+ def bench_objectpath_deep_cached(self, root):
+ "1 2 4"
+ cache1 = root.zzzzz
+ cache2 = cache1['{cdefg}a00001']
+ path = self.objectify.ObjectPath(".zzzzz.{cdefg}a00001")
+ for i in self.repeat3000:
+ path(root)
+
+ @with_text(text=True, utext=True, no_text=True)
+ def bench_annotate(self, root):
+ self.objectify.annotate(root)
+
+ @nochange
+ def bench_descendantpaths(self, root):
+ root.descendantpaths()
+
+ @nochange
+ @with_text(text=True)
+ def bench_type_inference(self, root):
+ "1 2 4"
+ el = root.aaaaa
+ for i in self.repeat1000:
+ el.getchildren()
+
+ @nochange
+ @with_text(text=True)
+ def bench_type_inference_annotated(self, root):
+ "1 2 4"
+ el = root.aaaaa
+ self.objectify.annotate(el)
+ for i in self.repeat1000:
+ el.getchildren()
+
+ @nochange
+ @children
+ def bench_elementmaker(self, children):
+ E = self.objectify.E
+ for child in children:
+ root = E.this(
+ "test",
+ E.will(
+ E.do("nothing"),
+ E.special,
+ )
+ )
+
+if __name__ == '__main__':
+ benchbase.main(BenchMark)
diff --git a/benchmark/bench_xpath.py b/benchmark/bench_xpath.py
new file mode 100644
index 0000000..59cdc78
--- /dev/null
+++ b/benchmark/bench_xpath.py
@@ -0,0 +1,93 @@
+from itertools import *
+
+import benchbase
+from benchbase import onlylib, children, nochange
+
+############################################################
+# Benchmarks
+############################################################
+
+class XPathBenchMark(benchbase.TreeBenchMark):
+ @nochange
+ @onlylib('lxe')
+ @children
+ def bench_xpath_class(self, children):
+ xpath = self.etree.XPath("./*[1]")
+ for child in children:
+ xpath(child)
+
+ @nochange
+ @onlylib('lxe')
+ @children
+ def bench_xpath_class_repeat(self, children):
+ for child in children:
+ xpath = self.etree.XPath("./*[1]")
+ xpath(child)
+
+ @nochange
+ @onlylib('lxe')
+ def bench_xpath_element(self, root):
+ xpath = self.etree.XPathElementEvaluator(root)
+ for child in root:
+ xpath.evaluate("./*[1]")
+
+ @nochange
+ @onlylib('lxe')
+ @children
+ def bench_xpath_method(self, children):
+ for child in children:
+ child.xpath("./*[1]")
+
+ @nochange
+ @onlylib('lxe')
+ @children
+ def bench_multiple_xpath_or(self, children):
+ xpath = self.etree.XPath(".//p:a00001|.//p:b00001|.//p:c00001",
+ namespaces={'p':'cdefg'})
+ for child in children:
+ xpath(child)
+
+ @nochange
+ @onlylib('lxe')
+ @children
+ def bench_multiple_iter_tag(self, children):
+ for child in children:
+ list(child.iter("{cdefg}a00001"))
+ list(child.iter("{cdefg}b00001"))
+ list(child.iter("{cdefg}c00001"))
+
+ @nochange
+ @onlylib('lxe')
+ @children
+ def bench_xpath_old_extensions(self, children):
+ def return_child(_, elements):
+ if elements:
+ return elements[0][0]
+ else:
+ return ()
+ extensions = {("test", "child") : return_child}
+ xpath = self.etree.XPath("t:child(.)", namespaces={"t":"test"},
+ extensions=extensions)
+ for child in children:
+ xpath(child)
+
+ @nochange
+ @onlylib('lxe')
+ @children
+ def bench_xpath_extensions(self, children):
+ def return_child(_, elements):
+ if elements:
+ return elements[0][0]
+ else:
+ return ()
+ self.etree.FunctionNamespace("testns")["t"] = return_child
+
+ try:
+ xpath = self.etree.XPath("test:t(.)", namespaces={"test":"testns"})
+ for child in children:
+ xpath(child)
+ finally:
+ del self.etree.FunctionNamespace("testns")["t"]
+
+if __name__ == '__main__':
+ benchbase.main(XPathBenchMark)
diff --git a/benchmark/bench_xslt.py b/benchmark/bench_xslt.py
new file mode 100644
index 0000000..abfdb7c
--- /dev/null
+++ b/benchmark/bench_xslt.py
@@ -0,0 +1,56 @@
+from itertools import *
+
+import benchbase
+from benchbase import onlylib
+
+############################################################
+# Benchmarks
+############################################################
+
+class XSLTBenchMark(benchbase.TreeBenchMark):
+ @onlylib('lxe')
+ def bench_xslt_extensions_old(self, root):
+ tree = self.etree.XML("""\
+<xsl:stylesheet version="1.0"
+ xmlns:l="test"
+ xmlns:testns="testns"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <l:data>TEST</l:data>
+ <xsl:template match="/">
+ <l:result>
+ <xsl:for-each select="*/*">
+ <xsl:copy-of select="testns:child(.)"/>
+ </xsl:for-each>
+ </l:result>
+ </xsl:template>
+</xsl:stylesheet>
+""")
+ def return_child(_, elements):
+ return elements[0][0]
+
+ extensions = {('testns', 'child') : return_child}
+
+ transform = self.etree.XSLT(tree, extensions)
+ for i in range(10):
+ transform(root)
+
+ @onlylib('lxe')
+ def bench_xslt_document(self, root):
+ transform = self.etree.XSLT(self.etree.XML("""\
+<xsl:stylesheet version="1.0"
+ xmlns:l="test"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <l:data>TEST</l:data>
+ <xsl:template match="/">
+ <l:result>
+ <xsl:for-each select="*/*">
+ <l:test><xsl:copy-of select="document('')//l:data/text()"/></l:test>
+ </xsl:for-each>
+ </l:result>
+ </xsl:template>
+</xsl:stylesheet>
+"""))
+ transform(root)
+
+if __name__ == '__main__':
+ benchbase.main(XSLTBenchMark)
diff --git a/benchmark/benchbase.py b/benchmark/benchbase.py
new file mode 100644
index 0000000..e34e610
--- /dev/null
+++ b/benchmark/benchbase.py
@@ -0,0 +1,541 @@
+import sys, re, string, time, copy, gc
+from itertools import *
+import time
+
+try:
+ izip
+except NameError:
+ izip = zip # Py3
+
+def exec_(code, glob):
+ if sys.version_info[0] >= 3:
+ exec(code, glob)
+ else:
+ exec("exec code in glob")
+
+
+TREE_FACTOR = 1 # increase tree size with '-l' / '-L' cmd option
+
+_TEXT = "some ASCII text" * TREE_FACTOR
+_UTEXT = u"some klingon: \F8D2" * TREE_FACTOR
+_ATTRIBUTES = {
+ '{attr}test1' : _TEXT,
+ '{attr}test2' : _TEXT,
+ 'bla1' : _TEXT,
+ 'bla2' : _TEXT,
+ 'bla3' : _TEXT
+ }
+
+
+def initArgs(argv):
+ global TREE_FACTOR
+ try:
+ argv.remove('-l')
+ # use large trees
+ TREE_FACTOR *= 2
+ except ValueError:
+ pass
+
+ try:
+ argv.remove('-L')
+ # use LARGE trees
+ TREE_FACTOR *= 2
+ except ValueError:
+ pass
+
+############################################################
+# benchmark decorators
+############################################################
+
+def with_attributes(*use_attributes):
+ "Decorator for benchmarks that use attributes"
+ vmap = {False : 0, True : 1}
+ values = [ vmap[bool(v)] for v in use_attributes ]
+ def set_value(function):
+ try:
+ function.ATTRIBUTES.update(values)
+ except AttributeError:
+ function.ATTRIBUTES = set(values)
+ return function
+ return set_value
+
+def with_text(no_text=False, text=False, utext=False):
+ "Decorator for benchmarks that use text"
+ values = []
+ if no_text:
+ values.append(0)
+ if text:
+ values.append(1)
+ if utext:
+ values.append(2)
+ def set_value(function):
+ try:
+ function.TEXT.update(values)
+ except AttributeError:
+ function.TEXT = set(values)
+ return function
+ return set_value
+
+def onlylib(*libs):
+ "Decorator to restrict benchmarks to specific libraries"
+ def set_libs(function):
+ if libs:
+ function.LIBS = libs
+ return function
+ return set_libs
+
+def serialized(function):
+ "Decorator for benchmarks that require serialized XML data"
+ function.STRING = True
+ return function
+
+def children(function):
+ "Decorator for benchmarks that require a list of root children"
+ function.CHILDREN = True
+ return function
+
+def nochange(function):
+ "Decorator for benchmarks that do not change the XML tree"
+ function.NO_CHANGE = True
+ return function
+
+############################################################
+# benchmark baseclass
+############################################################
+
+class SkippedTest(Exception):
+ pass
+
+class TreeBenchMark(object):
+ atoz = string.ascii_lowercase
+ repeat100 = range(100)
+ repeat500 = range(500)
+ repeat1000 = range(1000)
+
+ _LIB_NAME_MAP = {
+ 'etree' : 'lxe',
+ 'ElementTree' : 'ET',
+ 'cElementTree' : 'cET'
+ }
+
+ SEARCH_TAG = "{cdefg}a00001"
+
+ def __init__(self, etree, etree_parser=None):
+ self.etree = etree
+ libname = etree.__name__.split('.')[-1]
+ self.lib_name = self._LIB_NAME_MAP.get(libname, libname)
+
+ if libname == 'etree':
+ deepcopy = copy.deepcopy
+ def set_property(root, fname):
+ xml = self._serialize_tree(root)
+ if etree_parser is not None:
+ setattr(self, fname, lambda : etree.XML(xml, etree_parser))
+ else:
+ setattr(self, fname, lambda : deepcopy(root))
+ setattr(self, fname + '_xml', lambda : xml)
+ setattr(self, fname + '_children', lambda : root[:])
+ else:
+ def set_property(root, fname):
+ setattr(self, fname, self.et_make_clone_factory(root))
+ xml = self._serialize_tree(root)
+ setattr(self, fname + '_xml', lambda : xml)
+ setattr(self, fname + '_children', lambda : root[:])
+
+ attribute_list = list(enumerate( [{}, _ATTRIBUTES] ))
+ text_list = list(enumerate( [None, _TEXT, _UTEXT] ))
+ build_name = self._tree_builder_name
+
+ self.setup_times = []
+ for tree in self._all_trees():
+ times = []
+ self.setup_times.append(times)
+ setup = getattr(self, '_setup_tree%d' % tree)
+ for an, attributes in attribute_list:
+ for tn, text in text_list:
+ root, t = setup(text, attributes)
+ times.append(t)
+ set_property(root, build_name(tree, tn, an))
+
+ def _tree_builder_name(self, tree, tn, an):
+ return '_root%d_T%d_A%d' % (tree, tn, an)
+
+ def tree_builder(self, tree, tn, an, serial, children):
+ name = self._tree_builder_name(tree, tn, an)
+ if serial:
+ name += '_xml'
+ elif children:
+ name += '_children'
+ return getattr(self, name)
+
+ def _serialize_tree(self, root):
+ return self.etree.tostring(root, encoding='UTF-8')
+
+ def et_make_clone_factory(self, elem):
+ def generate_elem(append, elem, level):
+ var = "e" + str(level)
+ arg = repr(elem.tag)
+ if elem.attrib:
+ arg += ", **%r" % elem.attrib
+ if level == 1:
+ append(" e1 = Element(%s)" % arg)
+ else:
+ append(" %s = SubElement(e%d, %s)" % (var, level-1, arg))
+ if elem.text:
+ append(" %s.text = %r" % (var, elem.text))
+ if elem.tail:
+ append(" %s.tail = %r" % (var, elem.tail))
+ for e in elem:
+ generate_elem(append, e, level+1)
+ # generate code for a function that creates a tree
+ output = ["def element_factory():"]
+ generate_elem(output.append, elem, 1)
+ output.append(" return e1")
+ # setup global function namespace
+ namespace = {
+ "Element" : self.etree.Element,
+ "SubElement" : self.etree.SubElement
+ }
+
+ # create function object
+ exec_("\n".join(output), namespace)
+ return namespace["element_factory"]
+
+ def _all_trees(self):
+ all_trees = []
+ for name in dir(self):
+ if name.startswith('_setup_tree'):
+ all_trees.append(int(name[11:]))
+ return all_trees
+
+ def _setup_tree1(self, text, attributes):
+ "tree with 26 2nd level and 520 * TREE_FACTOR 3rd level children"
+ atoz = self.atoz
+ SubElement = self.etree.SubElement
+ current_time = time.time
+ t = current_time()
+ root = self.etree.Element('{abc}rootnode')
+ for ch1 in atoz:
+ el = SubElement(root, "{abc}"+ch1*5, attributes)
+ el.text = text
+ for ch2 in atoz:
+ tag = "{cdefg}%s00001" % ch2
+ for i in range(20 * TREE_FACTOR):
+ SubElement(el, tag).tail = text
+ t = current_time() - t
+ return root, t
+
+ def _setup_tree2(self, text, attributes):
+ "tree with 520 * TREE_FACTOR 2nd level and 26 3rd level children"
+ atoz = self.atoz
+ SubElement = self.etree.SubElement
+ current_time = time.time
+ t = current_time()
+ root = self.etree.Element('{abc}rootnode')
+ for ch1 in atoz:
+ for i in range(20 * TREE_FACTOR):
+ el = SubElement(root, "{abc}"+ch1*5, attributes)
+ el.text = text
+ for ch2 in atoz:
+ SubElement(el, "{cdefg}%s00001" % ch2).tail = text
+ t = current_time() - t
+ return root, t
+
+ def _setup_tree3(self, text, attributes):
+ "tree of depth 8 + TREE_FACTOR with 3 children per node"
+ SubElement = self.etree.SubElement
+ current_time = time.time
+ t = current_time()
+ root = self.etree.Element('{abc}rootnode')
+ children = [root]
+ for i in range(6 + TREE_FACTOR):
+ children = [ SubElement(c, "{cdefg}a%05d" % (i%8), attributes)
+ for i,c in enumerate(chain(children, children, children)) ]
+ for child in children:
+ child.text = text
+ child.tail = text
+ t = current_time() - t
+ return root, t
+
+ def _setup_tree4(self, text, attributes):
+ "small tree with 26 2nd level and 2 3rd level children"
+ SubElement = self.etree.SubElement
+ current_time = time.time
+ t = current_time()
+ root = self.etree.Element('{abc}rootnode')
+ for ch1 in self.atoz:
+ el = SubElement(root, "{abc}"+ch1*5, attributes)
+ el.text = text
+ SubElement(el, "{cdefg}a00001", attributes).tail = text
+ SubElement(el, "{cdefg}z00000", attributes).tail = text
+ t = current_time() - t
+ return root, t
+
+ def benchmarks(self):
+ """Returns a list of all benchmarks.
+
+ A benchmark is a tuple containing a method name and a list of tree
+ numbers. Trees are prepared by the setup function.
+ """
+ all_trees = self._all_trees()
+ benchmarks = []
+ for name in dir(self):
+ if not name.startswith('bench_'):
+ continue
+ method = getattr(self, name)
+ if hasattr(method, 'LIBS') and self.lib_name not in method.LIBS:
+ method_call = None
+ else:
+ method_call = method
+ if method.__doc__:
+ tree_sets = method.__doc__.split()
+ else:
+ tree_sets = ()
+ if tree_sets:
+ tree_tuples = [list(map(int, tree_set.split(',')))
+ for tree_set in tree_sets]
+ else:
+ try:
+ arg_count = method.func_code.co_argcount - 1
+ except AttributeError:
+ try:
+ arg_count = method.__code__.co_argcount - 1
+ except AttributeError:
+ arg_count = 1
+ tree_tuples = self._permutations(all_trees, arg_count)
+
+ serialized = getattr(method, 'STRING', False)
+ children = getattr(method, 'CHILDREN', False)
+ no_change = getattr(method, 'NO_CHANGE', False)
+
+ for tree_tuple in tree_tuples:
+ for tn in sorted(getattr(method, 'TEXT', (0,))):
+ for an in sorted(getattr(method, 'ATTRIBUTES', (0,))):
+ benchmarks.append((name, method_call, tree_tuple,
+ tn, an, serialized, children,
+ no_change))
+
+ return benchmarks
+
+ def _permutations(self, seq, count):
+ def _permutations(prefix, remainder, count):
+ if count == 0:
+ return [ prefix[:] ]
+ count -= 1
+ perms = []
+ prefix.append(None)
+ for pos, el in enumerate(remainder):
+ new_remainder = remainder[:pos] + remainder[pos+1:]
+ prefix[-1] = el
+ perms.extend( _permutations(prefix, new_remainder, count) )
+ prefix.pop()
+ return perms
+ return _permutations([], seq, count)
+
+############################################################
+# Prepare and run benchmark suites
+############################################################
+
+def buildSuites(benchmark_class, etrees, selected):
+ benchmark_suites = list(map(benchmark_class, etrees))
+
+ # sorted by name and tree tuple
+ benchmarks = [ sorted(b.benchmarks()) for b in benchmark_suites ]
+
+ selected = [ re.compile(r).search for r in selected ]
+
+ if selected:
+ benchmarks = [ [ b for b in bs
+ if [ match for match in selected
+ if match(b[0]) ] ]
+ for bs in benchmarks ]
+
+ return benchmark_suites, benchmarks
+
+def build_treeset_name(trees, tn, an, serialized, children):
+ text = {0:'-', 1:'S', 2:'U'}[tn]
+ attr = {0:'-', 1:'A'}[an]
+ ser = {True:'X', False:'T'}[serialized]
+ chd = {True:'C', False:'R'}[children]
+ return "%s%s%s%s T%s" % (text, attr, ser, chd, ',T'.join(map(str, trees))[:6])
+
+def printSetupTimes(benchmark_suites):
+ print("Setup times for trees in seconds:")
+ for b in benchmark_suites:
+ sys.stdout.write("%-3s: " % b.lib_name)
+ for an in (0,1):
+ for tn in (0,1,2):
+ sys.stdout.write(' %s ' %
+ build_treeset_name((), tn, an, False, False)[:2])
+ print('')
+ for i, tree_times in enumerate(b.setup_times):
+ print(" T%d: %s" % (i+1, ' '.join("%6.4f" % t for t in tree_times)))
+ print('')
+
+def runBench(suite, method_name, method_call, tree_set, tn, an,
+ serial, children, no_change):
+ if method_call is None:
+ raise SkippedTest
+
+ current_time = time.time
+ call_repeat = range(10)
+
+ tree_builders = [ suite.tree_builder(tree, tn, an, serial, children)
+ for tree in tree_set ]
+
+ rebuild_trees = not no_change and not serial
+
+ args = tuple([ build() for build in tree_builders ])
+ method_call(*args) # run once to skip setup overhead
+
+ times = []
+ for i in range(3):
+ gc.collect()
+ gc.disable()
+ t = -1
+ for i in call_repeat:
+ if rebuild_trees:
+ args = [ build() for build in tree_builders ]
+ t_one_call = current_time()
+ method_call(*args)
+ t_one_call = current_time() - t_one_call
+ if t < 0:
+ t = t_one_call
+ else:
+ t = min(t, t_one_call)
+ times.append(1000.0 * t)
+ gc.enable()
+ if rebuild_trees:
+ args = ()
+ args = ()
+ gc.collect()
+ return times
+
+
+def runBenchmarks(benchmark_suites, benchmarks):
+ for bench_calls in izip(*benchmarks):
+ for lib, (bench, benchmark_setup) in enumerate(izip(benchmark_suites, bench_calls)):
+ bench_name = benchmark_setup[0]
+ tree_set_name = build_treeset_name(*benchmark_setup[-6:-1])
+ sys.stdout.write("%-3s: %-28s (%-10s) " % (
+ bench.lib_name, bench_name[6:34], tree_set_name))
+ sys.stdout.flush()
+
+ try:
+ result = runBench(bench, *benchmark_setup)
+ except SkippedTest:
+ print("skipped")
+ except KeyboardInterrupt:
+ print("interrupted by user")
+ sys.exit(1)
+ except Exception:
+ exc_type, exc_value = sys.exc_info()[:2]
+ print("failed: %s: %s" % (exc_type.__name__, exc_value))
+ exc_type = exc_value = None
+ else:
+ print("%9.4f msec/pass, best of (%s)" % (
+ min(result), ' '.join("%9.4f" % t for t in result)))
+
+ if len(benchmark_suites) > 1:
+ print('') # empty line between different benchmarks
+
+############################################################
+# Main program
+############################################################
+
+def main(benchmark_class):
+ import_lxml = True
+ callgrind_zero = False
+ if len(sys.argv) > 1:
+ try:
+ sys.argv.remove('-i')
+ # run benchmark 'inplace'
+ sys.path.insert(0, 'src')
+ except ValueError:
+ pass
+
+ try:
+ sys.argv.remove('-nolxml')
+ # run without lxml
+ import_lxml = False
+ except ValueError:
+ pass
+
+ try:
+ sys.argv.remove('-z')
+ # reset callgrind after tree setup
+ callgrind_zero = True
+ except ValueError:
+ pass
+
+ initArgs(sys.argv)
+
+ _etrees = []
+ if import_lxml:
+ from lxml import etree
+ _etrees.append(etree)
+
+ try:
+ sys.argv.remove('-fel')
+ except ValueError:
+ pass
+ else:
+ # use fast element creation in lxml.etree
+ etree.set_element_class_lookup(
+ etree.ElementDefaultClassLookup())
+
+ if len(sys.argv) > 1:
+ if '-a' in sys.argv or '-c' in sys.argv:
+ # 'all' or 'C-implementations' ?
+ try:
+ sys.argv.remove('-c')
+ except ValueError:
+ pass
+ try:
+ import cElementTree as cET
+ _etrees.append(cET)
+ except ImportError:
+ try:
+ import xml.etree.cElementTree as cET
+ _etrees.append(cET)
+ except ImportError:
+ pass
+
+ try:
+ # 'all' ?
+ sys.argv.remove('-a')
+ except ValueError:
+ pass
+ else:
+ try:
+ from elementtree import ElementTree as ET
+ _etrees.append(ET)
+ except ImportError:
+ try:
+ from xml.etree import ElementTree as ET
+ _etrees.append(ET)
+ except ImportError:
+ pass
+
+ if not _etrees:
+ print("No library to test. Exiting.")
+ sys.exit(1)
+
+ print("Preparing test suites and trees ...")
+ selected = set( sys.argv[1:] )
+ benchmark_suites, benchmarks = \
+ buildSuites(benchmark_class, _etrees, selected)
+
+ print("Running benchmark on", ', '.join(b.lib_name
+ for b in benchmark_suites))
+ print('')
+
+ printSetupTimes(benchmark_suites)
+
+ if callgrind_zero:
+ cmd = open("callgrind.cmd", 'w')
+ cmd.write('+Instrumentation\n')
+ cmd.write('Zero\n')
+ cmd.close()
+
+ runBenchmarks(benchmark_suites, benchmarks)
diff --git a/bisect_crashes.py b/bisect_crashes.py
new file mode 100644
index 0000000..7a3fe6c
--- /dev/null
+++ b/bisect_crashes.py
@@ -0,0 +1,66 @@
+
+import os
+import sys
+import unittest
+
+# make sure we import test.py from the right place
+script_path = os.path.abspath(os.path.dirname(sys.argv[0]))
+sys.path.insert(0, script_path)
+
+test_base_path = os.path.join(script_path, 'src')
+sys.path.insert(1, test_base_path)
+
+import test
+from DD import DD
+
+cfg = test.Options()
+cfg.verbosity = 0
+cfg.basedir = test_base_path
+cfg.unit_tests = True
+
+def write(line, *args):
+ if args:
+ line = line % args
+ sys.stderr.write(line + '\n')
+
+
+def find_tests():
+ test_files = test.get_test_files(cfg)
+ return test.get_test_cases(test_files, cfg)
+
+class DDTester(DD):
+ def _test(self, test_cases):
+ if not test_cases:
+ return self.PASS
+ write('Running subset of %d tests %s',
+ len(test_cases), self.coerce(test_cases))
+ test_cases = [ item[-1] for item in test_cases ]
+ pid = os.fork()
+ if not pid:
+ # child executes tests
+ runner = test.CustomTestRunner(cfg, None)
+ suite = unittest.TestSuite()
+ suite.addTests(test_cases)
+ os._exit( not runner.run(suite).wasSuccessful() )
+ cid, retval = os.waitpid(pid, 0)
+ if retval:
+ write('exit status: %d, signal: %d', retval >> 8, retval % 0xFF)
+ if (retval % 0xFF) > 2: # signal received?
+ return self.FAIL
+ return self.PASS
+
+ def coerce(self, test_cases):
+ if not test_cases:
+ return '[]'
+ test_cases = [ item[-1] for item in test_cases ]
+ return '[%s .. %s]' % (test_cases[0].id(), test_cases[-1].id())
+
+def dd_tests():
+ tests = find_tests()
+ write('Found %d tests', len(tests))
+ dd = DDTester()
+ min_tests = dd.ddmin( list(enumerate(tests)) )
+ return [ item[-1] for item in min_tests ]
+
+if __name__ == '__main__':
+ write('Failing tests:\n%s', '\n'.join([test.id() for test in dd_tests()]))
diff --git a/buildlibxml.py b/buildlibxml.py
new file mode 100644
index 0000000..f45c860
--- /dev/null
+++ b/buildlibxml.py
@@ -0,0 +1,466 @@
+import os, re, sys, subprocess
+import tarfile
+from distutils import log, version
+from contextlib import closing
+from ftplib import FTP
+
+try:
+ from urlparse import urljoin, unquote, urlparse
+ from urllib import urlretrieve, urlopen, urlcleanup
+except ImportError:
+ from urllib.parse import urljoin, unquote, urlparse
+ from urllib.request import urlretrieve, urlopen, urlcleanup
+
+multi_make_options = []
+try:
+ import multiprocessing
+ cpus = multiprocessing.cpu_count()
+ if cpus > 1:
+ if cpus > 5:
+ cpus = 5
+ multi_make_options = ['-j%d' % (cpus+1)]
+except:
+ pass
+
+
+# use pre-built libraries on Windows
+
+def download_and_extract_windows_binaries(destdir):
+ url = "https://github.com/mhils/libxml2-win-binaries/releases"
+ filenames = list(_list_dir_urllib(url))
+
+ release_path = "/download/%s/" % find_max_version(
+ "library release", filenames, re.compile(r"/releases/tag/([0-9.]+[0-9])$"))
+ url += release_path
+ filenames = [
+ filename.rsplit('/', 1)[1]
+ for filename in filenames
+ if release_path in filename
+ ]
+
+ arch = "win64" if sys.maxsize > 2**32 else "win32"
+ if sys.version_info < (3, 5):
+ arch = 'vs2008.' + arch
+
+ libs = {}
+ for libname in ['libxml2', 'libxslt', 'zlib', 'iconv']:
+ libs[libname] = "%s-%s.%s.zip" % (
+ libname,
+ find_max_version(libname, filenames),
+ arch,
+ )
+
+ if not os.path.exists(destdir):
+ os.makedirs(destdir)
+
+ for libname, libfn in libs.items():
+ srcfile = urljoin(url, libfn)
+ destfile = os.path.join(destdir, libfn)
+ if os.path.exists(destfile + ".keep"):
+ print('Using local copy of "{}"'.format(srcfile))
+ else:
+ print('Retrieving "%s" to "%s"' % (srcfile, destfile))
+ urlcleanup() # work around FTP bug 27973 in Py2.7.12+
+ urlretrieve(srcfile, destfile)
+ d = unpack_zipfile(destfile, destdir)
+ libs[libname] = d
+
+ return libs
+
+
+def find_top_dir_of_zipfile(zipfile):
+ topdir = None
+ files = [f.filename for f in zipfile.filelist]
+ dirs = [d for d in files if d.endswith('/')]
+ if dirs:
+ dirs.sort(key=len)
+ topdir = dirs[0]
+ topdir = topdir[:topdir.index("/")+1]
+ for path in files:
+ if not path.startswith(topdir):
+ topdir = None
+ break
+ assert topdir, (
+ "cannot determine single top-level directory in zip file %s" %
+ zipfile.filename)
+ return topdir.rstrip('/')
+
+
+def unpack_zipfile(zipfn, destdir):
+ assert zipfn.endswith('.zip')
+ import zipfile
+ print('Unpacking %s into %s' % (os.path.basename(zipfn), destdir))
+ f = zipfile.ZipFile(zipfn)
+ try:
+ extracted_dir = os.path.join(destdir, find_top_dir_of_zipfile(f))
+ f.extractall(path=destdir)
+ finally:
+ f.close()
+ assert os.path.exists(extracted_dir), 'missing: %s' % extracted_dir
+ return extracted_dir
+
+
+def get_prebuilt_libxml2xslt(download_dir, static_include_dirs, static_library_dirs):
+ assert sys.platform.startswith('win')
+ libs = download_and_extract_windows_binaries(download_dir)
+ for libname, path in libs.items():
+ i = os.path.join(path, 'include')
+ l = os.path.join(path, 'lib')
+ assert os.path.exists(i), 'does not exist: %s' % i
+ assert os.path.exists(l), 'does not exist: %s' % l
+ static_include_dirs.append(i)
+ static_library_dirs.append(l)
+
+
+## Routines to download and build libxml2/xslt from sources:
+
+LIBXML2_LOCATION = 'http://xmlsoft.org/sources/'
+LIBICONV_LOCATION = 'https://ftp.gnu.org/pub/gnu/libiconv/'
+ZLIB_LOCATION = 'https://zlib.net/'
+match_libfile_version = re.compile('^[^-]*-([.0-9-]+)[.].*').match
+
+
+def _find_content_encoding(response, default='iso8859-1'):
+ from email.message import Message
+ content_type = response.headers.get('Content-Type')
+ if content_type:
+ msg = Message()
+ msg.add_header('Content-Type', content_type)
+ charset = msg.get_content_charset(default)
+ else:
+ charset = default
+ return charset
+
+
+def remote_listdir(url):
+ try:
+ return _list_dir_urllib(url)
+ except IOError:
+ assert url.lower().startswith('ftp://')
+ print("Requesting with urllib failed. Falling back to ftplib. "
+ "Proxy argument will be ignored for %s" % url)
+ return _list_dir_ftplib(url)
+
+
+def _list_dir_ftplib(url):
+ parts = urlparse(url)
+ ftp = FTP(parts.netloc)
+ try:
+ ftp.login()
+ ftp.cwd(parts.path)
+ data = []
+ ftp.dir(data.append)
+ finally:
+ ftp.quit()
+ return parse_text_ftplist("\n".join(data))
+
+
+def _list_dir_urllib(url):
+ with closing(urlopen(url)) as res:
+ charset = _find_content_encoding(res)
+ content_type = res.headers.get('Content-Type')
+ data = res.read()
+
+ data = data.decode(charset)
+ if content_type and content_type.startswith('text/html'):
+ files = parse_html_filelist(data)
+ else:
+ files = parse_text_ftplist(data)
+ return files
+
+
+def http_listfiles(url, re_pattern):
+ with closing(urlopen(url)) as res:
+ charset = _find_content_encoding(res)
+ data = res.read()
+ files = re.findall(re_pattern, data.decode(charset))
+ return files
+
+
+def parse_text_ftplist(s):
+ for line in s.splitlines():
+ if not line.startswith('d'):
+ # -rw-r--r-- 1 ftp ftp 476 Sep 1 2011 md5sum.txt
+ # Last (9th) element is 'md5sum.txt' in the above example, but there
+ # may be variations, so we discard only the first 8 entries.
+ yield line.split(None, 8)[-1]
+
+
+def parse_html_filelist(s):
+ re_href = re.compile(
+ r'<a\s+(?:[^>]*\s+)?href=["\']([^;?"\']+?)[;?"\']',
+ re.I|re.M)
+ links = set(re_href.findall(s))
+ for link in links:
+ if not link.endswith('/'):
+ yield unquote(link)
+
+
+def tryint(s):
+ try:
+ return int(s)
+ except ValueError:
+ return s
+
+
+def download_libxml2(dest_dir, version=None):
+ """Downloads libxml2, returning the filename where the library was downloaded"""
+ #version_re = re.compile(r'LATEST_LIBXML2_IS_([0-9.]+[0-9](?:-[abrc0-9]+)?)')
+ version_re = re.compile(r'libxml2-([0-9.]+[0-9]).tar.gz')
+ filename = 'libxml2-%s.tar.gz'
+ return download_library(dest_dir, LIBXML2_LOCATION, 'libxml2',
+ version_re, filename, version=version)
+
+
+def download_libxslt(dest_dir, version=None):
+ """Downloads libxslt, returning the filename where the library was downloaded"""
+ #version_re = re.compile(r'LATEST_LIBXSLT_IS_([0-9.]+[0-9](?:-[abrc0-9]+)?)')
+ version_re = re.compile(r'libxslt-([0-9.]+[0-9]).tar.gz')
+ filename = 'libxslt-%s.tar.gz'
+ return download_library(dest_dir, LIBXML2_LOCATION, 'libxslt',
+ version_re, filename, version=version)
+
+
+def download_libiconv(dest_dir, version=None):
+ """Downloads libiconv, returning the filename where the library was downloaded"""
+ version_re = re.compile(r'libiconv-([0-9.]+[0-9]).tar.gz')
+ filename = 'libiconv-%s.tar.gz'
+ return download_library(dest_dir, LIBICONV_LOCATION, 'libiconv',
+ version_re, filename, version=version)
+
+
+def download_zlib(dest_dir, version):
+ """Downloads zlib, returning the filename where the library was downloaded"""
+ version_re = re.compile(r'zlib-([0-9.]+[0-9]).tar.gz')
+ filename = 'zlib-%s.tar.gz'
+ return download_library(dest_dir, ZLIB_LOCATION, 'zlib',
+ version_re, filename, version=version)
+
+
+def find_max_version(libname, filenames, version_re=None):
+ if version_re is None:
+ version_re = re.compile(r'%s-([0-9.]+[0-9](?:-[abrc0-9]+)?)' % libname)
+ versions = []
+ for fn in filenames:
+ match = version_re.search(fn)
+ if match:
+ version_string = match.group(1)
+ versions.append((tuple(map(tryint, version_string.split('.'))),
+ version_string))
+ if not versions:
+ raise Exception(
+ "Could not find the most current version of %s from the files: %s" % (
+ libname, filenames))
+ versions.sort()
+ version_string = versions[-1][-1]
+ print('Latest version of %s is %s' % (libname, version_string))
+ return version_string
+
+
+def download_library(dest_dir, location, name, version_re, filename, version=None):
+ if version is None:
+ try:
+ if location.startswith('ftp://'):
+ fns = remote_listdir(location)
+ else:
+ fns = http_listfiles(location, '(%s)' % filename.replace('%s', '(?:[0-9.]+[0-9])'))
+ version = find_max_version(name, fns, version_re)
+ except IOError:
+ # network failure - maybe we have the files already?
+ latest = (0,0,0)
+ fns = os.listdir(dest_dir)
+ for fn in fns:
+ if fn.startswith(name+'-'):
+ match = match_libfile_version(fn)
+ if match:
+ version_tuple = tuple(map(tryint, match.group(1).split('.')))
+ if version_tuple > latest:
+ latest = version_tuple
+ filename = fn
+ version = None
+ if latest == (0,0,0):
+ raise
+ if version:
+ filename = filename % version
+ full_url = urljoin(location, filename)
+ dest_filename = os.path.join(dest_dir, filename)
+ if os.path.exists(dest_filename):
+ print(('Using existing %s downloaded into %s '
+ '(delete this file if you want to re-download the package)') % (
+ name, dest_filename))
+ else:
+ print('Downloading %s into %s from %s' % (name, dest_filename, full_url))
+ urlcleanup() # work around FTP bug 27973 in Py2.7.12
+ urlretrieve(full_url, dest_filename)
+ return dest_filename
+
+
+def unpack_tarball(tar_filename, dest):
+ print('Unpacking %s into %s' % (os.path.basename(tar_filename), dest))
+ tar = tarfile.open(tar_filename)
+ base_dir = None
+ for member in tar:
+ base_name = member.name.split('/')[0]
+ if base_dir is None:
+ base_dir = base_name
+ elif base_dir != base_name:
+ print('Unexpected path in %s: %s' % (tar_filename, base_name))
+ tar.extractall(dest)
+ tar.close()
+ return os.path.join(dest, base_dir)
+
+
+def call_subprocess(cmd, **kw):
+ import subprocess
+ cwd = kw.get('cwd', '.')
+ cmd_desc = ' '.join(cmd)
+ log.info('Running "%s" in %s' % (cmd_desc, cwd))
+ returncode = subprocess.call(cmd, **kw)
+ if returncode:
+ raise Exception('Command "%s" returned code %s' % (cmd_desc, returncode))
+
+
+def safe_mkdir(dir):
+ if not os.path.exists(dir):
+ os.makedirs(dir)
+
+
+def cmmi(configure_cmd, build_dir, multicore=None, **call_setup):
+ print('Starting build in %s' % build_dir)
+ call_subprocess(configure_cmd, cwd=build_dir, **call_setup)
+ if not multicore:
+ make_jobs = multi_make_options
+ elif int(multicore) > 1:
+ make_jobs = ['-j%s' % multicore]
+ else:
+ make_jobs = []
+ call_subprocess(
+ ['make'] + make_jobs,
+ cwd=build_dir, **call_setup)
+ call_subprocess(
+ ['make'] + make_jobs + ['install'],
+ cwd=build_dir, **call_setup)
+
+
+def configure_darwin_env(env_setup):
+ import platform
+ # configure target architectures on MacOS-X (x86_64 only, by default)
+ major_version, minor_version = tuple(map(int, platform.mac_ver()[0].split('.')[:2]))
+ if major_version > 7:
+ env_default = {
+ 'CFLAGS': "-arch x86_64 -O2",
+ 'LDFLAGS': "-arch x86_64",
+ 'MACOSX_DEPLOYMENT_TARGET': "10.6"
+ }
+ env_default.update(os.environ)
+ env_setup['env'] = env_default
+
+
+def build_libxml2xslt(download_dir, build_dir,
+ static_include_dirs, static_library_dirs,
+ static_cflags, static_binaries,
+ libxml2_version=None,
+ libxslt_version=None,
+ libiconv_version=None,
+ zlib_version=None,
+ multicore=None):
+ safe_mkdir(download_dir)
+ safe_mkdir(build_dir)
+ zlib_dir = unpack_tarball(download_zlib(download_dir, zlib_version), build_dir)
+ libiconv_dir = unpack_tarball(download_libiconv(download_dir, libiconv_version), build_dir)
+ libxml2_dir = unpack_tarball(download_libxml2(download_dir, libxml2_version), build_dir)
+ libxslt_dir = unpack_tarball(download_libxslt(download_dir, libxslt_version), build_dir)
+ prefix = os.path.join(os.path.abspath(build_dir), 'libxml2')
+ lib_dir = os.path.join(prefix, 'lib')
+ safe_mkdir(prefix)
+
+ lib_names = ['libxml2', 'libexslt', 'libxslt', 'iconv', 'libz']
+ existing_libs = {
+ lib: os.path.join(lib_dir, filename)
+ for lib in lib_names
+ for filename in os.listdir(lib_dir)
+ if lib in filename and filename.endswith('.a')
+ } if os.path.isdir(lib_dir) else {}
+
+ def has_current_lib(name, build_dir, _build_all_following=[False]):
+ if _build_all_following[0]:
+ return False # a dependency was rebuilt => rebuild this lib as well
+ lib_file = existing_libs.get(name)
+ found = lib_file and os.path.getmtime(lib_file) > os.path.getmtime(build_dir)
+ if found:
+ print("Found pre-built '%s'" % name)
+ else:
+ # also rebuild all following libs (which may depend on this one)
+ _build_all_following[0] = True
+ return found
+
+ call_setup = {}
+ if sys.platform == 'darwin':
+ configure_darwin_env(call_setup)
+
+ configure_cmd = ['./configure',
+ '--disable-dependency-tracking',
+ '--disable-shared',
+ '--prefix=%s' % prefix,
+ ]
+
+ # build zlib
+ zlib_configure_cmd = [
+ './configure',
+ '--prefix=%s' % prefix,
+ ]
+ if not has_current_lib("libz", zlib_dir):
+ cmmi(zlib_configure_cmd, zlib_dir, multicore, **call_setup)
+
+ # build libiconv
+ if not has_current_lib("iconv", libiconv_dir):
+ cmmi(configure_cmd, libiconv_dir, multicore, **call_setup)
+
+ # build libxml2
+ libxml2_configure_cmd = configure_cmd + [
+ '--without-python',
+ '--with-iconv=%s' % prefix,
+ '--with-zlib=%s' % prefix,
+ ]
+
+ if not libxml2_version:
+ libxml2_version = os.path.basename(libxml2_dir).split('-', 1)[-1]
+
+ if tuple(map(tryint, libxml2_version.split('-', 1)[0].split('.'))) >= (2, 9, 5):
+ libxml2_configure_cmd.append('--without-lzma') # can't currently build that
+
+ try:
+ if tuple(map(tryint, libxml2_version.split('-', 1)[0].split('.'))) >= (2, 7, 3):
+ libxml2_configure_cmd.append('--enable-rebuild-docs=no')
+ except Exception:
+ pass # this isn't required, so ignore any errors
+ if not has_current_lib("libxml2", libxml2_dir):
+ cmmi(libxml2_configure_cmd, libxml2_dir, multicore, **call_setup)
+
+ # build libxslt
+ libxslt_configure_cmd = configure_cmd + [
+ '--without-python',
+ '--with-libxml-prefix=%s' % prefix,
+ '--without-crypto',
+ ]
+ if not (has_current_lib("libxslt", libxslt_dir) and has_current_lib("libexslt", libxslt_dir)):
+ cmmi(libxslt_configure_cmd, libxslt_dir, multicore, **call_setup)
+
+ # collect build setup for lxml
+ xslt_config = os.path.join(prefix, 'bin', 'xslt-config')
+ xml2_config = os.path.join(prefix, 'bin', 'xml2-config')
+
+ static_include_dirs.extend([
+ os.path.join(prefix, 'include'),
+ os.path.join(prefix, 'include', 'libxml2'),
+ os.path.join(prefix, 'include', 'libxslt'),
+ os.path.join(prefix, 'include', 'libexslt')])
+ static_library_dirs.append(lib_dir)
+
+ listdir = os.listdir(lib_dir)
+ static_binaries += [os.path.join(lib_dir, filename)
+ for lib in lib_names
+ for filename in listdir
+ if lib in filename and filename.endswith('.a')]
+
+ return xml2_config, xslt_config
diff --git a/changelog b/debian/changelog
index 3baf07e..3baf07e 100644
--- a/changelog
+++ b/debian/changelog
diff --git a/compat b/debian/compat
index f599e28..f599e28 100644
--- a/compat
+++ b/debian/compat
diff --git a/control b/debian/control
index ab21814..ab21814 100644
--- a/control
+++ b/debian/control
diff --git a/copyright b/debian/copyright
index 7da5fe0..7da5fe0 100644
--- a/copyright
+++ b/debian/copyright
diff --git a/python-lxml-doc.doc-base b/debian/python-lxml-doc.doc-base
index d10a666..d10a666 100644
--- a/python-lxml-doc.doc-base
+++ b/debian/python-lxml-doc.doc-base
diff --git a/rules b/debian/rules
index 0679f79..0679f79 100755
--- a/rules
+++ b/debian/rules
diff --git a/source/format b/debian/source/format
index 163aaf8..163aaf8 100644
--- a/source/format
+++ b/debian/source/format
diff --git a/watch b/debian/watch
index 67deb90..67deb90 100644
--- a/watch
+++ b/debian/watch
diff --git a/doc/FAQ.txt b/doc/FAQ.txt
new file mode 100644
index 0000000..24ec8c4
--- /dev/null
+++ b/doc/FAQ.txt
@@ -0,0 +1,1279 @@
+=====================================
+lxml FAQ - Frequently Asked Questions
+=====================================
+
+.. meta::
+ :description: Frequently Asked Questions about lxml (FAQ)
+ :keywords: lxml, lxml.etree, FAQ, frequently asked questions
+
+Frequently asked questions on lxml. See also the notes on compatibility_ to
+ElementTree_.
+
+.. _compatibility: compatibility.html
+.. _ElementTree: http://effbot.org/zone/element-index.htm
+.. _`build instructions`: build.html
+.. _`MacOS-X` : build.html#building-lxml-on-macos-x
+
+.. contents::
+..
+ 1 General Questions
+ 1.1 Is there a tutorial?
+ 1.2 Where can I find more documentation about lxml?
+ 1.3 What standards does lxml implement?
+ 1.4 Who uses lxml?
+ 1.5 What is the difference between lxml.etree and lxml.objectify?
+ 1.6 How can I make my application run faster?
+ 1.7 What about that trailing text on serialised Elements?
+ 1.8 How can I find out if an Element is a comment or PI?
+ 1.9 How can I map an XML tree into a dict of dicts?
+ 1.10 Why does lxml sometimes return 'str' values for text in Python 2?
+ 1.11 Why do I get XInclude or DTD lookup failures on some systems but not on others?
+ 1.12 How do namespaces work in lxml?
+ 2 Installation
+ 2.1 Which version of libxml2 and libxslt should I use or require?
+ 2.2 Where are the binary builds?
+ 2.3 Why do I get errors about missing UCS4 symbols when installing lxml?
+ 2.4 My C compiler crashes on installation
+ 3 Contributing
+ 3.1 Why is lxml not written in Python?
+ 3.2 How can I contribute?
+ 4 Bugs
+ 4.1 My application crashes!
+ 4.2 My application crashes on MacOS-X!
+ 4.3 I think I have found a bug in lxml. What should I do?
+ 4.4 How do I know a bug is really in lxml and not in libxml2?
+ 5 Threading
+ 5.1 Can I use threads to concurrently access the lxml API?
+ 5.2 Does my program run faster if I use threads?
+ 5.3 Would my single-threaded program run faster if I turned off threading?
+ 5.4 Why can't I reuse XSLT stylesheets in other threads?
+ 5.5 My program crashes when run with mod_python/Pyro/Zope/Plone/...
+ 6 Parsing and Serialisation
+ 6.1 Why doesn't the ``pretty_print`` option reformat my XML output?
+ 6.2 Why can't lxml parse my XML from unicode strings?
+ 6.3 Can lxml parse from file objects opened in unicode mode?
+ 6.4 What is the difference between str(xslt(doc)) and xslt(doc).write() ?
+ 6.5 Why can't I just delete parents or clear the root node in iterparse()?
+ 6.6 How do I output null characters in XML text?
+ 6.7 Is lxml vulnerable to XML bombs?
+ 6.8 How do I configure lxml safely as a web-service endpoint?
+ 6.9 How can I sort the attributes?
+ 7 XPath and Document Traversal
+ 7.1 What are the ``findall()`` and ``xpath()`` methods on Element(Tree)?
+ 7.2 Why doesn't ``findall()`` support full XPath expressions?
+ 7.3 How can I find out which namespace prefixes are used in a document?
+ 7.4 How can I specify a default namespace for XPath expressions?
+ 7.5 How can I modify the tree during iteration?
+
+
+The code examples below use the ``lxml.etree`` module:
+
+.. sourcecode:: pycon
+
+ >>> from lxml import etree
+
+..
+ >>> import sys
+ >>> _etree = etree
+ >>> if sys.version_info[0] >= 3:
+ ... class etree_mock(object):
+ ... def __getattr__(self, name): return getattr(_etree, name)
+ ... def tostring(self, *args, **kwargs):
+ ... s = _etree.tostring(*args, **kwargs)
+ ... if isinstance(s, bytes): s = s.decode("utf-8") # CR
+ ... if s[-1] == '\n': s = s[:-1]
+ ... return s
+ ... else:
+ ... class etree_mock(object):
+ ... def __getattr__(self, name): return getattr(_etree, name)
+ ... def tostring(self, *args, **kwargs):
+ ... s = _etree.tostring(*args, **kwargs)
+ ... if s[-1] == '\n': s = s[:-1]
+ ... return s
+ >>> etree = etree_mock()
+
+
+General Questions
+=================
+
+Is there a tutorial?
+--------------------
+
+Read the `lxml.etree Tutorial`_. While this is still work in progress
+(just as any good documentation), it provides an overview of the most
+important concepts in ``lxml.etree``. If you want to help out,
+improving the tutorial is a very good place to start.
+
+There is also a `tutorial for ElementTree`_ which works for
+``lxml.etree``. The documentation of the `extended etree API`_ also
+contains many examples for ``lxml.etree``. Fredrik Lundh's `element
+library`_ contains a lot of nice recipes that show how to solve common
+tasks in ElementTree and lxml.etree. To learn using
+``lxml.objectify``, read the `objectify documentation`_.
+
+John Shipman has written another tutorial called `Python XML
+processing with lxml`_ that contains lots of examples. Liza Daly
+wrote a nice article about high-performance aspects when `parsing
+large files with lxml`_.
+
+.. _`lxml.etree Tutorial`: tutorial.html
+.. _`tutorial for ElementTree`: https://effbot.org/zone/element.htm
+.. _`extended etree API`: api.html
+.. _`objectify documentation`: objectify.html
+.. _`Python XML processing with lxml`: http://www.nmt.edu/tcc/help/pubs/pylxml/
+.. _`element library`: https://effbot.org/zone/element-lib.htm
+.. _`parsing large files with lxml`: http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
+
+
+Where can I find more documentation about lxml?
+-----------------------------------------------
+
+There is a lot of documentation on the web and also in the Python
+standard library documentation, as lxml implements the well-known
+`ElementTree API`_ and tries to follow its documentation as closely as
+possible. The recipes in Fredrik Lundh's `element library`_ are
+generally worth taking a look at. There are a couple of issues where
+lxml cannot keep up compatibility. They are described in the
+compatibility_ documentation.
+
+The lxml specific extensions to the API are described by individual
+files in the ``doc`` directory of the source distribution and on `the
+web page`_.
+
+The `generated API documentation`_ is a comprehensive API reference
+for the lxml package.
+
+.. _`ElementTree API`: https://effbot.org/zone/element-index.htm
+.. _`the web page`: https://lxml.de/#documentation
+.. _`generated API documentation`: api/index.html
+
+
+What standards does lxml implement?
+-----------------------------------
+
+The compliance to XML Standards depends on the support in libxml2 and libxslt.
+Here is a quote from `http://xmlsoft.org/ <http://xmlsoft.org/>`_:
+
+ In most cases libxml2 tries to implement the specifications in a relatively
+ strictly compliant way. As of release 2.4.16, libxml2 passed all 1800+ tests
+ from the OASIS XML Tests Suite.
+
+lxml currently supports libxml2 2.6.20 or later, which has even better
+support for various XML standards. The important ones are:
+
+* XML 1.0
+* HTML 4
+* XML namespaces
+* XML Schema 1.0
+* XPath 1.0
+* XInclude 1.0
+* XSLT 1.0
+* EXSLT
+* XML catalogs
+* canonical XML
+* RelaxNG
+* xml:id
+* xml:base
+
+Support for XML Schema is currently not 100% complete in libxml2, but
+is definitely very close to compliance. Schematron is supported in
+two ways, the best being the original ISO Schematron reference
+implementation via XSLT. libxml2 also supports loading documents
+through HTTP and FTP.
+
+For `RelaxNG Compact Syntax <http://relaxng.org/compact-tutorial-20030326.html>`_
+support, there is a tool called `rnc2rng <http://www.gnosis.cx/download/relax/>`_,
+written by David Mertz, which you might be able to use from Python. Failing
+that, `trang <http://code.google.com/p/jing-trang/>`_ is the 'official'
+command line tool (written in Java) to do the conversion.
+
+
+Who uses lxml?
+--------------
+
+As an XML library, lxml is often used under the hood of in-house
+server applications, such as web servers or applications that
+facilitate some kind of content management. Many people who deploy
+Zope_, Plone_ or Django_ use it together with lxml in the background,
+without speaking publicly about it. Therefore, it is hard to get an
+idea of who uses it, and the following list of 'users and projects we
+know of' is very far from a complete list of lxml's users.
+
+Also note that compatibility with the ElementTree library does not
+require projects to set a hard dependency on lxml - as long as they do
+not take advantage of lxml's enhanced feature set.
+
+* `cssutils <http://code.google.com/p/cssutils/source/browse/trunk/examples/style.py?r=917>`_,
+ a CSS parser and toolkit, can be used with ``lxml.cssselect``
+* `Deliverance <http://www.openplans.org/projects/deliverance/project-home>`_,
+ a content theming tool
+* `Enfold Proxy 4 <http://www.enfoldsystems.com/Products/Proxy/4>`_,
+ a web server accelerator with on-the-fly XSLT processing
+* `Inteproxy <http://lists.wald.intevation.org/pipermail/inteproxy-devel/2007-February/000000.html>`_,
+ a secure HTTP proxy
+* `lwebstring <http://pypi.python.org/pypi/lwebstring>`_,
+ an XML template engine
+* `openpyxl <https://openpyxl.readthedocs.io/>`_,
+ a library to read/write MS Excel 2007 files
+* `OpenXMLlib <http://permalink.gmane.org/gmane.comp.python.lxml.devel/3250>`_,
+ a library for handling OpenXML document meta data
+* `PsychoPy <http://www.psychopy.org/>`_,
+ psychology software in Python
+* `Pycoon <http://pypi.python.org/pypi/pycoon>`_,
+ a WSGI web development framework based on XML pipelines
+* `pycsw <http://pycsw.org>`_,
+ an `OGC CSW <http://opengeospatial.org/standards/cat>`_ server implementation written in Python
+* `PyQuery <http://pypi.python.org/pypi/pyquery>`_,
+ a query framework for XML/HTML, similar to jQuery for JavaScript
+* `python-docx <http://github.com/mikemaccana/python-docx>`_,
+ a package for handling Microsoft's Word OpenXML format
+* `Rambler <https://www.rambler.ru/>`_,
+ news aggregator on Runet
+* `rdfadict <http://pypi.python.org/pypi/rdfadict>`_,
+ an RDFa parser with a simple dictionary-like interface.
+* `xupdate-processor <http://pypi.python.org/pypi/xupdate-processor>`_,
+ an XUpdate implementation for lxml.etree
+* `Diazo <http://docs.diazo.org/>`_,
+ an XSLT-under-the-hood web site theming engine
+
+Zope3 and some of its extensions have good support for lxml:
+
+* `gocept.lxml <http://pypi.python.org/pypi/gocept.lxml>`_,
+ Zope3 interface bindings for lxml
+* `z3c.rml <http://pypi.python.org/pypi/z3c.rml>`_,
+ an implementation of ReportLab's RML format
+* `zif.sedna <http://pypi.python.org/pypi/zif.sedna>`_,
+ an XQuery based interface to the Sedna OpenSource XML database
+
+And don't miss the quotes by our generally happy_ users_, and other
+`sites that link to lxml`_. As `Liza Daly`_ puts it: "Many software
+products come with the pick-two caveat, meaning that you must choose
+only two: speed, flexibility, or readability. When used carefully,
+lxml can provide all three."
+
+.. _Zope: http://www.zope.org/
+.. _Plone: http://www.plone.org/
+.. _Django: https://www.djangoproject.com/
+
+.. _happy: http://thread.gmane.org/gmane.comp.python.lxml.devel/3244/focus=3244
+.. _users: http://article.gmane.org/gmane.comp.python.lxml.devel/3246
+.. _`sites that link to lxml`: http://www.google.com/search?as_lq=http:%2F%2Flxml.de%2F
+.. _`Liza Daly`: http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
+
+
+What is the difference between lxml.etree and lxml.objectify?
+-------------------------------------------------------------
+
+The two modules provide different ways of handling XML. However, objectify
+builds on top of lxml.etree and therefore inherits most of its capabilities
+and a large portion of its API.
+
+* lxml.etree is a generic API for XML and HTML handling. It aims for
+ ElementTree compatibility_ and supports the entire XML infoset. It is well
+ suited for both mixed content and data centric XML. Its generality makes it
+ the best choice for most applications.
+
+* lxml.objectify is a specialized API for XML data handling in a Python object
+ syntax. It provides a very natural way to deal with data fields stored in a
+ structurally well defined XML format. Data is automatically converted to
+ Python data types and can be manipulated with normal Python operators. Look
+ at the examples in the `objectify documentation`_ to see what it feels like
+ to use it.
+
+ Objectify is not well suited for mixed content or HTML documents. As it is
+ built on top of lxml.etree, however, it inherits the normal support for
+ XPath, XSLT or validation.
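+
+For a first impression of the difference in feel, here is a minimal
+sketch (the element names are made up for this illustration):
+
+.. sourcecode:: pycon
+
+ >>> from lxml import objectify
+ >>> root = objectify.fromstring("<order><price>12.50</price></order>")
+ >>> root.price + 1
+ 13.5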
+
+
+How can I make my application run faster?
+-----------------------------------------
+
+lxml.etree is a very fast library for processing XML. There are, however, `a
+few caveats`_ involved in the mapping of the powerful libxml2 library to the
+simple and convenient ElementTree API. Not all operations are as fast as the
+simplicity of the API might suggest, while some use cases can heavily benefit
+from finding the right way of doing them. The `benchmark page`_ has a
+comparison to other ElementTree implementations and a number of tips for
+performance tweaking. As with any Python application, the rule of thumb is:
+the more of your processing runs in C, the faster your application gets. See
+also the section on threading_.
+
+.. _`a few caveats`: performance.html#the-elementtree-api
+.. _`benchmark page`: performance.html
+.. _threading: #threading
+
+
+What about that trailing text on serialised Elements?
+-----------------------------------------------------
+
+The ElementTree tree model defines an Element as a container with a tag name,
+contained text, child Elements and a tail text. This means that whenever you
+serialise an Element, you will get all parts of that Element:
+
+.. sourcecode:: pycon
+
+ >>> root = etree.XML("<root><tag>text<child/></tag>tail</root>")
+ >>> print(etree.tostring(root[0]))
+ <tag>text<child/></tag>tail
+
+Here is an example that shows why not serialising the tail would be
+even more surprising from an object point of view:
+
+.. sourcecode:: pycon
+
+ >>> root = etree.Element("test")
+
+ >>> root.text = "TEXT"
+ >>> print(etree.tostring(root))
+ <test>TEXT</test>
+
+ >>> root.tail = "TAIL"
+ >>> print(etree.tostring(root))
+ <test>TEXT</test>TAIL
+
+ >>> root.tail = None
+ >>> print(etree.tostring(root))
+ <test>TEXT</test>
+
+Just imagine a Python list where you append an item and it doesn't
+show up when you look at the list.
+
+The ``.tail`` property is a huge simplification for the tree model as
+it keeps text nodes from appearing in the list of children and makes
+access to them quick and simple. So this is a benefit in most
+applications and simplifies many, many XML tree algorithms.
+
+However, in document-like XML (and especially HTML), the above result can
+surprise new users and sometimes requires a bit of extra handling. A good
+way to deal with this is to use helper functions that copy the Element without
+its tail. The ``lxml.html`` package also deals with this in a couple of
+places, as most HTML algorithms benefit from a tail-free behaviour.
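+
+If you only need the serialised Element without its tail text, note that
+``etree.tostring()`` accepts a ``with_tail`` keyword argument for exactly
+this purpose:
+
+.. sourcecode:: pycon
+
+ >>> root = etree.XML("<root><tag>text<child/></tag>tail</root>")
+ >>> print(etree.tostring(root[0], with_tail=False))
+ <tag>text<child/></tag>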
+
+
+How can I find out if an Element is a comment or PI?
+----------------------------------------------------
+
+.. sourcecode:: pycon
+
+ >>> root = etree.XML("<?my PI?><root><!-- empty --></root>")
+
+ >>> root.tag
+ 'root'
+ >>> root.getprevious().tag is etree.PI
+ True
+ >>> root[0].tag is etree.Comment
+ True
+
+
+How can I map an XML tree into a dict of dicts?
+-----------------------------------------------
+
+I'm glad you asked.
+
+.. sourcecode:: python
+
+ def recursive_dict(element):
+ return element.tag, \
+ dict(map(recursive_dict, element)) or element.text
+
+Note that this beautiful quick-and-dirty converter expects children
+to have unique tag names and will silently overwrite any data that
+was contained in preceding siblings with the same name. For any
+real-world application of xml-to-dict conversion, you are better off
+writing your own, longer version of this.
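+
+For a quick impression of what it returns (the XML snippet is made up
+for this illustration):
+
+.. sourcecode:: pycon
+
+ >>> recursive_dict(etree.XML('<root><a><b>1</b><c>2</c></a></root>'))
+ ('root', {'a': {'b': '1', 'c': '2'}})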
+
+
+Why does lxml sometimes return 'str' values for text in Python 2?
+-----------------------------------------------------------------
+
+In Python 2, lxml's API returns byte strings for plain ASCII text
+values, be it for tag names or text in Element content. This is the
+same behaviour as known from ElementTree. The reasoning is that ASCII
+encoded byte strings are compatible with Unicode strings in Python 2,
+but consume less memory (usually by a factor of 2 or 4) and are faster
+to create because they do not require decoding. Plain ASCII string
+values are very common in XML, so this optimisation is generally worth
+it.
+
+In Python 3, lxml always returns Unicode strings for text and names,
+as does ElementTree. Since Python 3.3, Unicode strings containing
+only characters that can be encoded in ASCII or Latin-1 are generally
+as efficient as byte strings. In older versions of Python 3, the
+above-mentioned drawbacks apply.
+
+
+Why do I get XInclude or DTD lookup failures on some systems but not on others?
+-------------------------------------------------------------------------------
+
+To avoid network access, external resources are first looked up in
+`XML catalogues <https://www.oasis-open.org/committees/entity/spec.html>`_.
+Many systems have them installed by default, but some don't.
+On Linux systems, the default place to look is the index file
+``/etc/xml/catalog``, which most importantly provides a mapping from
+doctype IDs to locally installed DTD files.
+
+See the `libxml2 catalogue documentation <http://xmlsoft.org/catalog.html>`_
+for further information.
+
+
+How do namespaces work in lxml?
+-------------------------------
+
+The same as in ElementTree. See the `tutorial <tutorial.html#namespaces>`_.
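+
+In short, fully qualified tag names use the ``{namespace}localname``
+notation, both when creating Elements and when comparing tags (the
+namespace URI below is just an example):
+
+.. sourcecode:: pycon
+
+ >>> root = etree.Element('{http://example.com/ns}doc')
+ >>> root.tag
+ '{http://example.com/ns}doc'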
+
+
+Installation
+============
+
+Which version of libxml2 and libxslt should I use or require?
+-------------------------------------------------------------
+
+It really depends on your application, but the rule of thumb is: more recent
+versions contain fewer bugs and provide more features.
+
+* Do not use libxml2 2.6.27 if you want to use XPath (including XSLT). You
+ will get crashes when XPath errors occur during the evaluation (e.g. for
+ unknown functions). This happens inside the evaluation call to libxml2, so
+ there is nothing that lxml can do about it.
+
+* Try to use versions of both libraries that were released together. At least
+ the libxml2 version should not be older than the libxslt version.
+
+* If you use XML Schema or Schematron, which are still under development, the
+ most recent version of libxml2 is usually a good bet.
+
+* The same applies to XPath, where a substantial number of bugs and memory
+ leaks were fixed over time. If you encounter crashes or memory leaks in
+ XPath applications, try a more recent version of libxml2.
+
+* For parsing and fixing broken HTML, lxml requires at least libxml2 2.6.21.
+
+* For the normal tree handling, however, any libxml2 version starting with
+ 2.6.20 should do.
+
+Read the `release notes of libxml2`_ and the `release notes of libxslt`_ to
+see when (or if) a specific bug has been fixed.
+
+.. _`release notes of libxml2`: http://xmlsoft.org/news.html
+.. _`release notes of libxslt`: http://xmlsoft.org/XSLT/news.html
+
+
+Where are the binary builds?
+----------------------------
+
+Thanks to the help of Joar Wandborg, we try to make "manylinux_" binary
+builds for Linux available shortly after each source release, as they
+are very frequently used by continuous integration and/or build servers.
+
+Thanks to the help of Maximilian Hils and the Appveyor build service,
+we also try to serve the frequent requests for binary builds available
+for Microsoft Windows in a timely fashion, since users of that platform
+usually fail to build lxml themselves. Two of the major design issues
+of this operating system make this non-trivial for its users: the lack
+of a pre-installed standard compiler and the absence of a package manager.
+
+Besides that, Christoph Gohlke generously provides `unofficial lxml binary
+builds for Windows <http://www.lfd.uci.edu/~gohlke/pythonlibs/#lxml>`_
+that are usually very up to date. Consider using them if you prefer a
+binary build over a signed official source release.
+
+.. _manylinux: https://www.python.org/dev/peps/pep-0513
+
+
+Why do I get errors about missing UCS4 symbols when installing lxml?
+--------------------------------------------------------------------
+
+You are using a Python installation that was configured for a different
+internal Unicode representation than the lxml package you are trying to
+install. CPython versions before 3.3 allowed switching between two types
+at build time: the 32 bit encoding UCS4 and the 16 bit encoding UCS2.
+Sadly, the two are not compatible, so eggs and other binary distributions
+can only support the one they were compiled with.
+
+This means that you have to compile lxml from sources for your system. Note
+that you do not need Cython for this; the lxml source distribution is directly
+compilable on both platform types. See the `build instructions`_ on how to do
+this.
+
+
+My C compiler crashes on installation
+-------------------------------------
+
+lxml consists of a relatively large amount of (Cython) generated C code
+in a single source module. Compiling this module requires a lot of free
+memory, usually more than half a GB, which can pose problems especially on
+shared/cloud build systems.
+
+If your C compiler crashes while building lxml from sources, consider
+using one of the binary wheels that we provide. The "manylinux_" binaries
+should generally work well on most build systems and install substantially
+faster than a source build.
+
+
+Contributing
+============
+
+Why is lxml not written in Python?
+----------------------------------
+
+It *almost* is.
+
+lxml is not written in plain Python, because it interfaces with two C
+libraries: libxml2 and libxslt. Accessing them at the C-level is
+required for performance reasons.
+
+However, to avoid writing plain C-code and caring too much about the
+details of built-in types and reference counting, lxml is written in
+Cython_, a superset of the Python language that translates to C-code.
+Chances are that if you know Python, you can write `code that Cython
+accepts`_. Again, the C-ish style used in the lxml code is just for
+performance optimisations. If you want to contribute, don't bother
+with the details; a Python implementation of your contribution is
+better than none. And keep in mind that lxml's flexible API often
+favours an implementation of features in pure Python, without
+bothering with C-code at all. For example, the ``lxml.html`` package
+is written entirely in Python.
+
+Please contact the `mailing list`_ if you need any help.
+
+.. _Cython: http://cython.org/
+.. _`code that Cython accepts`: http://docs.cython.org/docs/tutorial.html
+
+
+How can I contribute?
+---------------------
+
+If you find something that you would like lxml to do (or do better),
+then please tell us about it on the `mailing list`_. Pull requests
+on github are always appreciated, especially when accompanied by unit
+tests and documentation (doctests would be great). See the ``tests``
+subdirectories in the lxml source tree (below the ``src`` directory)
+and the ReST_ `text files`_ in the ``doc`` directory.
+
+We also have a `list of missing features`_ that we would like to
+implement but didn't due to lack of time. If *you* find the time,
+patches are very welcome.
+
+.. _ReST: http://docutils.sourceforge.net/rst.html
+.. _`text files`: https://github.com/lxml/lxml/tree/master/doc
+.. _`list of missing features`: https://github.com/lxml/lxml/blob/master/IDEAS.txt
+
+Besides enhancing the code, there are a lot of places where you can help the
+project and its user base. You can
+
+* spread the word and write about lxml. Many users (especially new Python
+ users) have not yet heard about lxml, although our user base is constantly
+ growing. If you write your own blog and feel like saying something about
+ lxml, go ahead and do so. If we think your contribution or criticism is
+ valuable to other users, we may even put a link or a quote on the project
+ page.
+
+* provide code examples for the general usage of lxml or specific problems
+ solved with lxml. Readable code is a very good way of showing how a library
+ can be used and what great things you can do with it. Again, if we hear
+ about it, we can set a link on the project page.
+
+* work on the documentation. The web page is generated from a set of ReST_
+ `text files`_. It is meant both as a representative project page for lxml
+ and as a site for documenting lxml's API and usage. If you have questions
+ or an idea how to make it more readable and accessible while you are reading
+ it, please send a comment to the `mailing list`_.
+
+* enhance the web site. We put some work into making the web site
+ usable, understandable and also easy to find, but there are always
+ things that can be done better. You may notice that we are not
+ top-ranked when searching the web for "Python and XML", so maybe you
+ have an idea how to improve that.
+
+* help with the tutorial. A tutorial is the most important starting point for
+ new users, so it is important for us to provide an easy to understand guide
+ into lxml. Like all documentation, the tutorial is a work in progress, so we
+ appreciate every helping hand.
+
+* improve the docstrings. lxml uses docstrings to support Python's integrated
+ online ``help()`` function. However, sometimes these are not sufficient to
+ grasp the details of the function in question. If you find such a place,
+ you can try to write up a better description and send it to the `mailing
+ list`_.
+
+
+Bugs
+====
+
+My application crashes!
+-----------------------
+
+One of the goals of lxml is "no segfaults", so if there is no clear
+warning in the documentation that you were doing something potentially
+harmful, you have found a bug and we would like to hear about it.
+Please report this bug to the `mailing list`_. See the section on bug
+reporting to learn how to do that.
+
+If your application (or e.g. your web container) uses threads, please
+see the FAQ section on threading_ to check if you touch on one of the
+potential pitfalls.
+
+In any case, try to reproduce the problem with the latest versions of
+libxml2 and libxslt. From time to time, bugs and race conditions are found
+in these libraries, so a more recent version might already contain a fix for
+your problem.
+
+Remember: even if you see lxml appear in a crash stack trace, it is
+not necessarily lxml that *caused* the crash.
+
+
+My application crashes on MacOS-X!
+----------------------------------
+
+This was a common problem up to lxml 2.1.x. Since lxml 2.2, the only
+officially supported way to use it on this platform is through a
+static build against freshly downloaded versions of libxml2 and
+libxslt. See the build instructions for `MacOS-X`_.
+
+
+I think I have found a bug in lxml. What should I do?
+-----------------------------------------------------
+
+First, you should look at the `current developer changelog`_ to see if this
+is a known problem that has already been fixed in the master branch since the
+release you are using.
+
+.. _`current developer changelog`: https://github.com/lxml/lxml/blob/master/CHANGES.txt
+
+Also, the 'crash' section above has some good advice on what to try to see if
+the problem is really in lxml - and not in your setup. Believe it or not,
+that happens more often than you might think, especially when old libraries
+or even multiple library versions are installed.
+
+You should always try to reproduce the problem with the latest
+versions of libxml2 and libxslt - and make sure they are used.
+``lxml.etree`` can tell you what it runs with:
+
+.. sourcecode:: python
+
+ import sys
+ from lxml import etree
+
+ print("%-20s: %s" % ('Python', sys.version_info))
+ print("%-20s: %s" % ('lxml.etree', etree.LXML_VERSION))
+ print("%-20s: %s" % ('libxml used', etree.LIBXML_VERSION))
+ print("%-20s: %s" % ('libxml compiled', etree.LIBXML_COMPILED_VERSION))
+ print("%-20s: %s" % ('libxslt used', etree.LIBXSLT_VERSION))
+ print("%-20s: %s" % ('libxslt compiled', etree.LIBXSLT_COMPILED_VERSION))
+
+If you can figure out that the problem is not in lxml but in the
+underlying libxml2 or libxslt, you can ask directly on the respective
+mailing lists, which may considerably reduce the time to find a fix or
+work-around. See the next question for some hints on how to do that.
+
+Otherwise, we would really like to hear about it. Please report it to
+the `bug tracker`_ or to the `mailing list`_ so that we can fix it.
+It is very helpful in this case if you can come up with a short code
+snippet that demonstrates your problem. If others can reproduce and
+see the problem, it is much easier for them to fix it - and maybe even
+easier for you to describe it and get people convinced that it really
+is a problem to fix.
+
+It is important that you always report the version of lxml, libxml2
+and libxslt that you get from the code snippet above. If we do not
+know the library versions you are using, we will ask back, so it will
+take longer for you to get a helpful answer.
+
+Since as a user of lxml you are likely a programmer, you might find
+`this article on bug reports`_ an interesting read.
+
+.. _`bug tracker`: https://bugs.launchpad.net/lxml/
+.. _`mailing list`: http://lxml.de/mailinglist/
+.. _`this article on bug reports`: http://www.chiark.greenend.org.uk/~sgtatham/bugs.html
+
+
+How do I know a bug is really in lxml and not in libxml2?
+---------------------------------------------------------
+
+A large part of lxml's functionality is implemented by libxml2 and
+libxslt, so problems that you encounter may be in one or the other.
+Knowing the right place to ask will reduce the time it takes to fix
+the problem, or to find a work-around.
+
+Both libxml2 and libxslt come with their own command line frontends,
+namely ``xmllint`` and ``xsltproc``. If you encounter problems with
+XSLT processing for specific stylesheets or with validation for
+specific schemas, try to run the XSLT with ``xsltproc`` or the
+validation with ``xmllint`` respectively to find out if it fails there
+as well. If it does, please report directly to the mailing lists of
+the respective project, namely:
+
+* `libxml2 mailing list <http://mail.gnome.org/mailman/listinfo/xml>`_
+* `libxslt mailing list <http://mail.gnome.org/mailman/listinfo/xslt>`_
+
+On the other hand, everything that seems to be related to Python code,
+including custom resolvers, custom XPath functions, etc. is likely
+outside of the scope of libxml2/libxslt. If you encounter problems
+here or you are not sure where there the problem may come from, please
+ask on the lxml mailing list first.
+
+In any case, a good explanation of the problem including some simple
+test code and some input data will help us (or the libxml2 developers)
+see and understand the problem, which largely increases your chance of
+getting help. See the question above for a few hints on what is
+helpful here.
+
+
+Threading
+=========
+
+Can I use threads to concurrently access the lxml API?
+------------------------------------------------------
+
+Short answer: yes, if you use lxml 2.2 or later.
+
+Since version 1.1, lxml frees the GIL (Python's global interpreter
+lock) internally when parsing from disk and memory, as long as you use
+either the default parser (which is replicated for each thread) or
+create a parser for each thread yourself. lxml also allows
+concurrency during validation (RelaxNG and XMLSchema) and XSL
+transformation. You can share RelaxNG, XMLSchema and XSLT objects
+between threads.
+
+While you can also share parsers between threads, this will serialize
+the access to each of them, so it is better to ``.copy()`` parsers or
+to just use the default parser if you do not need any special
+configuration. The same applies to the XPath evaluators, which use an
+internal lock to protect their prepared evaluation contexts. It is
+therefore best to use separate evaluator instances in threads.
+
+Warning: Before lxml 2.2, and especially before 2.1, there were
+various issues when moving subtrees between different threads, or when
+applying XSLT objects from one thread to trees parsed or modified in
+another. If you need code to run with older versions, you should
+generally avoid modifying trees in threads other than the one they were
+generated in. Although this should work in many cases, there are
+certain scenarios where the termination of a thread that parsed a tree
+can crash the application if subtrees of this tree were moved to other
+documents. You should be on the safe side when passing trees between
+threads if you either
+
+- do not modify these trees and do not move their elements to other
+ trees, or
+
+- do not terminate threads while the trees they parsed are still in
+ use (e.g. by using a fixed size thread-pool or long-running threads
+ in processing chains)
+
+Since lxml 2.2, even multi-thread pipelines are supported. However,
+note that it is more efficient to do all tree work inside one thread,
+than to let multiple threads work on a tree one after the other. This
+is because trees inherit state from the thread that created them,
+which must be maintained when the tree is modified inside another
+thread.
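+
+If you do create a parser per thread, as mentioned above, a simple
+pattern is to keep it in thread-local storage (just a sketch; the helper
+names are made up):
+
+.. sourcecode:: python
+
+ import threading
+ from lxml import etree
+
+ _thread_local = threading.local()
+
+ def get_thread_parser():
+     # sketch: create and cache one XMLParser per thread
+     if not hasattr(_thread_local, 'parser'):
+         _thread_local.parser = etree.XMLParser()
+     return _thread_local.parser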
+
+
+Does my program run faster if I use threads?
+--------------------------------------------
+
+Depends. The best way to answer this is timing and profiling.
+
+The global interpreter lock (GIL) in Python serializes access to the
+interpreter, so if the majority of your processing is done in Python
+code (walking trees, modifying elements, etc.), your gain will be
+close to zero. The more of your XML processing moves into lxml,
+however, the higher your gain. If your application is bound by XML
+parsing and serialisation, or by very selective XPath expressions and
+complex XSLTs, your speedup on multi-processor machines can be
+substantial.
+
+See the question above to learn which operations free the GIL to support
+multi-threading.
+
+
+Would my single-threaded program run faster if I turned off threading?
+----------------------------------------------------------------------
+
+Possibly, yes. You can see for yourself by compiling lxml entirely
+without threading support. Pass the ``--without-threading`` option to
+setup.py when building lxml from source. You can also build libxml2
+without pthread support (``--without-pthreads`` option), which may add
+another bit of performance. Note that this will leave internal data
+structures entirely without thread protection, so make sure you really
+do not use lxml outside of the main application thread in this case.
+
+
+Why can't I reuse XSLT stylesheets in other threads?
+----------------------------------------------------
+
+Since later lxml 2.0 versions, you can do this. There is some
+overhead involved as the result document needs an additional cleanup
+traversal when the input document and/or the stylesheet were created
+in other threads. However, on a multi-processor machine, the gain of
+freeing the GIL easily covers this drawback.
+
+If you need even the last bit of performance, consider keeping (a copy
+of) the stylesheet in thread-local storage, and try creating the input
+document(s) in the same thread. And do not forget to benchmark your
+code to see if the increased code complexity is really worth it.
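+
+A minimal sketch of that pattern, assuming the stylesheet lives in a file
+called ``transform.xslt`` (a placeholder name):
+
+.. sourcecode:: python
+
+    import threading
+    from lxml import etree
+
+    _local = threading.local()
+
+    def transform(doc):
+        # keep one compiled stylesheet per thread to avoid cross-thread overhead
+        if not hasattr(_local, 'xslt'):
+            _local.xslt = etree.XSLT(etree.parse('transform.xslt'))
+        return _local.xslt(doc)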
+
+
+My program crashes when run with mod_python/Pyro/Zope/Plone/...
+---------------------------------------------------------------
+
+These environments can use threads in a way that may not make it obvious when
+threads are created and what happens in which thread. This makes it hard to
+ensure lxml's threading support is used in a reliable way. Sadly, if problems
+arise, they are as diverse as the applications, so it is difficult to provide
+any generally applicable solution. Also, these environments are so complex
+that problems become hard to debug and even harder to reproduce in a
+predictable way. If you encounter crashes in one of these systems, but your
+code runs perfectly when started by hand, the following gives you a few hints
+for possible approaches to solve your specific problem:
+
+* make sure you use recent versions of libxml2, libxslt and lxml. The
+ libxml2 developers keep fixing bugs in each release, and lxml also
+ tries to become more robust against possible pitfalls. So newer
+ versions might already fix your problem in a reliable way. Version
+ 2.2 of lxml contains many improvements.
+
+* make sure the library versions you installed are really used. Do
+ not rely on what your operating system tells you! Print the version
+ constants in ``lxml.etree`` from within your runtime environment to
+ make sure it is the case. This is especially a problem under
+ MacOS-X when newer library versions were installed in addition to
+ the outdated system libraries. Please read the bugs section
+ regarding MacOS-X in this FAQ.
+
+* if you use ``mod_python``, try setting this option:
+
+ PythonInterpreter main_interpreter
+
+ There was a discussion on the mailing list about this problem:
+
+ http://comments.gmane.org/gmane.comp.python.lxml.devel/2942
+
+* in a threaded environment, try to initially import ``lxml.etree``
+ from the main application thread instead of doing first-time imports
+ separately in each spawned worker thread. If you cannot control the
+ thread spawning of your web/application server, an import of
+ ``lxml.etree`` in sitecustomize.py or usercustomize.py may still do
+ the trick.
+
+* compile lxml without threading support by running ``setup.py`` with the
+ ``--without-threading`` option. While this might be slower in certain
+ scenarios on multi-processor systems, it *might* also keep your application
+ from crashing, which should be worth more to you than peak performance.
+ Remember that lxml is fast anyway, so concurrency may not even be worth it.
+
+* look out for fancy XSLT stuff like foreign document access or
+ passing in subtrees through XSLT variables. This might or might not
+ work, depending on your specific usage. Again, later versions of
+ lxml and libxslt provide safer support here.
+
+* try copying trees at suspicious places in your code and working with
+ those instead of a tree shared between threads. Note that the
+ copying must happen inside the target thread to be effective, not in
+ the thread that created the tree. Serialising in one thread and
+ parsing in another is also a simple (and fast) way of separating
+ thread contexts.
+
+* try keeping thread-local copies of XSLT stylesheets, i.e. one per thread,
+ instead of sharing one. Also see the question above.
+
+* you can try to serialise suspicious parts of your code with explicit thread
+ locks, thus disabling the concurrency of the runtime system.
+
+* report back on the mailing list to see if there are other ways to work
+ around your specific problems. Do not forget to report the version numbers
+ of lxml, libxml2 and libxslt you are using (see the question on reporting
+ a bug).
+
+Note that most of these options will degrade performance and/or your
+code quality. If you are unsure what to do, please ask on the mailing
+list.
+
+
+Parsing and Serialisation
+=========================
+
+..
+ making doctest happy:
+
+ >>> try: from StringIO import StringIO
+ ... except ImportError: from io import StringIO # Py3
+ >>> filename = StringIO("<root/>")
+
+
+Why doesn't the ``pretty_print`` option reformat my XML output?
+---------------------------------------------------------------
+
+Pretty printing (or formatting) an XML document means adding white space to
+the content. These modifications are harmless if they only impact elements in
+the document that do not carry (text) data. They corrupt your data if they
+impact elements that contain data. If lxml cannot distinguish between
+whitespace and data, it will not alter your data. Whitespace is therefore
+only added between nodes that do not contain data. This is always the case
+for trees constructed element-by-element, so no problems should be expected
+here. For parsed trees, a good way to assure that no conflicting whitespace
+is left in the tree is the ``remove_blank_text`` option:
+
+.. sourcecode:: pycon
+
+ >>> parser = etree.XMLParser(remove_blank_text=True)
+ >>> tree = etree.parse(filename, parser)
+
+This will allow the parser to drop blank text nodes when constructing the
+tree. If you now call a serialization function to pretty print this tree,
+lxml can add fresh whitespace to the XML tree to indent it.
+
+Note that the ``remove_blank_text`` option also uses a heuristic if it
+has no definite knowledge about the document's ignorable whitespace.
+It will keep blank text nodes that appear after non-blank text nodes
+at the same level. This is to prevent document-style XML from losing
+content.
+
+The HTMLParser has this structural knowledge built-in, which means that
+most whitespace that appears between tags in HTML documents will *not*
+be removed by this option, except in places where it is truly ignorable,
+e.g. in the page header, between table structure tags, etc. Therefore,
+it is also safe to use this option with the HTMLParser, as it will keep
+content like the following intact (i.e. it will not remove the space
+that separates the two words):
+
+.. sourcecode:: html
+
+ <p><b>some</b> <em>text</em></p>
+
+If you want to be sure all blank text is removed from an XML document
+(or just more blank text than the parser does by itself), you have to
+use either a DTD to tell the parser which whitespace it can safely
+ignore, or remove the ignorable whitespace manually after parsing,
+e.g. by setting all tail text to None:
+
+.. sourcecode:: python
+
+ for element in root.iter():
+ element.tail = None
+
+Fredrik Lundh also has a Python-level function for indenting XML by
+appending whitespace to tags. It can be found on his `element library
+recipes page <http://effbot.org/zone/element-lib.htm#prettyprint>`_.
+
+
+Why can't lxml parse my XML from unicode strings?
+-------------------------------------------------
+
+First of all, XML is explicitly defined as a stream of bytes. It's not
+Unicode text. Take a look at the `XML specification`_, it's all about byte
+sequences and how to map them to text and structure. That leads to rule
+number one: do not decode your XML data yourself. That's a part of the
+work of an XML parser, and it does it very well. Just pass it your data as
+a plain byte stream, it will always do the right thing, by specification.
+
+This also includes not opening XML files in text mode. Make sure you always
+use binary mode, or, even better, pass the file path into lxml's ``parse()``
+function to let it do the file opening, reading and closing itself. This
+is the simplest and most efficient way to do it.
+
+That being said, lxml can read Python unicode strings and even tries to
+support them if libxml2 does not. This is because there is one valid use
+case for parsing XML from text strings: literal XML fragments in source
+code.
+
+However, if the unicode string declares an XML encoding internally
+(``<?xml encoding="..."?>``), parsing is bound to fail, as this encoding is
+almost certainly not the real encoding used in Python unicode. The same is
+true for HTML unicode strings that contain charset meta tags, although the
+problems may be more subtle here. The libxml2 HTML parser may not be able
+to parse the meta tags in broken HTML and may end up ignoring them, so even
+if parsing succeeds, later handling may still fail with character encoding
+errors. Therefore, parsing HTML from unicode strings is a much saner thing
+to do than parsing XML from unicode strings.
+
+Note that Python uses different encodings for unicode on different platforms,
+so even specifying the real internal unicode encoding is not portable between
+Python interpreters. Don't do it.
+
+Python unicode strings with XML data that carry encoding information are
+broken. lxml will not parse them. You must provide parsable data in a
+valid encoding.
+
+.. _`XML specification`: http://www.w3.org/TR/REC-xml/
+
+
+Can lxml parse from file objects opened in unicode/text mode?
+-------------------------------------------------------------
+
+Technically, yes. However, you likely do not want to do that, because
+it is extremely inefficient. The text encoding that libxml2 uses
+internally is UTF-8, so parsing from a Unicode file means that Python
+first reads a chunk of data from the file, then decodes it into a new
+buffer, and then copies it into a new unicode string object, just to
+let libxml2 make yet another copy while encoding it down into UTF-8
+in order to parse it. It's clear that this involves a lot more
+recoding and copying than when parsing straight from the bytes that
+the file contains.
+
+If you really know the encoding better than the parser (e.g. when
+parsing HTML that lacks a content declaration), then instead of passing
+an encoding parameter into the file object when opening it, create a
+new instance of an XMLParser or HTMLParser and pass the encoding into
+its constructor. Afterwards, use that parser for parsing, e.g. by
+passing it into the ``etree.parse(file, parser)`` function. Remember
+to open the file in binary mode (mode="rb"), or, if possible, prefer
+passing the file path directly into ``parse()`` instead of an opened
+Python file object.
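+
+As a sketch, assuming you know that a page is really Latin-1 even though
+it lacks a content declaration (``page.html`` is a placeholder path):
+
+.. sourcecode:: python
+
+    from lxml import etree
+
+    parser = etree.HTMLParser(encoding='iso-8859-1')
+    tree = etree.parse('page.html', parser)  # lxml opens the file in binary mode itself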
+
+
+What is the difference between str(xslt(doc)) and xslt(doc).write() ?
+---------------------------------------------------------------------
+
+The str() implementation of the XSLTResultTree class (a subclass of the
+ElementTree class) knows about the output method chosen in the stylesheet
+(xsl:output), write() doesn't. If you call write(), the result will be a
+normal XML tree serialization in the requested encoding. Calling this method
+may also fail for XSLT results that are not XML trees (e.g. string results).
+
+If you call str(), it will return the serialized result as specified by the
+XSL transform. This correctly serializes string results to encoded Python
+strings and honours ``xsl:output`` options like ``indent``. This almost
+certainly does what you want, so you should only use ``write()`` if you are
+sure that the XSLT result is an XML tree and you want to override the encoding
+and indentation options requested by the stylesheet.
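+
+A short sketch of the difference, assuming ``xslt`` is an ``etree.XSLT``
+instance and ``doc`` a parsed input tree (both placeholders):
+
+.. sourcecode:: python
+
+    result = xslt(doc)
+
+    # honours the xsl:output settings of the stylesheet (method, encoding, indent)
+    output = str(result)
+
+    # plain XML serialisation with explicitly chosen options, ignoring xsl:output
+    with open('out.xml', 'wb') as f:  # 'out.xml' is a placeholder
+        result.write(f, encoding='utf-8')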
+
+
+Why can't I just delete parents or clear the root node in iterparse()?
+----------------------------------------------------------------------
+
+The ``iterparse()`` implementation is based on the libxml2 parser. It
+requires the tree to be intact to finish parsing. If you delete or modify
+parents of the current node, chances are you modify the structure in a way
+that breaks the parser. Normally, this will result in a segfault. Please
+refer to the `iterparse section`_ of the lxml API documentation to find out
+what you can do and what you can't do.
+
+.. _`iterparse section`: parsing.html#iterparse-and-iterwalk
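+
+As an illustration of what *is* safe, the following sketch clears each
+element only after it has been processed, without touching its ancestors
+(``data.xml``, the ``record`` tag and ``handle()`` are placeholders):
+
+.. sourcecode:: python
+
+    from lxml import etree
+
+    for event, element in etree.iterparse('data.xml', tag='record'):
+        handle(element)   # hypothetical per-record processing
+        element.clear()   # discard the content below this node, keep the tree intact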
+
+
+How do I output null characters in XML text?
+--------------------------------------------
+
+Don't. What you would produce is not well-formed XML. XML parsers
+will refuse to parse a document that contains null characters. The
+right way to embed binary data in XML is using a text encoding such as
+uuencode or base64.
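+
+For example, a small sketch that stores arbitrary bytes (including null
+characters) as base64 text:
+
+.. sourcecode:: python
+
+    import base64
+    from lxml import etree
+
+    data = b'\x00\x01\x02binary payload'
+    el = etree.Element('payload')
+    el.text = base64.b64encode(data).decode('ascii')
+    # consumers decode it again with base64.b64decode(el.text)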
+
+
+Is lxml vulnerable to XML bombs?
+--------------------------------
+
+This has nothing to do with lxml itself, only with the parser of
+libxml2. Since libxml2 version 2.7, the parser imposes hard security
+limits on input documents to prevent DoS attacks with forged input
+data. Since lxml 2.2.1, you can disable these limits with the
+``huge_tree`` parser option if you need to parse *really* large,
+trusted documents. All lxml versions will leave these restrictions
+enabled by default.
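+
+A minimal sketch, to be used only for input that you fully trust
+(``trusted_dump.xml`` is a placeholder):
+
+.. sourcecode:: python
+
+    from lxml import etree
+
+    parser = etree.XMLParser(huge_tree=True)  # lifts the libxml2 security limits
+    tree = etree.parse('trusted_dump.xml', parser)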
+
+Note that libxml2 versions of the 2.6 series do not restrict their
+parser and are therefore vulnerable to DoS attacks.
+
+Note also that these "hard limits" may still be high enough to
+allow for excessive resource usage in a given use case. They are
+compile time modifiable, so building your own library versions will
+allow you to change the limits to your own needs. Also see the next
+question.
+
+
+How do I use lxml safely as a web-service endpoint?
+---------------------------------------------------
+
+XML based web-service endpoints are generally subject to several
+types of attacks if they allow some kind of untrusted input.
+From the point of view of the underlying XML tool, the most
+obvious attacks try to send a relatively small amount of data
+that induces a comparatively large resource consumption on the
+receiver side.
+
+First of all, make sure network access is not enabled for the XML
+parser that you use for parsing untrusted content and that it is
+not configured to load external DTDs. Otherwise, attackers can
+try to trick the parser into an attempt to load external resources
+that are overly slow or impossible to retrieve, thus wasting time
+and other valuable resources on your server such as socket
+connections. Note that you can register your own document loader
+in lxml, which allows for fine-grained control over any read access
+to resources.
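+
+As a sketch, a parser configuration along these lines keeps the parser
+away from the network and from external DTDs (adapt the options to your
+needs):
+
+.. sourcecode:: python
+
+    from lxml import etree
+
+    parser = etree.XMLParser(
+        no_network=True,   # forbid network access during parsing
+        load_dtd=False,    # do not load external DTDs
+    )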
+
+Some of the most famous excessive content expansion attacks
+use XML entity references. Luckily, entity expansion is mostly
+useless for the data commonly sent through web services and
+can simply be disabled, which rules out several types of
+denial of service attacks at once. This also rules out an attack
+that reads local files from the server, as XML entities can be
+defined to expand into the content of local files. Consequently, version
+1.2 of the SOAP standard explicitly disallows entity references
+in the XML stream.
+
+To disable entity expansion, use an XML parser that is configured
+with the option ``resolve_entities=False``. Then, after (or
+while) parsing the document, use ``root.iter(etree.Entity)`` to
+recursively search for entity references. If it contains any,
+reject the entire input document with a suitable error response.
+In lxml 3.x, you can also use the new DTD introspection API to
+apply your own restrictions on input documents.
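+
+A sketch of this check (``untrusted_bytes`` is a placeholder for the
+received payload):
+
+.. sourcecode:: python
+
+    from lxml import etree
+
+    parser = etree.XMLParser(resolve_entities=False)
+    root = etree.fromstring(untrusted_bytes, parser)
+
+    # reject the document if it contains any entity references at all
+    if any(True for _ in root.iter(etree.Entity)):
+        raise ValueError('entity references are not allowed')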
+
+Another attack to consider is compression bombs. If you allow
+compressed input into your web service, attackers can try to send
+well-forged, highly repetitive and thus very well compressing input
+that unpacks into a very large XML document in your server's main
+memory, potentially a thousand times larger than the compressed
+input data.
+
+As a counter measure, either disable compressed input for your
+web server, at least for untrusted sources, or use incremental
+parsing with ``iterparse()`` instead of parsing the whole input
+document into memory in one shot. That allows you to enforce
+suitable limits on the input by applying semantic checks that
+detect and prevent an illegitimate use of your service. If
+possible, you can also use this to reduce the amount of data
+that you need to keep in memory while parsing the document,
+thus further reducing the possibility of an attacker to trick
+your system into excessive resource usage.
+
+Finally, please be aware that XPath suffers from the same
+vulnerability as SQL when it comes to content injection. The
+obvious fix is to not build any XPath expressions via string
+formatting or concatenation when the parameters may come from
+untrusted sources, and instead use XPath variables, which
+safely expose their values to the evaluation engine.
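+
+A short sketch of the safe pattern (``root``, ``user_input`` and the
+expression itself are made up for the example):
+
+.. sourcecode:: python
+
+    # unsafe: string formatting opens the door to XPath injection
+    #   root.xpath("//user[@name='%s']" % user_input)
+
+    # safe: pass the untrusted value in as an XPath variable
+    matches = root.xpath('//user[@name = $name]', name=user_input)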
+
+The defusedxml_ package comes with an example setup and a wrapper
+API for lxml that applies certain counter measures internally.
+
+.. _defusedxml: https://bitbucket.org/tiran/defusedxml
+
+
+How can I sort the attributes?
+------------------------------
+
+lxml preserves the order in which attributes were originally created.
+There is one case in which this is difficult: when attributes are passed
+in a dict or as keyword arguments to the `Element()` factory. Before Python
+3.6, dicts had no predictable order.
+Since Python 3.6, however, dicts also preserve the creation order of their keys,
+and lxml makes use of that since release 4.4.
+In earlier versions, lxml tries to assure at least reproducible output by
+sorting the attributes from the dict before creating them. All sequential
+ways to set attributes keep their order and do not apply sorting. Also,
+OrderedDict instances are recognised and not sorted.
+
+In cases where you cannot control the order in which attributes are created,
+you can still change it before serialisation. To sort them by name, for example,
+you can apply the following function:
+
+.. sourcecode:: python
+
+ def sort_attributes(root):
+ for el in root.iter():
+ attrib = el.attrib
+ if len(attrib) > 1:
+ attributes = sorted(attrib.items())
+ attrib.clear()
+ attrib.update(attributes)
+
+
+XPath and Document Traversal
+============================
+
+What are the ``findall()`` and ``xpath()`` methods on Element(Tree)?
+--------------------------------------------------------------------
+
+``findall()`` is part of the original `ElementTree API`_. It supports a
+`simple subset of the XPath language`_, without predicates, conditions and
+other advanced features. It is very handy for finding specific tags in a
+tree. Another important difference is namespace handling, which uses the
+``{namespace}tagname`` notation. This is not supported by XPath. The
+findall, find and findtext methods are compatible with other ElementTree
+implementations and allow writing portable code that runs on ElementTree,
+cElementTree and lxml.etree.
+
+``xpath()``, on the other hand, supports the complete power of the XPath
+language, including predicates, XPath functions and Python extension
+functions. The syntax is defined by the `XPath specification`_. If you need
+the expressiveness and selectivity of XPath, the ``xpath()`` method, the
+``XPath`` class and the ``XPathEvaluator`` are the best choice_.
+
+.. _`simple subset of the XPath language`: http://effbot.org/zone/element-xpath.htm
+.. _`XPath specification`: http://www.w3.org/TR/xpath
+.. _choice: performance.html#xpath
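+
+The following sketch contrasts the two approaches (the namespace URI and
+element names are made up for the example):
+
+.. sourcecode:: python
+
+    ns = 'http://example.com/ns'
+
+    # ElementTree-style: Clark notation, no predicates
+    items = root.findall('.//{%s}item' % ns)
+
+    # XPath: prefix mapping and full predicate support
+    items = root.xpath('//e:item[@id > 10]', namespaces={'e': ns})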
+
+
+Why doesn't ``findall()`` support full XPath expressions?
+---------------------------------------------------------
+
+It was decided that it is more important to keep compatibility with
+ElementTree_ to simplify code migration between the libraries. The main
+difference compared to XPath is the ``{namespace}tagname`` notation used in
+``findall()``, which is not valid XPath.
+
+ElementTree and lxml.etree use the same implementation, which assures 100%
+compatibility. Note that ``findall()`` is `so fast`_ in lxml that a native
+implementation would not bring any performance benefits.
+
+.. _`so fast`: performance.html#tree-traversal
+
+
+How can I find out which namespace prefixes are used in a document?
+-------------------------------------------------------------------
+
+You can traverse the document (``root.iter()``) and collect the prefix
+attributes from all Elements into a set. However, it is unlikely that you
+really want to do that. You do not need these prefixes, honestly. You only
+need the namespace URIs. All namespace comparisons use these, so feel free to
+make up your own prefixes when you use XPath expressions or extension
+functions.
+
+The only place where you might consider specifying prefixes is the
+serialization of Elements that were created through the API. Here, you can
+specify a prefix mapping through the ``nsmap`` argument when creating the root
+Element. Its children will then inherit this prefix for serialization.
+
+
+How can I specify a default namespace for XPath expressions?
+------------------------------------------------------------
+
+You can't. In XPath, there is no such thing as a default namespace. Just use
+an arbitrary prefix and let the namespace dictionary of the XPath evaluators
+map it to your namespace. See also the question above.
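+
+For example (the namespace URI and the prefix ``d`` are made up):
+
+.. sourcecode:: python
+
+    titles = root.xpath('//d:section/d:title',
+                        namespaces={'d': 'http://example.com/default-ns'})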
+
+
+How can I modify the tree during iteration?
+-------------------------------------------
+
+lxml's iterators need to hold on to an element in the tree in order to remember
+their current position. Therefore, tree modifications between two calls into the
+iterator can lead to surprising results if such an element is deleted or moved
+around, for example.
+
+If your code risks modifying elements that the iterator might still need, and
+you know that the number of elements returned by the iterator is small, then just
+read them all into a list (or use ``.findall()``), and iterate over that list.
+
+If the number of elements can be larger and you really want to process the tree
+incrementally, you can often use a read-ahead generator to make the iterator
+advance beyond the critical point before touching the tree structure.
+
+For example:
+
+.. sourcecode:: python
+
+ from itertools import islice
+ from collections import deque
+
+ def readahead(iterator, count=1):
+ iterator = iter(iterator) # allow iterables as well
+ elements = deque(islice(iterator, 0, count))
+ for element in iterator:
+ elements.append(element)
+ yield elements.popleft()
+ yield from elements
+
+ for element in readahead(root.iterfind("path/to/children")):
+ element.getparent().remove(element)
diff --git a/doc/api.txt b/doc/api.txt
new file mode 100644
index 0000000..2a085d2
--- /dev/null
+++ b/doc/api.txt
@@ -0,0 +1,667 @@
+===========================
+APIs specific to lxml.etree
+===========================
+
+lxml.etree tries to follow established APIs wherever possible. Sometimes,
+however, the need to expose a feature in an easy way led to the invention of a
+new API. This page describes the major differences and a few additions to the
+main ElementTree API.
+
+For a complete reference of the API, see the `generated API
+documentation`_.
+
+Separate pages describe the support for `parsing XML`_, executing `XPath and
+XSLT`_, `validating XML`_ and interfacing with other XML tools through the
+`SAX-API`_.
+
+lxml is extremely extensible through `XPath functions in Python`_, custom
+`Python element classes`_, custom `URL resolvers`_ and even `at the C-level`_.
+
+.. _`parsing XML`: parsing.html
+.. _`XPath and XSLT`: xpathxslt.html
+.. _`validating XML`: validation.html
+.. _`SAX-API`: sax.html
+.. _`XPath functions in Python`: extensions.html
+.. _`Python element classes`: element_classes.html
+.. _`at the C-level`: capi.html
+.. _`URL resolvers`: resolvers.html
+.. _`generated API documentation`: api/index.html
+
+
+.. contents::
+..
+ 1 lxml.etree
+ 2 Other Element APIs
+ 3 Trees and Documents
+ 4 Iteration
+ 5 Error handling on exceptions
+ 6 Error logging
+ 7 Serialisation
+ 8 Incremental XML generation
+ 9 CDATA
+ 10 XInclude and ElementInclude
+
+..
+ >>> from io import BytesIO
+ >>> def StringIO(s=None):
+ ... if isinstance(s, str): s = s.encode("UTF-8")
+ ... return BytesIO(s)
+
+
+lxml.etree
+----------
+
+lxml.etree tries to follow the `ElementTree API`_ wherever it can. There are
+however some incompatibilities (see `compatibility`_). The extensions are
+documented here.
+
+.. _`ElementTree API`: http://effbot.org/zone/element-index.htm
+.. _`compatibility`: compatibility.html
+
+If you need to know which version of lxml is installed, you can access the
+``lxml.etree.LXML_VERSION`` attribute to retrieve a version tuple. Note,
+however, that it did not exist before version 1.0, so you will get an
+AttributeError in older versions. The versions of libxml2 and libxslt are
+available through the attributes ``LIBXML_VERSION`` and ``LIBXSLT_VERSION``.
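+
+A quick sketch (the printed tuples obviously depend on your installation):
+
+.. sourcecode:: python
+
+    from lxml import etree
+
+    print(etree.LXML_VERSION)     # e.g. (4, 4, 0, 0)
+    print(etree.LIBXML_VERSION)
+    print(etree.LIBXSLT_VERSION)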
+
+The following examples usually assume this to be executed first:
+
+.. sourcecode:: pycon
+
+ >>> from lxml import etree
+
+..
+ >>> import sys
+ >>> from lxml import etree as _etree
+ >>> if sys.version_info[0] >= 3:
+ ... class etree_mock(object):
+ ... def __getattr__(self, name): return getattr(_etree, name)
+ ... def tostring(self, *args, **kwargs):
+ ... s = _etree.tostring(*args, **kwargs)
+ ... if isinstance(s, bytes) and bytes([10]) in s: s = s.decode("utf-8") # CR
+ ... if s[-1] == '\n': s = s[:-1]
+ ... return s
+ ... else:
+ ... class etree_mock(object):
+ ... def __getattr__(self, name): return getattr(_etree, name)
+ ... def tostring(self, *args, **kwargs):
+ ... s = _etree.tostring(*args, **kwargs)
+ ... if s[-1] == '\n': s = s[:-1]
+ ... return s
+ >>> etree = etree_mock()
+
+
+Other Element APIs
+------------------
+
+While lxml.etree itself uses the ElementTree API, it is possible to replace
+the Element implementation by `custom element subclasses`_. This has been
+used to implement well-known XML APIs on top of lxml. For example, lxml ships
+with a data-binding implementation called `objectify`_, which is similar to
+the `Amara bindery`_ tool.
+
+lxml.etree comes with a number of `different lookup schemes`_ to customize the
+mapping between libxml2 nodes and the Element classes used by lxml.etree.
+
+.. _`custom element subclasses`: element_classes.html
+.. _`objectify`: objectify.html
+.. _`different lookup schemes`: element_classes.html#setting-up-a-class-lookup-scheme
+.. _`Amara bindery`: http://uche.ogbuji.net/tech/4suite/amara/
+
+
+Trees and Documents
+-------------------
+
+Compared to the original ElementTree API, lxml.etree has an extended tree
+model. It knows about parents and siblings of elements:
+
+.. sourcecode:: pycon
+
+ >>> root = etree.Element("root")
+ >>> a = etree.SubElement(root, "a")
+ >>> b = etree.SubElement(root, "b")
+ >>> c = etree.SubElement(root, "c")
+ >>> d = etree.SubElement(root, "d")
+ >>> e = etree.SubElement(d, "e")
+ >>> b.getparent() == root
+ True
+ >>> print(b.getnext().tag)
+ c
+ >>> print(c.getprevious().tag)
+ b
+
+Elements always live within a document context in lxml. This implies that
+there is also a notion of an absolute document root. You can retrieve an
+ElementTree for the root node of a document from any of its elements.
+
+.. sourcecode:: pycon
+
+ >>> tree = d.getroottree()
+ >>> print(tree.getroot().tag)
+ root
+
+Note that this is different from wrapping an Element in an ElementTree. You
+can use ElementTrees to create XML trees with an explicit root node:
+
+.. sourcecode:: pycon
+
+ >>> tree = etree.ElementTree(d)
+ >>> print(tree.getroot().tag)
+ d
+ >>> etree.tostring(tree)
+ b'<d><e/></d>'
+
+ElementTree objects are serialised as complete documents, including
+preceding or trailing processing instructions and comments.
+
+All operations that you run on such an ElementTree (like XPath, XSLT, etc.)
+will understand the explicitly chosen root as root node of a document. They
+will not see any elements outside the ElementTree. However, ElementTrees do
+not modify their Elements:
+
+.. sourcecode:: pycon
+
+ >>> element = tree.getroot()
+ >>> print(element.tag)
+ d
+ >>> print(element.getparent().tag)
+ root
+ >>> print(element.getroottree().getroot().tag)
+ root
+
+The rule is that all operations that are applied to Elements use either the
+Element itself as reference point, or the absolute root of the document that
+contains this Element (e.g. for absolute XPath expressions). All operations
+on an ElementTree use its explicit root node as reference.
+
+
+Iteration
+---------
+
+The ElementTree API makes Elements iterable to support iteration over their
+children. Using the tree defined above, we get:
+
+.. sourcecode:: pycon
+
+ >>> [ child.tag for child in root ]
+ ['a', 'b', 'c', 'd']
+
+To iterate in the opposite direction, use the builtin ``reversed()`` function.
+
+Tree traversal should use the ``element.iter()`` method:
+
+.. sourcecode:: pycon
+
+ >>> [ el.tag for el in root.iter() ]
+ ['root', 'a', 'b', 'c', 'd', 'e']
+
+lxml.etree also supports this, but additionally features an extended API for
+iteration over the children, following/preceding siblings, ancestors and
+descendants of an element, as defined by the respective XPath axis:
+
+.. sourcecode:: pycon
+
+ >>> [ child.tag for child in root.iterchildren() ]
+ ['a', 'b', 'c', 'd']
+ >>> [ child.tag for child in root.iterchildren(reversed=True) ]
+ ['d', 'c', 'b', 'a']
+ >>> [ sibling.tag for sibling in b.itersiblings() ]
+ ['c', 'd']
+ >>> [ sibling.tag for sibling in c.itersiblings(preceding=True) ]
+ ['b', 'a']
+ >>> [ ancestor.tag for ancestor in e.iterancestors() ]
+ ['d', 'root']
+ >>> [ el.tag for el in root.iterdescendants() ]
+ ['a', 'b', 'c', 'd', 'e']
+
+Note how ``element.iterdescendants()`` does not include the element
+itself, as opposed to ``element.iter()``. The latter effectively
+implements the 'descendant-or-self' axis in XPath.
+
+All of these iterators support one (or more, since lxml 3.0) additional
+arguments that filter the generated elements by tag name:
+
+.. sourcecode:: pycon
+
+ >>> [ child.tag for child in root.iterchildren('a') ]
+ ['a']
+ >>> [ child.tag for child in d.iterchildren('a') ]
+ []
+ >>> [ el.tag for el in root.iterdescendants('d') ]
+ ['d']
+ >>> [ el.tag for el in root.iter('d') ]
+ ['d']
+ >>> [ el.tag for el in root.iter('d', 'a') ]
+ ['a', 'd']
+
+Note that the order of the elements is determined by the iteration order,
+which is the document order in most cases (except for preceding siblings
+and ancestors, where it is the reversed document order). The order of
+the tag selection arguments is irrelevant, as you can see in the last
+example.
+
+The most common way to traverse an XML tree is depth-first, which
+traverses the tree in document order. This is implemented by the
+``.iter()`` method. While there is no dedicated method for
+breadth-first traversal, it is almost as simple if you use the
+``collections.deque`` type.
+
+.. sourcecode:: pycon
+
+ >>> root = etree.XML('<root><a><b/><c/></a><d><e/></d></root>')
+ >>> print(etree.tostring(root, pretty_print=True, encoding='unicode'))
+ <root>
+ <a>
+ <b/>
+ <c/>
+ </a>
+ <d>
+ <e/>
+ </d>
+ </root>
+
+ >>> from collections import deque
+ >>> queue = deque([root])
+ >>> while queue:
+ ... el = queue.popleft() # pop next element
+ ... queue.extend(el) # append its children
+ ... print(el.tag)
+ root
+ a
+ d
+ b
+ c
+ e
+
+See also the section on the utility functions ``iterparse()`` and
+``iterwalk()`` in the `parser documentation`_.
+
+.. _`parser documentation`: parsing.html#iterparse-and-iterwalk
+
+
+Error handling on exceptions
+----------------------------
+
+Libxml2 provides error messages for failures, be it during parsing, XPath
+evaluation or schema validation. The preferred way of accessing them is
+through the local ``error_log`` property of the respective evaluator or
+transformer object. See their documentation for details.
+
+However, lxml also keeps a global error log of all errors that occurred at the
+application level. Whenever an exception is raised, you can retrieve the
+errors that occurred and "might have" lead to the problem from the error log
+copy attached to the exception:
+
+.. sourcecode:: pycon
+
+ >>> etree.clear_error_log()
+ >>> broken_xml = '''
+ ... <root>
+ ... <a>
+ ... </root>
+ ... '''
+ >>> try:
+ ... etree.parse(StringIO(broken_xml))
+ ... except etree.XMLSyntaxError as e:
+ ... pass # just put the exception into e
+
+..
+ >>> etree.clear_error_log()
+ >>> try:
+ ... etree.parse(StringIO(broken_xml))
+ ... except etree.XMLSyntaxError:
+ ... import sys; e = sys.exc_info()[1]
+
+Once you have caught this exception, you can access its ``error_log`` property
+to retrieve the log entries or filter them by a specific type, error domain or
+error level:
+
+.. sourcecode:: pycon
+
+ >>> log = e.error_log.filter_from_level(etree.ErrorLevels.FATAL)
+ >>> print(log[0])
+ <string>:4:8:FATAL:PARSER:ERR_TAG_NAME_MISMATCH: Opening and ending tag mismatch: a line 3 and root
+
+This might look a little cryptic at first, but it is the information that
+libxml2 gives you. At least the message at the end should give you a hint
+what went wrong and you can see that the fatal errors (FATAL) happened during
+parsing (PARSER) in line 4, column 8 and line 5, column 1 of a string (<string>,
+or the filename if available). Here, PARSER is the so-called error domain,
+see ``lxml.etree.ErrorDomains`` for that. You can get it from a log entry
+like this:
+
+.. sourcecode:: pycon
+
+ >>> entry = log[0]
+ >>> print(entry.domain_name)
+ PARSER
+ >>> print(entry.type_name)
+ ERR_TAG_NAME_MISMATCH
+ >>> print(entry.filename)
+ <string>
+
+There is also a convenience attribute ``error_log.last_error`` that returns the
+last error or fatal error that occurred, so that it's easy to test if there was
+an error at all. Note, however, that there might have been more than one error,
+and the first error that occurred might be more relevant in some cases.
+
+
+Error logging
+-------------
+
+lxml.etree supports logging libxml2 messages to the Python stdlib logging
+module. This is done through the ``etree.PyErrorLog`` class. It disables the
+error reporting from exceptions and forwards log messages to a Python logger.
+To use it, see the descriptions of the function ``etree.useGlobalPythonLog``
+and the class ``etree.PyErrorLog`` for help. Note that this does not affect
+the local error logs of XSLT, XMLSchema, etc.
+
+
+Serialisation
+-------------
+
+C14N
+....
+
+lxml.etree has support for `C14N 1.0 <https://www.w3.org/TR/xml-exc-c14n/>`_
+and `C14N 2.0 <https://www.w3.org/TR/xml-c14n2/>`_. When serialising an XML
+tree using ``ElementTree.write()`` or ``tostring()``, you can pass the option
+``method="c14n"`` for 1.0 or ``method="c14n2"`` for 2.0.
+
+Additionally, there is a function ``etree.canonicalize()`` which can be used
+to convert serialised XML to its canonical form directly, without creating
+a tree in memory. By default, it returns the canonical output, but can be
+directed to write it to a file instead.
+
+.. sourcecode:: pycon
+
+ >>> c14n_xml = etree.canonicalize("<root><test z='1' y='2'/></root>")
+ >>> print(c14n_xml)
+ <root><test y="2" z="1"></test></root>
+
+Pretty printing
+...............
+
+Functions like ``ElementTree.write()`` and ``tostring()`` also support pretty
+printing XML through a keyword argument:
+
+.. sourcecode:: pycon
+
+ >>> root = etree.XML("<root><test/></root>")
+ >>> etree.tostring(root)
+ b'<root><test/></root>'
+
+ >>> print(etree.tostring(root, pretty_print=True))
+ <root>
+ <test/>
+ </root>
+
+Note the newline that is appended at the end when pretty printing the
+output. It was added in lxml 2.0.
+
+XML declaration
+...............
+
+By default, lxml (just as ElementTree) outputs the XML declaration only if it
+is required by the standard:
+
+.. sourcecode:: pycon
+
+ >>> unicode_root = etree.Element( u"t\u3120st" )
+ >>> unicode_root.text = u"t\u0A0Ast"
+ >>> etree.tostring(unicode_root, encoding="utf-8")
+ b'<t\xe3\x84\xa0st>t\xe0\xa8\x8ast</t\xe3\x84\xa0st>'
+
+ >>> print(etree.tostring(unicode_root, encoding="iso-8859-1"))
+ <?xml version='1.0' encoding='iso-8859-1'?>
+ <t&#12576;st>t&#2570;st</t&#12576;st>
+
+Also see the general remarks on `Unicode support`_.
+
+.. _`Unicode support`: parsing.html#python-unicode-strings
+
+You can enable or disable the declaration explicitly by passing another
+keyword argument for the serialisation:
+
+.. sourcecode:: pycon
+
+ >>> print(etree.tostring(root, xml_declaration=True))
+ <?xml version='1.0' encoding='ASCII'?>
+ <root><test/></root>
+
+ >>> unicode_root.clear()
+ >>> etree.tostring(unicode_root, encoding="UTF-16LE",
+ ... xml_declaration=False)
+ b'<\x00t\x00 1s\x00t\x00/\x00>\x00'
+
+Note that a standard compliant XML parser will not consider the last line
+well-formed XML if the encoding is not explicitly provided somehow, e.g. in an
+underlying transport protocol:
+
+.. sourcecode:: pycon
+
+ >>> notxml = etree.tostring(unicode_root, encoding="UTF-16LE",
+ ... xml_declaration=False)
+ >>> root = etree.XML(notxml) #doctest: +ELLIPSIS
+ Traceback (most recent call last):
+ ...
+ lxml.etree.XMLSyntaxError: ...
+
+Since version 2.3, the serialisation can override the internal subset
+of the document with a user provided DOCTYPE:
+
+.. sourcecode:: pycon
+
+ >>> xml = '<!DOCTYPE root>\n<root/>'
+ >>> tree = etree.parse(StringIO(xml))
+
+ >>> print(etree.tostring(tree))
+ <!DOCTYPE root>
+ <root/>
+
+ >>> print(etree.tostring(tree,
+ ... doctype='<!DOCTYPE root SYSTEM "/tmp/test.dtd">'))
+ <!DOCTYPE root SYSTEM "/tmp/test.dtd">
+ <root/>
+
+The content will be encoded, but otherwise copied verbatim into the
+output stream. It is therefore left to the user to take care of a
+correct doctype format, including the name of the root node.
+
+
+Incremental XML generation
+--------------------------
+
+Since version 3.1, lxml provides an ``xmlfile`` API for incrementally
+generating XML using the ``with`` statement. Its main purpose is to
+freely and safely mix surrounding elements with pre-built in-memory
+trees, e.g. to write out large documents that consist mostly of
+repetitive subtrees (like database dumps). But it can be useful in
+many cases where memory consumption matters or where XML is naturally
+generated in sequential steps. Since lxml 3.4.1, there is an equivalent
+context manager for HTML serialisation called ``htmlfile``.
+
+The API can serialise to real files (given as file path or file
+object), as well as file-like objects, e.g. ``io.BytesIO()``.
+Here is a simple example::
+
+ >>> f = BytesIO()
+ >>> with etree.xmlfile(f) as xf:
+ ... with xf.element('abc'):
+ ... xf.write('text')
+
+ >>> print(f.getvalue().decode('utf-8'))
+ <abc>text</abc>
+
+``xmlfile()`` accepts a file path as first argument, or a file(-like)
+object, as in the example above. In the first case, it takes care to
+open and close the file itself, whereas file(-like) objects are not
+closed by default. This is left to the code that opened them. Since
+lxml 3.4, however, you can pass the argument ``close=True`` to make
+lxml call the object's ``.close()`` method when exiting the xmlfile
+context manager.
+
+To insert pre-constructed Elements and subtrees, just pass them
+into ``write()``::
+
+ >>> f = BytesIO()
+ >>> with etree.xmlfile(f) as xf:
+ ... with xf.element('abc'):
+ ... with xf.element('in'):
+ ...
+ ... for value in '123':
+ ... # construct a really complex XML tree
+ ... el = etree.Element('xyz', attr=value)
+ ...
+ ... xf.write(el)
+ ...
+ ... # no longer needed, discard it right away!
+ ... el = None
+
+ >>> print(f.getvalue().decode('utf-8'))
+ <abc><in><xyz attr="1"/><xyz attr="2"/><xyz attr="3"/></in></abc>
+
+It is a common pattern to have one or more nested ``element()``
+blocks, and then build in-memory XML subtrees in a loop (using the
+ElementTree API, the builder API, XSLT, or whatever) and write them
+out into the XML file one after the other. That way, they can be
+removed from memory right after their construction, which can largely
+reduce the memory footprint of an application, while keeping the
+overall XML generation easy, safe and correct.
+
+Together with Python coroutines, this can be used to generate XML
+in an asynchronous, non-blocking fashion, e.g. for a stream protocol
+like the instant messaging protocol
+`XMPP <https://en.wikipedia.org/wiki/Extensible_Messaging_and_Presence_Protocol>`_::
+
+ def writer(out_stream):
+ with xmlfile(out_stream) as xf:
+ with xf.element('{http://etherx.jabber.org/streams}stream'):
+ while True:
+ el = (yield)
+ xf.write(el)
+ xf.flush()
+
+ w = writer(stream)
+ next(w) # start writing (run up to 'yield')
+
+Then, whenever XML elements are available for writing, call
+
+::
+
+ w.send(element)
+
+And when done::
+
+ w.close()
+
+Note the additional ``xf.flush()`` call in the example above, which is
+available since lxml 3.4. Normally, the output stream is buffered to
+avoid excessive I/O calls. Whenever the internal buffer fills up, its
+content is written out. In the case above, however, we want to make
+sure that each message that we write (i.e. each element subtree) is
+written out immediately, so we flush the content explicitly at the
+right point.
+
+Alternatively, if buffering is not desired at all, it can be disabled
+by passing the flag ``buffered=False`` into ``xmlfile()`` (also since
+lxml 3.4).
+
+Here is a similar example using an async coroutine in Py3.5 or later, which is
+supported since lxml 4.0. The output stream is expected to have methods
+``async def write(self, data)`` and ``async def close(self)`` in this case.
+
+::
+
+ async def writer(out_stream, xml_messages):
+ async with xmlfile(out_stream) as xf:
+ async with xf.element('{http://etherx.jabber.org/streams}stream'):
+ async for el in xml_messages:
+ await xf.write(el)
+ await xf.flush()
+
+
+ class DummyAsyncOut(object):
+ async def write(self, data):
+ print(data.decode('utf8'))
+
+ async def close(self):
+ pass
+
+ stream = DummyAsyncOut()
+ async_writer = writer(stream, async_message_stream)
+
+
+CDATA
+-----
+
+By default, lxml's parser will strip CDATA sections from the tree and
+replace them by their plain text content. As real applications for
+CDATA are rare, this is the best way to deal with this issue.
+
+However, in some cases, keeping CDATA sections or creating them in a
+document is required to adhere to existing XML language definitions.
+For these special cases, you can instruct the parser to leave CDATA
+sections in the document:
+
+.. sourcecode:: pycon
+
+ >>> parser = etree.XMLParser(strip_cdata=False)
+ >>> root = etree.XML('<root><![CDATA[test]]></root>', parser)
+ >>> root.text
+ 'test'
+
+ >>> etree.tostring(root)
+ b'<root><![CDATA[test]]></root>'
+
+Note how the ``.text`` property does not give any indication that the
+text content is wrapped by a CDATA section. If you want to make sure
+your data is wrapped by a CDATA block, you can use the ``CDATA()``
+text wrapper:
+
+.. sourcecode:: pycon
+
+ >>> root.text = 'test'
+
+ >>> root.text
+ 'test'
+ >>> etree.tostring(root)
+ b'<root>test</root>'
+
+ >>> root.text = etree.CDATA(root.text)
+
+ >>> root.text
+ 'test'
+ >>> etree.tostring(root)
+ b'<root><![CDATA[test]]></root>'
+
+
+XInclude and ElementInclude
+---------------------------
+
+You can let lxml process xinclude statements in a document by calling the
+xinclude() method on a tree:
+
+.. sourcecode:: pycon
+
+ >>> data = StringIO('''\
+ ... <doc xmlns:xi="http://www.w3.org/2001/XInclude">
+ ... <foo/>
+ ... <xi:include href="doc/test.xml" />
+ ... </doc>''')
+
+ >>> tree = etree.parse(data)
+ >>> tree.xinclude()
+ >>> print(etree.tostring(tree.getroot()))
+ <doc xmlns:xi="http://www.w3.org/2001/XInclude">
+ <foo/>
+ <a xml:base="doc/test.xml"/>
+ </doc>
+
+Note that the ElementTree compatible ElementInclude_ module is also supported
+as ``lxml.ElementInclude``. It has the additional advantage of supporting
+custom `URL resolvers`_ at the Python level. The normal XInclude mechanism
+cannot deploy these. If you need ElementTree compatibility or custom
+resolvers, you have to stick to the external Python module.
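+
+A minimal sketch of the Python-level variant (``doc_with_includes.xml``
+is a placeholder file name):
+
+.. sourcecode:: python
+
+    from lxml import etree, ElementInclude
+
+    tree = etree.parse('doc_with_includes.xml')
+    ElementInclude.include(tree.getroot())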
+
+.. _ElementInclude: http://effbot.org/zone/element-xinclude.htm
diff --git a/doc/api/Makefile b/doc/api/Makefile
new file mode 100644
index 0000000..dc8e304
--- /dev/null
+++ b/doc/api/Makefile
@@ -0,0 +1,23 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS ?=
+SPHINXBUILD ?= sphinx-build
+SOURCEDIR = .
+BUILDDIR = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+html:
+ @$(SPHINXBUILD) -b html "$(SOURCEDIR)" -d "$(BUILDDIR)/doctrees" ../html/apidoc $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/doc/api/conf.py b/doc/api/conf.py
new file mode 100644
index 0000000..75aa281
--- /dev/null
+++ b/doc/api/conf.py
@@ -0,0 +1,56 @@
+import os
+import sys
+sys.path.insert(0, os.path.abspath('../../src'))
+
+from lxml import __version__ as lxml_version
+
+# -- Project information -----------------------------------------------------
+
+project = 'lxml'
+copyright = '2020, lxml dev team'
+author = 'lxml dev team'
+version = lxml_version
+
+
+# -- General configuration ---------------------------------------------------
+
+extensions = [
+ 'sphinx.ext.autodoc',
+ 'sphinx.ext.viewcode',
+ 'sphinx_rtd_theme',
+]
+
+language = 'en'
+
+exclude_patterns = ['_build']
+
+
+# -- Options for HTML output -------------------------------------------------
+
+html_theme = 'sphinx_rtd_theme'
+
+html_logo = '../html/python-xml.png'
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+#html_static_path = ['_static']
+
+html_theme_options = {
+ 'collapse_navigation': False,
+ 'titles_only': True,
+}
+
+# -- Extension configuration -------------------------------------------------
+
+autodoc_default_options = {
+ 'ignore-module-all': True,
+ 'private-members': True,
+}
+
+autodoc_member_order = 'groupwise'
+
+# -- Options for todo extension ----------------------------------------------
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+#todo_include_todos = True
diff --git a/doc/api/index.rst b/doc/api/index.rst
new file mode 100644
index 0000000..ccf1bad
--- /dev/null
+++ b/doc/api/index.rst
@@ -0,0 +1,14 @@
+lxml API Reference
+==================
+
+.. toctree::
+ :maxdepth: 4
+
+ lxml
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
diff --git a/doc/build.txt b/doc/build.txt
new file mode 100644
index 0000000..8d375f7
--- /dev/null
+++ b/doc/build.txt
@@ -0,0 +1,342 @@
+How to build lxml from source
+=============================
+
+To build lxml from source, you need libxml2 and libxslt properly
+installed, *including the header files*. These are likely shipped in
+separate ``-dev`` or ``-devel`` packages like ``libxml2-dev``, which
+you must install before trying to build lxml.
+
+.. contents::
+..
+ 1 Cython
+ 2 Github, git and hg
+ 3 Building the sources
+ 4 Running the tests and reporting errors
+ 5 Building an egg
+ 6 Building lxml on MacOS-X
+ 7 Static linking on Windows
+ 8 Building Debian packages from SVN sources
+
+
+Cython
+------
+
+.. _pip: http://pypi.python.org/pypi/pip
+.. _Cython: http://cython.org
+.. _wheel: https://wheel.readthedocs.io/en/latest/
+
+The lxml.etree and lxml.objectify modules are written in Cython_.
+Since we distribute the Cython-generated .c files with lxml releases,
+however, you do not need Cython to build lxml from the normal release
+sources. We even encourage you to *not install Cython* for a normal
+release build, as the generated C code can vary quite heavily between
+Cython versions, which may or may not generate correct code for lxml.
+The pre-generated release sources were tested and therefore are known
+to work.
+
+So, if you want a reliable build of lxml, we suggest that you a) use a
+source release of lxml and b) disable or uninstall Cython for the
+build.
+
+*Only* if you are interested in building lxml from a checkout of the
+developer sources (e.g. to test a bug fix that has not been released
+yet) or if you want to be an lxml developer, then you do need a
+working Cython installation. You can use pip_ to install it::
+
+ pip install -r requirements.txt
+
+https://github.com/lxml/lxml/blob/master/requirements.txt
+
+lxml currently requires at least Cython 0.26.1; later release versions
+should work as well. For Python 3.7 support, at least Cython 0.29 is
+required.
+
+
+Github, git and hg
+-------------------
+
+The lxml package is developed in a repository on Github_ using
+Mercurial_ and the `hg-git`_ plugin. You can retrieve the current
+developer version using::
+
+ hg clone git+ssh://git@github.com/lxml/lxml.git lxml
+
+Or, using git::
+
+ git clone ssh://git@github.com/lxml/lxml.git lxml
+
+This will create a directory ``lxml`` and download the source into it,
+including the complete development history. Don't be afraid, the
+repository download is fairly quick. You can also browse the
+`lxml repository`_ through the web or download a ZIP archive with the
+`latest master branch <https://github.com/lxml/lxml/archive/master.zip>`_.
+
+.. _Github: https://github.com/lxml/
+.. _Mercurial: http://mercurial.selenic.com/
+.. _`hg-git`: http://hg-git.github.com/
+.. _`lxml repository`: https://github.com/lxml/lxml
+.. _`source tar-ball`: https://github.com/lxml/lxml/tarball/master
+
+
+Building the sources
+---------------------
+
+Clone the source repository as described above (or download
+the `source tar-ball`_ and unpack it) and then type::
+
+ python setup.py build
+
+or::
+
+ python setup.py bdist_egg # requires 'setuptools' or 'distribute'
+
+To (re-)build the C sources with Cython, you must additionally pass the
+option ``--with-cython``::
+
+ python setup.py build --with-cython
+
+If you want to test lxml from the source directory, it is better to build it
+in-place like this::
+
+ python setup.py build_ext -i --with-cython
+
+or, in Unix-like environments::
+
+ make inplace
+
+To speed up the build in test environments (e.g. on a continuous
+integration server), set the ``CFLAGS`` environment variable to
+disable C compiler optimisations (e.g. "-O0" for gcc, that's
+minus-oh-zero), for example::
+
+ CFLAGS="-O0" make inplace
+
+If you get errors about missing header files (e.g. ``Python.h`` or
+``libxml/xmlversion.h``) then you need to make sure the development
+packages of Python, libxml2 and libxslt are properly installed. On
+Linux distributions, they are usually called something like
+``libxml2-dev`` or ``libxslt-devel``. If these packages were
+installed in non-standard places, try passing the following option to
+setup.py to make sure the right config is found::
+
+ python setup.py build --with-xslt-config=/path/to/xslt-config
+
+There are also env vars to allow overriding the config tool::
+
+ env XML2_CONFIG=/path/to/xml2-config python setup.py build
+
+You may also use ``pkg-config`` as the config tool::
+
+ env XSLT_CONFIG="pkg-config libxslt" python setup.py build
+
+If this doesn't help, you may have to add the location of the header
+files to the include path like::
+
+ python setup.py build_ext -i -I /usr/include/libxml2
+
+where the header file is located at ``/usr/include/libxml2/libxml/xmlversion.h``.
+
+To use lxml.etree in-place, you can place lxml's ``src`` directory
+on your Python module search path (PYTHONPATH) and then import
+``lxml.etree`` to play with it::
+
+ # cd lxml
+ # PYTHONPATH=src python
+ Python 2.7.2
+ Type "help", "copyright", "credits" or "license" for more information.
+ >>> from lxml import etree
+ >>>
+
+To make sure everything gets recompiled cleanly after changes, you can
+run ``make clean`` or delete the file ``src/lxml/etree.c``.
+
+
+Running the tests and reporting errors
+--------------------------------------
+
+The source distribution (tgz) and the source repository contain a test
+suite for lxml. You can run it from the top-level directory::
+
+ python test.py
+
+Note that the test script only tests the in-place build (see distutils
+building above), as it searches the ``src`` directory. You can use the
+following one-step command to trigger an in-place build and test it::
+
+ make test
+
+This also runs the ElementTree and cElementTree compatibility tests. To call
+them separately, make sure you have lxml on your PYTHONPATH first, then run::
+
+ python selftest.py
+
+and::
+
+ python selftest2.py
+
+If the tests give failures, errors, or worse, segmentation faults, we'd really
+like to know. Please contact us on the `mailing list`_, and please specify
+the version of lxml, libxml2, libxslt and Python you were using, as well as
+your operating system type (Linux, Windows, MacOS-X, ...).
+
+.. _`mailing list`: http://lxml.de/mailinglist/
+
+
+Building an egg or wheel
+------------------------
+
+This is the procedure to make an lxml egg or wheel_ for your platform.
+It assumes that you have ``setuptools`` or ``distribute`` installed, as well
+as the ``wheel`` package.
+
+First, download the lxml-x.y.tar.gz release. This contains the pregenerated
+C files so that you can be sure you build exactly from the release sources.
+Unpack them and ``cd`` into the resulting directory. Then, to build a wheel,
+simply run the command
+
+::
+
+ python setup.py bdist_wheel
+
+or, to build a statically linked wheel with all of libxml2, libxslt and
+friends compiled in, run
+
+::
+
+ python setup.py bdist_wheel --static-deps
+
+The resulting .whl file will be written into the ``dist`` directory.
+
+To build an egg file, run
+
+::
+
+ python setup.py bdist_egg
+
+If you are on a Unix-like platform, you can first build the extension modules
+using
+
+::
+
+ python setup.py build
+
+and then ``cd`` into the directory ``build/lib.your.platform`` to call
+``strip`` on any ``.so`` file you find there. This reduces the size of
+the binary distribution considerably. Then, from the package root directory,
+call
+
+::
+
+ python setup.py bdist_egg
+
+This will quickly package the pre-built packages into an egg file and
+drop it into the ``dist`` directory.
+
+
+Building lxml on MacOS-X
+------------------------
+
+Apple regularly ships new system releases with horribly outdated
+system libraries. This is specifically the case for libxml2 and
+libxslt, where the system provided versions used to be too old
+to even build lxml for a long time.
+
+While the Unix environment in MacOS-X makes it relatively easy to
+install Unix/Linux style package management tools and new software, it
+is surprisingly hard to install newer versions of the libraries that
+MacOS-X ships in an older version, and to make sure that only the new
+versions get used. Alternative distributions
+(like macports) install their libraries in addition to the system
+libraries, but the compiler and the runtime loader on MacOS still see
+the system libraries before the new libraries. This can lead to
+undebuggable crashes where the newer library seems to be loaded but
+the older system library is used.
+
+Apple discourages static building against libraries, which would help
+working around this problem. Apple does not ship static library
+binaries with its system and several package management systems follow
+this decision. Therefore, building static binaries requires building
+the dependencies first. The ``setup.py`` script does this
+automatically when you call it like this::
+
+ python setup.py build --static-deps
+
+This will download and build the latest versions of libxml2 and
+libxslt from the official FTP download site. If you want to use
+specific versions, or want to prevent any online access, you can
+download both ``tar.gz`` release files yourself, place them into a
+subdirectory ``libs`` in the lxml distribution, and call ``setup.py``
+with the desired target versions like this::
+
+ python setup.py build --static-deps \
+ --libxml2-version=2.9.1 \
+ --libxslt-version=1.1.28
+
+ sudo python setup.py install
+
+Instead of ``build``, you can use any target, like ``bdist_egg``
+if you want to use setuptools to build an installable egg, or
+``bdist_wheel`` for a wheel package.
+
+Note that this also works with pip_. Since you can't pass
+command line options in this case, you have to use an environment
+variable instead::
+
+ STATIC_DEPS=true pip install lxml
+
+To install the package into the system Python package directory,
+run the installation with "sudo"::
+
+ STATIC_DEPS=true sudo pip install lxml
+
+The ``STATICBUILD`` environment variable is handled equivalently to
+the ``STATIC_DEPS`` variable, but is used by some other extension
+packages, too.
+
+If you decide to do a non-static build, you may also have to install
+the command line tools in addition to the XCode build environment.
+They are available as a restricted download from here:
+
+https://developer.apple.com/downloads/index.action?=command%20line%20tools#
+
+Without them, the compiler may not find the necessary header files of
+the XML libraries, according to the second comment in this ticket:
+
+https://bugs.launchpad.net/lxml/+bug/1244094
+
+
+Static linking on Windows
+-------------------------
+
+Most operating systems have proper package management that makes installing
+current versions of libxml2 and libxslt easy. The most famous exception is
+Microsoft Windows, which entirely lacks these capabilities. To work around
+the limits of this platform, lxml's installation can download pre-built
+packages of the dependencies and build statically against them. Assuming
+you have a proper C compiler setup to build Python extensions, this should
+work::
+
+ python setup.py bdist_wininst --static-deps
+
+It should create a Windows installer in the ``dist`` directory.
+
+
+Building Debian packages from SVN sources
+-----------------------------------------
+
+`Andreas Pakulat`_ proposed the following approach.
+
+.. _`Andreas Pakulat`: http://thread.gmane.org/gmane.comp.python.lxml.devel/1239/focus=1249
+
+* ``apt-get source lxml``
+* remove the unpacked directory
+* tar.gz the lxml SVN version and replace the orig.tar.gz that lies in the
+ directory
+* check md5sum of created tar.gz file and place new sum and size in dsc file
+* do ``dpkg-source -x lxml-[VERSION].dsc`` and cd into the newly created directory
+* run ``dch -i`` and add a comment like "use trunk version"; this will
+ increase the Debian version number so that apt/dpkg won't get confused
+* run ``dpkg-buildpackage -rfakeroot -us -uc`` to build the package
+
+In case ``dpkg-buildpackage`` tells you that some dependencies are missing, you
+can either install them manually or run ``apt-get build-dep lxml``.
+
+That will give you .deb packages in the parent directory which can be
+installed using ``dpkg -i``.
diff --git a/doc/capi.txt b/doc/capi.txt
new file mode 100644
index 0000000..0167a5a
--- /dev/null
+++ b/doc/capi.txt
@@ -0,0 +1,122 @@
+==============================
+The public C-API of lxml.etree
+==============================
+
+As of version 1.1, lxml.etree provides a public C-API. This allows external
+C extensions to efficiently access public functions and classes of lxml,
+without going through the Python API.
+
+The API is described in the file `etreepublic.pxd`_, which is directly
+c-importable by extension modules implemented in Pyrex_ or Cython_.
+
+.. _`etreepublic.pxd`: https://github.com/lxml/lxml/blob/master/src/lxml/includes/etreepublic.pxd
+.. _Cython: http://cython.org
+.. _Pyrex: http://www.cosc.canterbury.ac.nz/~greg/python/Pyrex/
+
+.. contents::
+..
+ 1 Passing generated trees through Python
+ 2 Writing external modules in Cython
+ 3 Writing external modules in C
+
+
+Passing generated trees through Python
+--------------------------------------
+
+This is the simplest way to integrate with lxml. It does not require
+any C-level integration but uses a Python function to wrap an externally
+generated libxml2 document in lxml.
+
+The external module that creates the libxml2 tree must pack the document
+pointer into a `PyCapsule <https://docs.python.org/3/c-api/capsule.html>`_
+object. This can then be passed into lxml with the function
+``lxml.etree.adopt_external_document()``. It also takes an optional lxml
+parser instance to associate with the document, in order to configure the
+Element class lookup, relative URL lookups, etc.
+
+See the `API reference <api/lxml.etree-module.html#adopt_external_document>`_
+for further details.
+
+The same functionality is available as part of the public C-API in the form
+of the C function ``adoptExternalDocument()``.
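+
+On the Python side, the hand-over then only needs a single call. The
+following is a minimal sketch; the ``fastbuilder`` module and its
+``build_document()`` function are hypothetical stand-ins for an external
+extension that returns such a capsule::
+
+ from lxml import etree
+ import fastbuilder  # hypothetical C extension wrapping an xmlDoc* in a PyCapsule
+
+ capsule = fastbuilder.build_document()
+ parser = etree.XMLParser()  # optional, configures Element class lookup etc.
+ tree = etree.adopt_external_document(capsule, parser)
+ root = tree.getroot()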
+
+
+Writing external modules in Cython
+----------------------------------
+
+This is the easiest way of extending lxml at the C level. A Cython_
+(or Pyrex_) module should start like this::
+
+ # My Cython extension
+
+ # directive pointing compiler to lxml header files;
+ # use ``aliases={"LXML_PACKAGE_DIR": lxml.__path__}``
+ # argument to cythonize in setup.py to dynamically
+ # determine dir at compile time
+ # distutils: include_dirs = LXML_PACKAGE_DIR
+
+ # import the public functions and classes of lxml.etree
+ cimport lxml.includes.etreepublic as cetree
+
+ # import the lxml.etree module in Python
+ cdef object etree
+ from lxml import etree
+
+ # initialize the access to the C-API of lxml.etree
+ cetree.import_lxml__etree()
+
+From this line on, you can access all public functions of lxml.etree
+from the ``cetree`` namespace like this::
+
+ # build a tag name from namespace and element name
+ py_tag = cetree.namespacedNameFromNsName("http://some/url", "myelement")
+
+Public lxml classes are easily subclassed. For example, to implement
+and set a new default element class, you can write Cython code like
+the following::
+
+ from lxml.includes.etreepublic cimport ElementBase
+ cdef class NewElementClass(ElementBase):
+ def set_value(self, myval):
+ self.set("my_attribute", myval)
+
+ etree.set_element_class_lookup(
+ etree.ElementDefaultClassLookup(element=NewElementClass))
+
+
+Writing external modules in C
+-----------------------------
+
+If you really feel like it, you can also interface with lxml.etree straight
+from C code. All you have to do is include the header file for the public
+API, import the ``lxml.etree`` module and then call the import function:
+
+.. sourcecode:: c
+
+ /* My C extension */
+
+ /* common includes */
+ #include "Python.h"
+ #include "stdio.h"
+ #include "string.h"
+ #include "stdarg.h"
+ #include "libxml/xmlversion.h"
+ #include "libxml/encoding.h"
+ #include "libxml/hash.h"
+ #include "libxml/tree.h"
+ #include "libxml/xmlIO.h"
+ #include "libxml/xmlsave.h"
+ #include "libxml/globals.h"
+ #include "libxml/xmlstring.h"
+
+ /* lxml.etree specific includes */
+ #include "lxml-version.h"
+ #include "etree_defs.h"
+ #include "etree.h"
+
+ /* setup code */
+ import_lxml__etree()
+
+Note that including ``etree.h`` does not automatically include the
+header files it requires. Note also that the above list of common
+includes may not be sufficient.
diff --git a/doc/compatibility.txt b/doc/compatibility.txt
new file mode 100644
index 0000000..e23d181
--- /dev/null
+++ b/doc/compatibility.txt
@@ -0,0 +1,196 @@
+=======================================
+ElementTree compatibility of lxml.etree
+=======================================
+
+A lot of care has been taken to ensure compatibility between etree and
+ElementTree. Nonetheless, some differences and incompatibilities exist:
+
+* Importing etree is obviously different; etree uses a lower-case
+ package name, while ElementTree uses a combination of upper-case and
+ lower case in imports:
+
+ .. sourcecode:: python
+
+ # etree
+ from lxml.etree import Element
+
+ # ElementTree
+ from elementtree.ElementTree import Element
+
+ # ElementTree in the Python 2.5 standard library
+ from xml.etree.ElementTree import Element
+
+ When switching code over from ElementTree to lxml.etree, if you're using
+ the package name prefix 'ElementTree', you can do the following:
+
+ .. sourcecode:: python
+
+ # instead of
+ from elementtree import ElementTree
+ # use
+ from lxml import etree as ElementTree
+
+* lxml.etree offers a lot more functionality, such as XPath, XSLT, Relax NG,
+ and XML Schema support, which (c)ElementTree does not offer.
+
+* etree has a different idea about Python unicode strings than ElementTree.
+ In most parts of the API, ElementTree uses plain strings and unicode strings
+ as what they are. This includes Element.text, Element.tail and many other
+ places. However, the ElementTree parsers assume by default that any string
+ (`str` or `unicode`) contains ASCII data. They raise an exception if
+ strings do not match the expected encoding.
+
+ etree has the same idea about plain strings (`str`) as ElementTree. For
+ unicode strings, however, etree assumes throughout the API that they contain
+ decoded text rather than byte data. This includes the parsers. It is
+ therefore perfectly correct to pass XML unicode data into the etree parsers
+ in the form of Python unicode strings. It is an error, on the
+ other hand, if unicode strings specify an encoding in their XML declaration,
+ as this conflicts with the characteristic encoding of Python unicode
+ strings.
+
+* ElementTree allows you to place an Element in two different trees at the
+ same time. Thus, this:
+
+ .. sourcecode:: python
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = Element('c')
+ c.append(b)
+
+ will result in the following tree a:
+
+ .. sourcecode:: xml
+
+ <a><b /></a>
+
+ and the following tree c:
+
+ .. sourcecode:: xml
+
+ <c><b /></c>
+
+ In lxml, this behavior is different, because lxml is built on top of a tree
+ that maintains parent relationships for elements (like W3C DOM). This means
+ an element can only exist in a single tree at the same time. Adding an
+ element in some tree to another tree will cause this element to be moved.
+
+ So, for tree a we will get:
+
+ .. sourcecode:: xml
+
+ <a></a>
+
+ and for tree c we will get:
+
+ .. sourcecode:: xml
+
+ <c><b/></c>
+
+ Unfortunately, this is a rather fundamental difference in behavior that is
+ hard to change. Some applications will not be affected by it, but when
+ porting code you must make sure that it doesn't affect yours.
+
+* etree allows navigation to the parent of a node by the ``getparent()``
+ method and to the siblings by calling ``getnext()`` and ``getprevious()``.
+ This is not possible in ElementTree as the underlying tree model does not
+ have this information.
+
+* When trying to set a subelement using __setitem__ that is in fact not an
+ Element but some other object, etree raises a TypeError, and ElementTree
+ raises an AssertionError. This also applies to some other places of the
+ API. In general, etree tries to avoid AssertionErrors in favour of being
+ more specific about the reason for the exception.
+
+* When parsing fails in ``iterparse()``, ElementTree up to version
+ 1.2.x raises a low-level ``ExpatError`` instead of the ``SyntaxError``
+ that the other parsers raise. Both lxml and ElementTree 1.3 raise a
+ ``ParseError`` for parser errors.
+
+* The ``iterparse()`` function in lxml is implemented based on the libxml2
+ parser and tree generator. This means that modifications of the document
+ root or the ancestors of the current element during parsing can irritate the
+ parser and even segfault. While this is not a problem in the Python object
+ structure used by ElementTree, the C tree underlying lxml suffers from it.
+ The golden rule for ``iterparse()`` on lxml therefore is: do not touch
+ anything that will have to be touched again by the parser later on. See the
+ lxml parser documentation on this.
+
+* ElementTree ignores comments and processing instructions when parsing XML,
+ while etree will read them in and treat them as Comment or
+ ProcessingInstruction elements respectively. This is especially visible
+ where comments are found inside text content, which is then split by the
+ Comment element.
+
+ You can disable this behaviour by passing the boolean ``remove_comments``
+ and/or ``remove_pis`` keyword arguments to the parser you use. For
+ convenience and to support portable code, you can also use the
+ ``etree.ETCompatXMLParser`` instead of the default ``etree.XMLParser``. It
+ tries to provide a default setup that is as close to the ElementTree parser
+ as possible.
+
+* The ``TreeBuilder`` class of ``lxml.etree`` uses a different
+ signature for the ``start()`` method. It accepts an additional
+ argument ``nsmap`` to propagate the namespace declarations of an
+ element in addition to its own namespace. To ensure compatibility
+ with ElementTree (which does not support this argument), lxml checks
+ if the method accepts 3 arguments before calling it, and otherwise
+ drops the namespace mapping. This should work with most existing
+ ElementTree code, although there may still be conflicting cases.
+
+* ElementTree 1.2 has a bug when serializing an empty Comment (no text
+ argument given) to XML; etree serializes this successfully.
+
+* ElementTree adds whitespace around comments on serialization, lxml does
+ not. This means that a comment text "text" that ElementTree serializes as
+ "<!-- text -->" will become "<!--text-->" in lxml.
+
+* When the string '*' is used as tag filter in the ``Element.getiterator()``
+ method, ElementTree returns all elements in the tree, including comments and
+ processing instructions. lxml.etree only returns real Elements, i.e. tree
+ nodes that have a string tag name. Without a filter, both libraries iterate
+ over all nodes.
+
+ Note that currently only lxml.etree supports passing the ``Element`` factory
+ function as filter to select only Elements. Both libraries support passing
+ the ``Comment`` and ``ProcessingInstruction`` factories to select the
+ respective tree nodes.
+
+* ElementTree merges the target of a processing instruction into ``PI.text``,
+ while lxml.etree puts it into the ``.target`` property and leaves it out of
+ the ``.text`` property. The ``pi.text`` in ElementTree therefore
+ corresponds to ``pi.target + " " + pi.text`` in lxml.etree.
+
+* Because etree is built on top of libxml2, which is namespace prefix aware,
+ etree preserves namespace declarations and prefixes while ElementTree tends
+ to come up with its own prefixes (ns0, ns1, etc). When no namespace prefix
+ is given, however, etree creates ElementTree style prefixes as well.
+
+* etree has a 'prefix' attribute (read-only) on elements giving the Element's
+ prefix, if this is known, and None otherwise (in case of no namespace at
+ all, or default namespace).
+
+* etree further allows passing an 'nsmap' dictionary to the Element and
+ SubElement element factories to explicitly map namespace prefixes to
+ namespace URIs. These will be translated into namespace declarations on
+ that element. This means that in the probably rare case that you need to
+ construct an attribute called 'nsmap', you need to be aware that unlike in
+ ElementTree, you cannot pass it as a keyword argument to the Element and
+ SubElement factories directly.
+
+* ElementTree allows QName objects as attribute values and resolves their
+ prefix on serialisation (e.g. an attribute value ``QName("{myns}myname")``
+ becomes "p:myname" if "p" is the namespace prefix of "myns"). lxml.etree
+ also allows you to set attribute values from QName instances (and also .text
+ values), but it resolves their prefix immediately and stores the plain text
+ value. So, if prefixes are modified later on, e.g. by moving a subtree to a
+ different tree (which reassigns the prefix mappings), the text values will
+ not be updated and you might end up with an undefined prefix.
+
+* etree elements can be copied using ``copy.deepcopy()`` and ``copy.copy()``,
+ just like ElementTree's. However, ``copy.copy()`` does *not* create a
+ shallow copy where elements are shared between trees, as this makes no sense
+ in the context of libxml2 trees. Note that lxml can deep-copy trees
+ considerably faster than ElementTree, so a deep copy might still be fast
+ enough to replace a shallow copy in your case (see the short example below).
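+
+The following is a minimal sketch of the copying behaviour described in the
+last point:
+
+.. sourcecode:: pycon
+
+ >>> import copy
+ >>> from lxml import etree
+
+ >>> root = etree.XML('<root><child>text</child></root>')
+ >>> clone = copy.copy(root)   # behaves like a deep copy in lxml
+ >>> clone[0] is root[0]
+ False
+ >>> clone[0].text
+ 'text'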
diff --git a/doc/cssselect.txt b/doc/cssselect.txt
new file mode 100644
index 0000000..64b3d7b
--- /dev/null
+++ b/doc/cssselect.txt
@@ -0,0 +1,126 @@
+==============
+lxml.cssselect
+==============
+
+lxml supports a number of interesting languages for tree traversal and element
+selection. The most important is obviously XPath_, but there is also
+ObjectPath_ in the `lxml.objectify`_ module. The newest child of this family
+is `CSS selection`_, which is made available in form of the ``lxml.cssselect``
+module.
+
+Although it started its life in lxml, cssselect_ is now an independent project.
+It translates CSS selectors to XPath 1.0 expressions that can be used with
+lxml's XPath engine. ``lxml.cssselect`` adds a few convenience shortcuts into
+that package.
+
+To install ``cssselect``, run
+
+::
+
+ pip install cssselect
+
+lxml will then import and use it automatically.
+
+
+.. _XPath: xpathxslt.html#xpath
+.. _ObjectPath: objectify.html#objectpath
+.. _`lxml.objectify`: objectify.html
+.. _`CSS selection`: http://www.w3.org/TR/CSS21/selector.html
+.. _cssselect: http://packages.python.org/cssselect/
+
+.. contents::
+..
+ 1 The CSSSelector class
+ 2 The cssselect method
+ 3 Supported Selectors
+ 4 Namespaces
+
+
+The CSSSelector class
+=====================
+
+The most important class in the ``lxml.cssselect`` module is ``CSSSelector``. It
+provides the same interface as the XPath_ class, but accepts a CSS selector
+expression as input:
+
+.. sourcecode:: pycon
+
+ >>> from lxml.cssselect import CSSSelector
+ >>> sel = CSSSelector('div.content')
+ >>> sel #doctest: +ELLIPSIS
+ <CSSSelector ... for 'div.content'>
+ >>> sel.css
+ 'div.content'
+
+The selector actually compiles to XPath, and you can see the
+expression by inspecting the object:
+
+.. sourcecode:: pycon
+
+ >>> sel.path
+ "descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' content ')]"
+
+To use the selector, simply call it with a document or element
+object:
+
+.. sourcecode:: pycon
+
+ >>> from lxml.etree import fromstring
+ >>> h = fromstring('''<div id="outer">
+ ... <div id="inner" class="content body">
+ ... text
+ ... </div></div>''')
+ >>> [e.get('id') for e in sel(h)]
+ ['inner']
+
+Using ``CSSSelector`` is equivalent to translating with ``cssselect``
+and using the ``XPath`` class:
+
+.. sourcecode:: pycon
+
+ >>> from cssselect import GenericTranslator
+ >>> from lxml.etree import XPath
+ >>> sel = XPath(GenericTranslator().css_to_xpath('div.content'))
+
+``CSSSelector`` takes a ``translator`` parameter to let you choose which
+translator to use. It can be ``'xml'`` (the default), ``'xhtml'``, ``'html'``
+or a `Translator object`_.
+
+.. _Translator object: http://packages.python.org/cssselect/#cssselect.GenericTranslator
+
+
+The cssselect method
+====================
+
+lxml ``Element`` objects have a ``cssselect`` convenience method.
+
+.. sourcecode:: pycon
+
+ >>> h.cssselect('div.content') == sel(h)
+ True
+
+Note however that pre-compiling the expression with the ``CSSSelector`` or
+``XPath`` class can provide a substantial speedup.
+
+The method also accepts a ``translator`` parameter. On ``HtmlElement``
+objects, the default is changed to ``'html'``.
+
+
+Supported Selectors
+===================
+
+Most `Level 3`_ selectors are supported. The details are in the
+`cssselect documentation`_.
+
+.. _Level 3: http://www.w3.org/TR/2011/REC-css3-selectors-20110929/
+.. _cssselect documentation: http://packages.python.org/cssselect/#supported-selectors
+
+
+Namespaces
+==========
+
+In CSS you can use ``namespace-prefix|element``, similar to
+``namespace-prefix:element`` in an XPath expression. In fact, it maps
+one-to-one, and the same rules are used to map namespace prefixes to
+namespace URIs: the ``CSSSelector`` class accepts a dictionary as its
+``namespaces`` argument.
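+
+For example, using a hypothetical namespace URI, a namespaced selector can
+be set up and used like this:
+
+.. sourcecode:: pycon
+
+ >>> ns_sel = CSSSelector('ns|chapter', namespaces={'ns': 'http://example.org/book'})
+ >>> book = fromstring('<book xmlns="http://example.org/book"><chapter/></book>')
+ >>> [el.tag for el in ns_sel(book)]
+ ['{http://example.org/book}chapter']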
diff --git a/doc/docstructure.py b/doc/docstructure.py
new file mode 100644
index 0000000..9a8e27b
--- /dev/null
+++ b/doc/docstructure.py
@@ -0,0 +1,32 @@
+
+import os
+
+if os.path.exists(os.path.join(os.path.dirname(__file__), '..', 'funding.txt')):
+ funding = ('../funding.txt',)
+else:
+ funding = ()
+
+SITE_STRUCTURE = [
+ ('lxml', ('main.txt', 'intro.txt', '../INSTALL.txt', # 'lxml2.txt',
+ 'performance.txt', 'compatibility.txt', 'FAQ.txt') + funding),
+ ('Developing with lxml', ('tutorial.txt', '@API reference',
+ 'api.txt', 'parsing.txt',
+ 'validation.txt', 'xpathxslt.txt',
+ 'objectify.txt', 'lxmlhtml.txt',
+ 'cssselect.txt', 'elementsoup.txt',
+ 'html5parser.txt')),
+ ('Extending lxml', ('resolvers.txt', 'extensions.txt',
+ 'element_classes.txt', 'sax.txt', 'capi.txt')),
+ ('Developing lxml', ('build.txt', 'lxml-source-howto.txt',
+ '@Release Changelog', '../CREDITS.txt')),
+ ]
+
+HREF_MAP = {
+ "API reference" : "apidoc/lxml.html"
+}
+
+BASENAME_MAP = {
+ 'main' : 'index',
+ 'INSTALL' : 'installation',
+ 'CREDITS' : 'credits',
+}
diff --git a/doc/element_classes.txt b/doc/element_classes.txt
new file mode 100644
index 0000000..4b1e72e
--- /dev/null
+++ b/doc/element_classes.txt
@@ -0,0 +1,615 @@
+====================================
+Using custom Element classes in lxml
+====================================
+
+lxml has very sophisticated support for custom Element classes. You
+can provide your own classes for Elements and have lxml use them by
+default for all elements generated by a specific parser, only for a
+specific tag name in a specific namespace or even for an exact element
+at a specific position in the tree.
+
+Custom Elements must inherit from the ``lxml.etree.ElementBase`` class, which
+provides the Element interface for subclasses:
+
+.. sourcecode:: pycon
+
+ >>> from lxml import etree
+
+ >>> class honk(etree.ElementBase):
+ ... @property
+ ... def honking(self):
+ ... return self.get('honking') == 'true'
+
+This defines a new Element class ``honk`` with a property ``honking``.
+
+The following document describes how you can make lxml.etree use these
+custom Element classes.
+
+.. contents::
+..
+ 1 Background on Element proxies
+ 2 Element initialization
+ 3 Setting up a class lookup scheme
+ 3.1 Default class lookup
+ 3.2 Namespace class lookup
+ 3.3 Attribute based lookup
+ 3.4 Custom element class lookup
+ 3.5 Tree based element class lookup in Python
+ 4 Generating XML with custom classes
+ 5 Implementing namespaces
+
+
+Background on Element proxies
+=============================
+
+Being based on libxml2, lxml.etree holds the entire XML tree in a C
+structure. To communicate with Python code, it creates Python proxy
+objects for the XML elements on demand.
+
+ .. image:: proxies.png
+
+The mapping between C elements and Python Element classes is
+completely configurable. When you ask lxml.etree for an Element by
+using its API, it will instantiate your classes for you. All you have
+to do is tell lxml which class to use for which kind of Element. This
+is done through a class lookup scheme, as described in the sections
+below.
+
+
+Element initialization
+======================
+
+There is one thing to know up front. Element classes *must not* have
+an ``__init__`` or ``__new__`` method. There should not be any
+internal state either, except for the data stored in the underlying
+XML tree. Element instances are created and garbage collected as
+needed, so there is normally no way to predict when and how often a
+proxy is created for them. Even worse, when the ``__init__`` method
+is called, the object is not even initialized yet to represent the XML
+tag, so there is not much use in providing an ``__init__`` method in
+subclasses.
+
+Most use cases will not require any class initialisation or proxy
+state, so you can content yourself with skipping to the next section
+for now. However, if you really need to set up your element class on
+instantiation, or need a way to persistently store state in the proxy
+instances instead of the XML tree, here is a way to do so.
+
+There is one important guarantee regarding Element proxies. Once a
+proxy has been instantiated, it will keep alive as long as there is a
+Python reference to it, and any access to the XML element in the tree
+will return this very instance. Therefore, if you need to store local
+state in a custom Element class (which is generally discouraged), you
+can do so by keeping the Elements in a tree alive. If the tree
+doesn't change, you can simply do this:
+
+.. sourcecode:: python
+
+ proxy_cache = list(root.iter())
+
+or
+
+.. sourcecode:: python
+
+ proxy_cache = set(root.iter())
+
+or use any other suitable container. Note that you have to keep this
+cache manually up to date if the tree changes, which can get tricky in
+some cases.
+
+For proxy initialisation, ElementBase classes have an ``_init()``
+method that can be overridden, as opposed to the normal ``__init__()``
+method. It can be used to modify the XML tree, e.g. to construct
+special children or verify and update attributes. A short sketch
+follows the list of semantics below.
+
+The semantics of ``_init()`` are as follows:
+
+* It is called once, at Element proxy instantiation time. That is,
+ when a Python representation of the element is created by lxml. At
+ that time, the element object is completely initialized to represent
+ a specific XML element within the tree.
+
+* The method has complete access to the XML tree. Modifications can be done
+ in exactly the same way as anywhere else in the program.
+
+* Python representations of elements may be created multiple times during the
+ lifetime of an XML element in the underlying C tree. The ``_init()`` code
+ provided by subclasses must take special care by itself that multiple
+ executions either are harmless or that they are prevented by some kind of
+ flag in the XML tree. The latter can be achieved by modifying an attribute
+ value or by removing or adding a specific child node and then verifying this
+ before running through the init process.
+
+* Any exceptions raised in ``_init()`` will be propagated through the API
+ call that led to the creation of the Element. So be careful with the code
+ you write here as its exceptions may turn up in various unexpected places.
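+
+Here is a minimal sketch (not taken from lxml itself) of an ``_init()``
+implementation that guards against repeated execution by recording its work
+in the tree:
+
+.. sourcecode:: pycon
+
+ >>> class InitialisedElement(etree.ElementBase):
+ ...     def _init(self):
+ ...         # _init() may run again whenever a new proxy is created for
+ ...         # the same XML element, so record the work done in the tree.
+ ...         if self.get('initialised') is None:
+ ...             self.set('initialised', 'true')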
+
+
+Setting up a class lookup scheme
+================================
+
+The first thing to do when deploying custom element classes is to register a
+class lookup scheme on a parser. lxml.etree provides quite a number of
+different schemes that also support class lookup based on namespaces or
+attribute values. Most lookups support fallback chaining, which allows the
+next lookup mechanism to take over when the previous one fails to find a
+class.
+
+For example, setting the ``honk`` Element as a default element class
+for a parser works as follows:
+
+.. sourcecode:: pycon
+
+ >>> parser_lookup = etree.ElementDefaultClassLookup(element=honk)
+ >>> parser = etree.XMLParser()
+ >>> parser.set_element_class_lookup(parser_lookup)
+
+There is one drawback of the parser based scheme: the ``Element()`` factory
+does not know about your specialised parser and creates a new document that
+deploys the default parser:
+
+.. sourcecode:: pycon
+
+ >>> el = etree.Element("root")
+ >>> print(isinstance(el, honk))
+ False
+
+You should therefore avoid using this factory function in code that
+uses custom classes. The ``makeelement()`` method of parsers provides
+a simple replacement:
+
+.. sourcecode:: pycon
+
+ >>> el = parser.makeelement("root")
+ >>> print(isinstance(el, honk))
+ True
+
+If you use a parser at the module level, you can easily redirect a module
+level ``Element()`` factory to the parser method by adding code like this:
+
+.. sourcecode:: pycon
+
+ >>> module_level_parser = etree.XMLParser()
+ >>> Element = module_level_parser.makeelement
+
+While the ``XML()`` and ``HTML()`` factories also depend on the default
+parser, you can pass them a different parser as second argument:
+
+.. sourcecode:: pycon
+
+ >>> element = etree.XML("<test/>")
+ >>> print(isinstance(element, honk))
+ False
+
+ >>> element = etree.XML("<test/>", parser)
+ >>> print(isinstance(element, honk))
+ True
+
+Whenever you create a document with a parser, it will inherit the lookup
+scheme and all subsequent element instantiations for this document will use
+it:
+
+.. sourcecode:: pycon
+
+ >>> element = etree.fromstring("<test/>", parser)
+ >>> print(isinstance(element, honk))
+ True
+ >>> el = etree.SubElement(element, "subel")
+ >>> print(isinstance(el, honk))
+ True
+
+For testing code in the Python interpreter and for small projects, you
+may also consider setting a lookup scheme on the default parser. To
+avoid interfering with other modules, however, it is usually a better
+idea to use a dedicated parser for each module (or a parser pool when
+using threads) and then register the required lookup scheme only for
+this parser.
+
+
+Default class lookup
+--------------------
+
+This is the simplest lookup mechanism. It always returns the default
+element class. Consequently, no further fallbacks are supported, but this
+scheme is a nice fallback for other custom lookup mechanisms. Specifically,
+it also handles comments and processing instructions, which are easy to
+forget about when mapping proxies to classes.
+
+Usage:
+
+.. sourcecode:: pycon
+
+ >>> lookup = etree.ElementDefaultClassLookup()
+ >>> parser = etree.XMLParser()
+ >>> parser.set_element_class_lookup(lookup)
+
+Note that the default for new parsers is to use the global fallback, which is
+also the default lookup (if not configured otherwise).
+
+To change the default element implementation, you can pass your new class to
+the constructor. While it accepts classes for ``element``, ``comment`` and
+``pi`` nodes, most use cases will only override the element class:
+
+.. sourcecode:: pycon
+
+ >>> el = parser.makeelement("myelement")
+ >>> print(isinstance(el, honk))
+ False
+
+ >>> lookup = etree.ElementDefaultClassLookup(element=honk)
+ >>> parser.set_element_class_lookup(lookup)
+
+ >>> el = parser.makeelement("myelement")
+ >>> print(isinstance(el, honk))
+ True
+ >>> el.honking
+ False
+ >>> el = parser.makeelement("myelement", honking='true')
+ >>> etree.tostring(el)
+ b'<myelement honking="true"/>'
+ >>> el.honking
+ True
+
+ >>> root = etree.fromstring(
+ ... '<root honking="true"><!--comment--></root>', parser)
+ >>> root.honking
+ True
+ >>> print(root[0].text)
+ comment
+
+
+Namespace class lookup
+----------------------
+
+This is an advanced lookup mechanism that supports namespace/tag-name specific
+element classes. You can select it by calling:
+
+.. sourcecode:: pycon
+
+ >>> lookup = etree.ElementNamespaceClassLookup()
+ >>> parser = etree.XMLParser()
+ >>> parser.set_element_class_lookup(lookup)
+
+See the separate section on `implementing namespaces`_ below to learn how to
+make use of it.
+
+.. _`implementing namespaces`: #implementing-namespaces
+
+This scheme supports a fallback mechanism that is used in the case where the
+namespace is not found or no class was registered for the element name.
+Normally, the default class lookup is used here. To change it, pass the
+desired fallback lookup scheme to the constructor:
+
+.. sourcecode:: pycon
+
+ >>> fallback = etree.ElementDefaultClassLookup(element=honk)
+ >>> lookup = etree.ElementNamespaceClassLookup(fallback)
+ >>> parser.set_element_class_lookup(lookup)
+
+ >>> root = etree.fromstring(
+ ... '<root honking="true"><!--comment--></root>', parser)
+ >>> root.honking
+ True
+ >>> print(root[0].text)
+ comment
+
+
+Attribute based lookup
+----------------------
+
+This scheme uses a mapping from attribute values to classes. An attribute
+name is set at initialisation time and is then used to find the corresponding
+value in a dictionary. It is set up as follows:
+
+.. sourcecode:: pycon
+
+ >>> id_class_mapping = {'1234' : honk} # maps attribute values to classes
+
+ >>> lookup = etree.AttributeBasedElementClassLookup(
+ ... 'id', id_class_mapping)
+ >>> parser = etree.XMLParser()
+ >>> parser.set_element_class_lookup(lookup)
+
+And here is how to use it:
+
+.. sourcecode:: pycon
+
+ >>> xml = '<a id="123"><b id="1234"/><b id="1234" honking="true"/></a>'
+ >>> a = etree.fromstring(xml, parser)
+
+ >>> a.honking # id does not match !
+ Traceback (most recent call last):
+ AttributeError: 'lxml.etree._Element' object has no attribute 'honking'
+
+ >>> a[0].honking
+ False
+ >>> a[1].honking
+ True
+
+This lookup scheme uses its fallback if the attribute is not found or
+its value is not in the mapping. Normally, the default class lookup
+is used here. If you want to use the namespace lookup, for example,
+you can use this code:
+
+.. sourcecode:: pycon
+
+ >>> fallback = etree.ElementNamespaceClassLookup()
+ >>> lookup = etree.AttributeBasedElementClassLookup(
+ ... 'id', id_class_mapping, fallback)
+ >>> parser = etree.XMLParser()
+ >>> parser.set_element_class_lookup(lookup)
+
+
+Custom element class lookup
+---------------------------
+
+This is the most customisable way of finding element classes on a per-element
+basis. It allows you to implement a custom lookup scheme in a subclass:
+
+.. sourcecode:: pycon
+
+ >>> class MyLookup(etree.CustomElementClassLookup):
+ ... def lookup(self, node_type, document, namespace, name):
+ ... if node_type == 'element':
+ ... return honk # be a bit more selective here ...
+ ... else:
+ ... return None # pass on to (default) fallback
+
+ >>> parser = etree.XMLParser()
+ >>> parser.set_element_class_lookup(MyLookup())
+
+ >>> root = etree.fromstring(
+ ... '<root honking="true"><!--comment--></root>', parser)
+ >>> root.honking
+ True
+ >>> print(root[0].text)
+ comment
+
+The ``.lookup()`` method must return either None (which triggers the
+fallback mechanism) or a subclass of ``lxml.etree.ElementBase``. It
+can take any decision it wants based on the node type (one of
+"element", "comment", "PI", "entity"), the XML document of the
+element, or its namespace or tag name.
+
+
+Tree based element class lookup in Python
+-----------------------------------------
+
+Making more elaborate decisions than the custom scheme allows is
+difficult to achieve in pure Python, as it results in a
+chicken-and-egg problem: it would require access to the tree before
+the elements in the tree have been instantiated as Python Element
+proxies.
+
+Luckily, there is a way to do this. The ``PythonElementClassLookup``
+works similarly to the custom lookup scheme:
+
+.. sourcecode:: pycon
+
+ >>> class MyLookup(etree.PythonElementClassLookup):
+ ... def lookup(self, document, element):
+ ... return MyElementClass # defined elsewhere
+
+ >>> parser = etree.XMLParser()
+ >>> parser.set_element_class_lookup(MyLookup())
+
+As before, the first argument to the ``lookup()`` method is the opaque
+document instance that contains the Element. The second argument is a
+lightweight Element proxy implementation that is only valid during the lookup.
+Do not try to keep a reference to it. Once the lookup is finished, the proxy
+will become invalid. You will get an ``AssertionError`` if you access any of
+the properties or methods outside the scope of the lookup call where they were
+instantiated.
+
+During the lookup, the element object behaves mostly like a normal Element
+instance. It provides the properties ``tag``, ``text``, ``tail`` etc. and
+supports indexing, slicing and the ``getchildren()``, ``getparent()``
+etc. methods. It does *not* support iteration, nor does it support any kind
+of modification. All of its properties are read-only and it cannot be removed
+or inserted into other trees. You can use it as a starting point to freely
+traverse the tree and collect any kind of information that its elements
+provide. Once you have taken the decision which class to use for this
+element, you can simply return it and have lxml take care of cleaning up the
+instantiated proxy classes.
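+
+As an illustration, here is a sketch of a lookup class that inspects the
+read-only proxy before deciding on a class, reusing the ``honk`` class from
+above:
+
+.. sourcecode:: pycon
+
+ >>> class AttributeValueLookup(etree.PythonElementClassLookup):
+ ...     def lookup(self, document, element):
+ ...         # 'element' is the temporary read-only proxy described above
+ ...         if element.get('honking') is not None:
+ ...             return honk
+ ...         return None  # use the (default) fallback lookup
+
+ >>> lookup_parser = etree.XMLParser()
+ >>> lookup_parser.set_element_class_lookup(AttributeValueLookup())
+
+ >>> root = etree.fromstring('<root honking="true"><child/></root>', lookup_parser)
+ >>> root.honking
+ True
+ >>> print(isinstance(root[0], honk))
+ False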
+
+Sidenote: this lookup scheme originally lived in a separate module called
+``lxml.pyclasslookup``.
+
+
+Generating XML with custom classes
+==================================
+
+Up to lxml 2.1, you could not instantiate proxy classes yourself.
+Only lxml.etree could do that when creating an object representation
+of an existing XML element. Since lxml 2.2, however, instantiating
+this class will simply create a new Element:
+
+.. sourcecode:: pycon
+
+ >>> el = honk(honking='true')
+ >>> el.tag
+ 'honk'
+ >>> el.honking
+ True
+
+Note, however, that the proxy you create here will be garbage
+collected just like any other proxy. You can therefore not count on
+lxml.etree using the same class that you instantiated when you access
+this Element a second time after letting its reference go. You should
+therefore always use a corresponding class lookup scheme that returns
+your Element proxy classes for the elements that they create. The
+``ElementNamespaceClassLookup`` is generally a good match.
+
+You can use custom Element classes to quickly create XML fragments:
+
+.. sourcecode:: pycon
+
+ >>> class hale(etree.ElementBase): pass
+ >>> class bopp(etree.ElementBase): pass
+
+ >>> el = hale( "some ", honk(honking = 'true'), bopp, " text" )
+
+ >>> print(etree.tostring(el, encoding='unicode'))
+ <hale>some <honk honking="true"/><bopp/> text</hale>
+
+
+Implementing namespaces
+=======================
+
+lxml allows you to implement namespaces, in a rather literal sense. After
+setting up the namespace class lookup mechanism as described above, you can
+build a new element namespace (or retrieve an existing one) by calling the
+``get_namespace(uri)`` method of the lookup:
+
+.. sourcecode:: pycon
+
+ >>> lookup = etree.ElementNamespaceClassLookup()
+ >>> parser = etree.XMLParser()
+ >>> parser.set_element_class_lookup(lookup)
+
+ >>> namespace = lookup.get_namespace('http://hui.de/honk')
+
+and then register the new element type with that namespace, say, under the tag
+name ``honk``:
+
+.. sourcecode:: pycon
+
+ >>> namespace['honk'] = honk
+
+If you have many Element classes declared in one module, and they are
+all named like the elements they create, you can simply use
+``namespace.update(globals())`` at the end of your module to declare them
+automatically. The implementation is smart enough to ignore
+everything that is not an Element class.
+
+After this, you create and use your XML elements through the normal API of
+lxml:
+
+.. sourcecode:: pycon
+
+ >>> xml = '<honk xmlns="http://hui.de/honk" honking="true"/>'
+ >>> honk_element = etree.XML(xml, parser)
+ >>> print(honk_element.honking)
+ True
+
+The same works when creating elements by hand:
+
+.. sourcecode:: pycon
+
+ >>> honk_element = parser.makeelement('{http://hui.de/honk}honk',
+ ... honking='true')
+ >>> print(honk_element.honking)
+ True
+
+Essentially, this allows you to give Elements a custom API
+based on their namespace and tag name.
+
+A somewhat related topic is that of `extension functions`_, which use a
+similar mechanism for registering Python functions for use in XPath and XSLT.
+
+.. _`extension functions`: extensions.html
+
+In the setup example above, we associated the ``honk`` Element class
+only with the 'honk' element. If an XML tree contains different
+elements in the same namespace, they do not pick up the same
+implementation:
+
+.. sourcecode:: pycon
+
+ >>> xml = ('<honk xmlns="http://hui.de/honk" honking="true">'
+ ... '<bla/><!--comment-->'
+ ... '</honk>')
+ >>> honk_element = etree.XML(xml, parser)
+ >>> print(honk_element.honking)
+ True
+ >>> print(honk_element[0].honking)
+ Traceback (most recent call last):
+ ...
+ AttributeError: 'lxml.etree._Element' object has no attribute 'honking'
+ >>> print(honk_element[1].text)
+ comment
+
+You can therefore provide one implementation per element name in each
+namespace and have lxml select the right one on the fly. If you want one
+element implementation per namespace (ignoring the element name) or prefer
+having a common class for most elements except a few, you can specify a
+default implementation for an entire namespace by registering that class with
+the empty element name (``None``).
+
+You may consider following an object oriented approach here. If you build a
+class hierarchy of element classes, you can also implement a base class for a
+namespace that is used if no specific element class is provided. Again, you
+can just pass None as an element name:
+
+.. sourcecode:: pycon
+
+ >>> class HonkNSElement(etree.ElementBase):
+ ... def honk(self):
+ ... return "HONK"
+ >>> namespace[None] = HonkNSElement # default Element for namespace
+
+ >>> class HonkElement(HonkNSElement):
+ ... @property
+ ... def honking(self):
+ ... return self.get('honking') == 'true'
+ >>> namespace['honk'] = HonkElement # Element for specific tag
+
+Now you can rely on lxml to always return objects of type HonkNSElement or its
+subclasses for elements of this namespace:
+
+.. sourcecode:: pycon
+
+ >>> xml = ('<honk xmlns="http://hui.de/honk" honking="true">'
+ ... '<bla/><!--comment-->'
+ ... '</honk>')
+ >>> honk_element = etree.fromstring(xml, parser)
+
+ >>> print(type(honk_element))
+ <class 'HonkElement'>
+ >>> print(type(honk_element[0]))
+ <class 'HonkNSElement'>
+
+ >>> print(honk_element.honking)
+ True
+ >>> print(honk_element.honk())
+ HONK
+
+ >>> print(honk_element[0].honk())
+ HONK
+ >>> print(honk_element[0].honking)
+ Traceback (most recent call last):
+ ...
+ AttributeError: 'HonkNSElement' object has no attribute 'honking'
+
+ >>> print(honk_element[1].text) # uses fallback for non-elements
+ comment
+
+Since lxml 4.1, the registration is more conveniently done with
+class decorators. The namespace registry object is callable with
+a name (or ``None``) as its argument and can then be used as a decorator.
+
+.. sourcecode:: pycon
+
+ >>> honk_elements = lookup.get_namespace('http://hui.de/honk')
+
+ >>> @honk_elements(None)
+ ... class HonkNSElement(etree.ElementBase):
+ ... def honk(self):
+ ... return "HONK"
+
+If the class has the same name as the tag, you can also leave out the call
+and use the blank decorator instead:
+
+.. sourcecode:: pycon
+
+ >>> @honk_elements
+ ... class honkel(HonkNSElement):
+ ... @property
+ ... def honking(self):
+ ... return self.get('honking') == 'true'
+
+ >>> xml = '<honkel xmlns="http://hui.de/honk" honking="true"><bla/><!--comment--></honkel>'
+ >>> honk_element = etree.fromstring(xml, parser)
+
+ >>> print(type(honk_element))
+ <class 'honkel'>
+ >>> print(type(honk_element[0]))
+ <class 'HonkNSElement'>
diff --git a/doc/elementsoup.txt b/doc/elementsoup.txt
new file mode 100644
index 0000000..9317f65
--- /dev/null
+++ b/doc/elementsoup.txt
@@ -0,0 +1,222 @@
+====================
+BeautifulSoup Parser
+====================
+
+BeautifulSoup_ is a Python package for working with real-world and broken HTML,
+just like `lxml.html <lxmlhtml.html>`_. As of version 4.x, it can use
+`different HTML parsers
+<http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser>`_,
+each of which has its advantages and disadvantages (see the link).
+
+lxml can make use of BeautifulSoup as a parser backend, just like BeautifulSoup
+can employ lxml as a parser. When using BeautifulSoup from lxml, however, the
+default is to use Python's integrated HTML parser in the
+`html.parser <https://docs.python.org/3/library/html.parser.html>`_ module.
+In order to make use of the HTML5 parser of
+`html5lib <https://pypi.python.org/pypi/html5lib>`_ instead, it is better
+to go directly through the `html5parser module <html5parser.html>`_ in
+``lxml.html``.
+
+A very nice feature of BeautifulSoup is its excellent `support for encoding
+detection`_ which can provide better results for real-world HTML pages that
+do not (correctly) declare their encoding.
+
+.. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/
+.. _`support for encoding detection`: http://www.crummy.com/software/BeautifulSoup/bs4/doc/#unicode-dammit
+.. _ElementSoup: http://effbot.org/zone/element-soup.htm
+
+lxml interfaces with BeautifulSoup through the ``lxml.html.soupparser``
+module. It provides three main functions: ``fromstring()`` and ``parse()``
+to parse a string or file using BeautifulSoup into an ``lxml.html``
+document, and ``convert_tree()`` to convert an existing BeautifulSoup
+tree into a list of top-level Elements.
+
+.. contents::
+..
+ 1 Parsing with the soupparser
+ 2 Entity handling
+ 3 Using soupparser as a fallback
+ 4 Using only the encoding detection
+
+
+Parsing with the soupparser
+===========================
+
+The functions ``fromstring()`` and ``parse()`` behave as known from
+lxml. The first returns a root Element, the second returns an
+ElementTree.
+
+There is also a legacy module called ``lxml.html.ElementSoup``, which
+mimics the interface provided by Fredrik Lundh's ElementSoup_
+module. Note that the ``soupparser`` module was added in lxml 2.0.3.
+Previous versions of lxml 2.0.x only have the ``ElementSoup`` module.
+
+Here is a document full of tag soup, similar to, but not quite like, HTML:
+
+.. sourcecode:: pycon
+
+ >>> tag_soup = '''
+ ... <meta/><head><title>Hello</head><body onload=crash()>Hi all<p>'''
+
+All you need to do is pass it to the ``fromstring()`` function:
+
+.. sourcecode:: pycon
+
+ >>> from lxml.html.soupparser import fromstring
+ >>> root = fromstring(tag_soup)
+
+To see what we have here, you can serialise it:
+
+.. sourcecode:: pycon
+
+ >>> from lxml.etree import tostring
+ >>> print(tostring(root, pretty_print=True).strip())
+ <html>
+ <meta/>
+ <head>
+ <title>Hello</title>
+ </head>
+ <body onload="crash()">Hi all<p/></body>
+ </html>
+
+Not quite what you'd expect from an HTML page, but, well, it was broken
+already, right? The parser did its best, and so now it's a tree.
+
+To control how Element objects are created during the conversion
+of the tree, you can pass a ``makeelement`` factory function to
+``parse()`` and ``fromstring()``. By default, this is based on the
+HTML parser defined in ``lxml.html``.
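+
+As a small sketch, passing the ``makeelement()`` function of a plain XML
+parser builds the result from ordinary ``lxml.etree`` Elements instead of
+the HTML Element classes of ``lxml.html``:
+
+.. sourcecode:: pycon
+
+ >>> from lxml import etree
+ >>> from lxml.html import HtmlElement
+
+ >>> xml_root = fromstring(tag_soup, makeelement=etree.XMLParser().makeelement)
+ >>> print(isinstance(xml_root, HtmlElement))
+ False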
+
+For a quick comparison, libxml2 2.9.1 parses the same tag soup as
+follows. The only difference is that libxml2 tries harder to adhere
+to the structure of an HTML document and moves misplaced tags where
+they (likely) belong. Note, however, that the result can vary between
+parser versions.
+
+.. sourcecode:: html
+
+ <html>
+ <head>
+ <meta/>
+ <title>Hello</title>
+ </head>
+ <body onload="crash()">Hi all<p/></body>
+ </html>
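+
+The third function mentioned in the introduction, ``convert_tree()``, takes
+an already parsed BeautifulSoup tree and returns a list of top-level lxml
+Elements. A minimal sketch, assuming BeautifulSoup 4 is installed:
+
+.. sourcecode:: python
+
+ from bs4 import BeautifulSoup
+ from lxml.html.soupparser import convert_tree
+
+ soup = BeautifulSoup(tag_soup, 'html.parser')
+ elements = convert_tree(soup)  # list of top-level lxml Elements
+ for element in elements:
+     print(element.tag)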
+
+
+Entity handling
+===============
+
+By default, the BeautifulSoup parser also replaces the entities it
+finds by their character equivalent.
+
+.. sourcecode:: pycon
+
+ >>> tag_soup = '<body>&copy;&euro;&#45;&#245;&#445;<p>'
+ >>> body = fromstring(tag_soup).find('.//body')
+ >>> body.text
+ u'\xa9\u20ac-\xf5\u01bd'
+
+If you want them back on the way out, you can just serialise with the
+default encoding, which is 'US-ASCII'.
+
+.. sourcecode:: pycon
+
+ >>> tostring(body)
+ '<body>&#169;&#8364;-&#245;&#445;<p/></body>'
+
+ >>> tostring(body, method="html")
+ '<body>&#169;&#8364;-&#245;&#445;<p></p></body>'
+
+Any other encoding will output the respective byte sequences.
+
+.. sourcecode:: pycon
+
+ >>> tostring(body, encoding="utf-8")
+ '<body>\xc2\xa9\xe2\x82\xac-\xc3\xb5\xc6\xbd<p/></body>'
+
+ >>> tostring(body, method="html", encoding="utf-8")
+ '<body>\xc2\xa9\xe2\x82\xac-\xc3\xb5\xc6\xbd<p></p></body>'
+
+ >>> tostring(body, encoding='unicode')
+ u'<body>\xa9\u20ac-\xf5\u01bd<p/></body>'
+
+ >>> tostring(body, method="html", encoding='unicode')
+ u'<body>\xa9\u20ac-\xf5\u01bd<p></p></body>'
+
+
+Using soupparser as a fallback
+==============================
+
+The downside of using this parser is that it is `much slower`_ than
+the C implemented HTML parser of libxml2 that lxml uses. So if
+performance matters, you might want to consider using ``soupparser``
+only as a fallback for certain cases.
+
+.. _`much slower`: http://blog.ianbicking.org/2008/03/30/python-html-parser-performance/
+
+One common problem of lxml's parser is that it might not get the
+encoding right in cases where the document contains a ``<meta>`` tag
+at the wrong place. In this case, you can exploit the fact that lxml
+serialises much faster than most other HTML libraries for Python.
+Just serialise the document to unicode and if that gives you an
+exception, re-parse it with BeautifulSoup to see if that works
+better.
+
+.. sourcecode:: pycon
+
+ >>> tag_soup = '''\
+ ... <meta http-equiv="Content-Type"
+ ... content="text/html;charset=utf-8" />
+ ... <html>
+ ... <head>
+ ... <title>Hello W\xc3\xb6rld!</title>
+ ... </head>
+ ... <body>Hi all</body>
+ ... </html>'''
+
+ >>> import lxml.html
+ >>> import lxml.html.soupparser
+
+ >>> root = lxml.html.fromstring(tag_soup)
+ >>> try:
+ ... ignore = tostring(root, encoding='unicode')
+ ... except UnicodeDecodeError:
+ ... root = lxml.html.soupparser.fromstring(tag_soup)
+
+
+Using only the encoding detection
+=================================
+
+Even if you prefer lxml's fast HTML parser, you can still benefit
+from BeautifulSoup's `support for encoding detection`_ in the
+``UnicodeDammit`` class. Once it succeeds in decoding the data,
+you can simply pass the resulting Unicode string into lxml's parser.
+
+.. sourcecode:: pycon
+
+ >>> try:
+ ... from bs4 import UnicodeDammit # BeautifulSoup 4
+ ...
+ ... def decode_html(html_string):
+ ... converted = UnicodeDammit(html_string)
+ ... if not converted.unicode_markup:
+ ... raise UnicodeDecodeError(
+ ... "Failed to detect encoding, tried [%s]",
+ ... ', '.join(converted.tried_encodings))
+ ... # print converted.original_encoding
+ ... return converted.unicode_markup
+ ...
+ ... except ImportError:
+ ... from BeautifulSoup import UnicodeDammit # BeautifulSoup 3
+ ...
+ ... def decode_html(html_string):
+ ... converted = UnicodeDammit(html_string, isHTML=True)
+ ... if not converted.unicode:
+ ... raise UnicodeDecodeError(
+ ... "Failed to detect encoding, tried [%s]",
+ ... ', '.join(converted.triedEncodings))
+ ... # print converted.originalEncoding
+ ... return converted.unicode
+
+ >>> root = lxml.html.fromstring(decode_html(tag_soup))
diff --git a/doc/extensions.txt b/doc/extensions.txt
new file mode 100644
index 0000000..45bcf97
--- /dev/null
+++ b/doc/extensions.txt
@@ -0,0 +1,621 @@
+====================================
+Python extensions for XPath and XSLT
+====================================
+
+This document describes how to use Python extension functions in XPath
+and XSLT like this:
+
+.. sourcecode:: xml
+
+ <xsl:value-of select="f:myPythonFunction(.//sometag)" />
+
+and extension elements in XSLT as in the following example:
+
+.. sourcecode:: xml
+
+ <xsl:template match="*">
+ <my:python-extension>
+ <some-content />
+ </my:python-extension>
+ </xsl:template>
+
+
+.. contents::
+..
+ 1 XPath Extension functions
+ 1.1 The FunctionNamespace
+ 1.2 Global prefix assignment
+ 1.3 The XPath context
+ 1.4 Evaluators and XSLT
+ 1.5 Evaluator-local extensions
+ 1.6 What to return from a function
+ 2 XSLT extension elements
+ 2.1 Declaring extension elements
+ 2.2 Applying XSL templates
+ 2.3 Working with read-only elements
+
+..
+ >>> try: from StringIO import StringIO
+ ... except ImportError:
+ ... from io import BytesIO
+ ... def StringIO(s):
+ ... if isinstance(s, str): s = s.encode("UTF-8")
+ ... return BytesIO(s)
+
+
+XPath Extension functions
+=========================
+
+Here is what an extension function looks like. As the first argument,
+it always receives a context object (see below). The other arguments
+are provided by the respective call in the XPath expression, one in
+the following examples. Any number of arguments is allowed:
+
+.. sourcecode:: pycon
+
+ >>> def hello(context, a):
+ ... return "Hello %s" % a
+ >>> def ola(context, a):
+ ... return "Ola %s" % a
+ >>> def loadsofargs(context, *args):
+ ... return "Got %d arguments." % len(args)
+
+
+The FunctionNamespace
+---------------------
+
+In order to use a function in XPath or XSLT, it needs to have a
+(namespaced) name by which it can be called during evaluation. This
+is done using the FunctionNamespace class. For simplicity, we choose
+the empty namespace (None):
+
+.. sourcecode:: pycon
+
+ >>> from lxml import etree
+ >>> ns = etree.FunctionNamespace(None)
+ >>> ns['hello'] = hello
+ >>> ns['countargs'] = loadsofargs
+
+This registers the function `hello` with the name `hello` in the default
+namespace (None), and the function `loadsofargs` with the name `countargs`.
+
+Since lxml 4.1, it is preferred to use the ``FunctionNamespace`` as a decorator.
+Either pass an explicit function name (``@ns("countargs")``), or just use the
+bare decorator to register the function under its own name:
+
+.. sourcecode:: pycon
+
+ >>> @ns
+ ... def hello(context, a):
+ ... return "Hello %s" % a
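+
+Passing an explicit name to the decorator registers the function under that
+name instead of its Python name. For example, this registers the ``ola``
+function from above in the default namespace as well:
+
+.. sourcecode:: pycon
+
+ >>> @ns("ola")
+ ... def ola(context, a):
+ ...     return "Ola %s" % a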
+
+Now we're going to create a document that we can run XPath expressions
+against:
+
+.. sourcecode:: pycon
+
+ >>> root = etree.XML('<a><b>Haegar</b></a>')
+ >>> doc = etree.ElementTree(root)
+
+Done. Now we can have XPath expressions call our new function:
+
+.. sourcecode:: pycon
+
+ >>> print(root.xpath("hello('Dr. Falken')"))
+ Hello Dr. Falken
+ >>> print(root.xpath('hello(local-name(*))'))
+ Hello b
+ >>> print(root.xpath('hello(string(b))'))
+ Hello Haegar
+ >>> print(root.xpath('countargs(., b, ./*)'))
+ Got 3 arguments.
+
+Note how we call both a Python function (``hello()``) and an XPath built-in
+function (``string()``) in exactly the same way. Normally, however, you would
+want to separate the two in different namespaces. The FunctionNamespace class
+allows you to do this:
+
+.. sourcecode:: pycon
+
+ >>> ns = etree.FunctionNamespace('http://mydomain.org/myfunctions')
+ >>> ns['hello'] = hello
+
+ >>> prefixmap = {'f' : 'http://mydomain.org/myfunctions'}
+ >>> print(root.xpath('f:hello(local-name(*))', namespaces=prefixmap))
+ Hello b
+
+
+Global prefix assignment
+------------------------
+
+In the last example, you had to specify a prefix for the function namespace.
+If you always use the same prefix for a function namespace, you can also
+register it with the namespace:
+
+.. sourcecode:: pycon
+
+ >>> ns = etree.FunctionNamespace('http://mydomain.org/myother/functions')
+ >>> ns.prefix = 'es'
+ >>> ns['hello'] = ola
+
+ >>> print(root.xpath('es:hello(local-name(*))'))
+ Ola b
+
+This is a global assignment, so take care not to assign the same prefix to
+more than one namespace. The resulting behaviour in that case is completely
+undefined. It is always a good idea to consistently use the same meaningful
+prefix for each namespace throughout your application.
+
+The prefix assignment only works with functions and FunctionNamespace objects,
+not with the general Namespace object that registers element classes. The
+reasoning is that elements in lxml do not care about prefixes anyway, so it
+would rather complicate things than be of any help.
+
+
+The XPath context
+-----------------
+
+Functions get a context object as first parameter. In lxml 1.x, this value
+was None, but since lxml 2.0 it provides two properties: ``eval_context`` and
+``context_node``. The context node is the Element where the current function
+is called:
+
+.. sourcecode:: pycon
+
+ >>> def print_tag(context, nodes):
+ ... print("%s: %s" % (context.context_node.tag, [ n.tag for n in nodes ]))
+
+ >>> ns = etree.FunctionNamespace('http://mydomain.org/printtag')
+ >>> ns.prefix = "pt"
+ >>> ns["print_tag"] = print_tag
+
+ >>> ignore = root.xpath("//*[pt:print_tag(.//*)]")
+ a: ['b']
+ b: []
+
+The ``eval_context`` is a dictionary that is local to the evaluation. It
+allows functions to keep state:
+
+.. sourcecode:: pycon
+
+ >>> def print_context(context):
+ ... context.eval_context[context.context_node.tag] = "done"
+ ... print(sorted(context.eval_context.items()))
+ >>> ns["print_context"] = print_context
+
+ >>> ignore = root.xpath("//*[pt:print_context()]")
+ [('a', 'done')]
+ [('a', 'done'), ('b', 'done')]
+
+
+Evaluators and XSLT
+-------------------
+
+Extension functions work for all ways of evaluating XPath expressions and for
+XSL transformations:
+
+.. sourcecode:: pycon
+
+ >>> e = etree.XPathEvaluator(doc)
+ >>> print(e('es:hello(local-name(/a))'))
+ Ola a
+
+ >>> namespaces = {'f' : 'http://mydomain.org/myfunctions'}
+ >>> e = etree.XPathEvaluator(doc, namespaces=namespaces)
+ >>> print(e('f:hello(local-name(/a))'))
+ Hello a
+
+ >>> xslt = etree.XSLT(etree.XML('''
+ ... <stylesheet version="1.0"
+ ... xmlns="http://www.w3.org/1999/XSL/Transform"
+ ... xmlns:es="http://mydomain.org/myother/functions">
+ ... <output method="text" encoding="ASCII"/>
+ ... <template match="/">
+ ... <value-of select="es:hello(string(//b))"/>
+ ... </template>
+ ... </stylesheet>
+ ... '''))
+ >>> print(xslt(doc))
+ Ola Haegar
+
+It is also possible to register namespaces with a single evaluator after its
+creation. While the following example involves no functions, the idea should
+still be clear:
+
+.. sourcecode:: pycon
+
+ >>> f = StringIO('<a xmlns="http://mydomain.org/myfunctions" />')
+ >>> ns_doc = etree.parse(f)
+ >>> e = etree.XPathEvaluator(ns_doc)
+ >>> e('/a')
+ []
+
+This returns nothing, as we did not ask for the right namespace. When we
+register the namespace with the evaluator, however, we can access it via a
+prefix:
+
+.. sourcecode:: pycon
+
+ >>> e.register_namespace('foo', 'http://mydomain.org/myfunctions')
+ >>> e('/foo:a')[0].tag
+ '{http://mydomain.org/myfunctions}a'
+
+Note that this prefix mapping is only known to this evaluator, as opposed to
+the global mapping of the FunctionNamespace objects:
+
+.. sourcecode:: pycon
+
+ >>> e2 = etree.XPathEvaluator(ns_doc)
+ >>> e2('/foo:a')
+ Traceback (most recent call last):
+ ...
+ lxml.etree.XPathEvalError: Undefined namespace prefix
+
+
+Evaluator-local extensions
+--------------------------
+
+Apart from the global registration of extension functions, there is also a way
+of making extensions known to a single Evaluator or XSLT. All evaluators and
+the XSLT object accept a keyword argument ``extensions`` in their constructor.
+The value is a dictionary mapping (namespace, name) tuples to functions:
+
+.. sourcecode:: pycon
+
+ >>> extensions = {('local-ns', 'local-hello') : hello}
+ >>> namespaces = {'l' : 'local-ns'}
+
+ >>> e = etree.XPathEvaluator(doc, namespaces=namespaces, extensions=extensions)
+ >>> print(e('l:local-hello(string(b))'))
+ Hello Haegar
+
+For larger numbers of extension functions, you can define classes or modules
+and use the ``Extension`` helper:
+
+.. sourcecode:: pycon
+
+ >>> class MyExt:
+ ... def function1(self, _, arg):
+ ... return '1'+arg
+ ... def function2(self, _, arg):
+ ... return '2'+arg
+ ... def function3(self, _, arg):
+ ... return '3'+arg
+
+ >>> ext_module = MyExt()
+ >>> functions = ('function1', 'function2')
+ >>> extensions = etree.Extension( ext_module, functions, ns='local-ns' )
+
+ >>> e = etree.XPathEvaluator(doc, namespaces=namespaces, extensions=extensions)
+ >>> print(e('l:function1(string(b))'))
+ 1Haegar
+
+The optional second argument to ``Extension`` can either be a
+sequence of names to select from the module, a dictionary that
+explicitly maps function names to their XPath alter ego, or ``None``
+(explicitly passed) to take all available functions under their
+original name (as long as their name does not start with '_').
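+
+For illustration (this example is not part of the original set), the mapping
+goes from Python function name to XPath name, so the hypothetical XPath name
+``first`` can be bound to ``function1`` like this:
+
+.. sourcecode:: pycon
+
+ >>> extensions = etree.Extension(ext_module, {'function1' : 'first'},
+ ...                              ns='local-ns')
+ >>> e = etree.XPathEvaluator(doc, namespaces=namespaces, extensions=extensions)
+ >>> print(e('l:first(string(b))'))
+ 1Haegar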
+
+The additional ``ns`` keyword argument takes a namespace URI, or
+``None`` (also the default if it is left out) to select the default
+namespace. The following
+examples will therefore all do the same thing:
+
+.. sourcecode:: pycon
+
+ >>> functions = ('function1', 'function2', 'function3')
+ >>> extensions = etree.Extension( ext_module, functions )
+ >>> e = etree.XPathEvaluator(doc, extensions=extensions)
+ >>> print(e('function1(function2(function3(string(b))))'))
+ 123Haegar
+
+ >>> extensions = etree.Extension( ext_module, functions, ns=None )
+ >>> e = etree.XPathEvaluator(doc, extensions=extensions)
+ >>> print(e('function1(function2(function3(string(b))))'))
+ 123Haegar
+
+ >>> extensions = etree.Extension(ext_module)
+ >>> e = etree.XPathEvaluator(doc, extensions=extensions)
+ >>> print(e('function1(function2(function3(string(b))))'))
+ 123Haegar
+
+ >>> functions = {
+ ... 'function1' : 'function1',
+ ... 'function2' : 'function2',
+ ... 'function3' : 'function3'
+ ... }
+ >>> extensions = etree.Extension(ext_module, functions)
+ >>> e = etree.XPathEvaluator(doc, extensions=extensions)
+ >>> print(e('function1(function2(function3(string(b))))'))
+ 123Haegar
+
+For convenience, you can also pass a sequence of extensions:
+
+.. sourcecode:: pycon
+
+ >>> extensions1 = etree.Extension(ext_module)
+ >>> extensions2 = etree.Extension(ext_module, ns='local-ns')
+ >>> e = etree.XPathEvaluator(doc, extensions=[extensions1, extensions2],
+ ... namespaces=namespaces)
+ >>> print(e('function1(l:function2(function3(string(b))))'))
+ 123Haegar
+
+
+What to return from a function
+------------------------------
+
+.. _`XPath return values`: xpathxslt.html#xpath-return-values
+
+Extension functions can return any data type for which there is an XPath
+equivalent (see the documentation on `XPath return values`). This includes
+numbers, boolean values, elements and lists of elements. Note that integers
+will also be returned as floats:
+
+.. sourcecode:: pycon
+
+ >>> def returnsFloat(_):
+ ... return 1.7
+ >>> def returnsInteger(_):
+ ... return 1
+ >>> def returnsBool(_):
+ ... return True
+ >>> def returnFirstNode(_, nodes):
+ ... return nodes[0]
+
+ >>> ns = etree.FunctionNamespace(None)
+ >>> ns['float'] = returnsFloat
+ >>> ns['int'] = returnsInteger
+ >>> ns['bool'] = returnsBool
+ >>> ns['first'] = returnFirstNode
+
+ >>> e = etree.XPathEvaluator(doc)
+ >>> e("float()")
+ 1.7
+ >>> e("int()")
+ 1.0
+ >>> int( e("int()") )
+ 1
+ >>> e("bool()")
+ True
+ >>> e("count(first(//b))")
+ 1.0
+
+As the last example shows, you can pass the results of functions back into
+the XPath expression. Elements and sequences of elements are treated as
+XPath node-sets:
+
+.. sourcecode:: pycon
+
+ >>> def returnsNodeSet(_):
+ ... results1 = etree.Element('results1')
+ ... etree.SubElement(results1, 'result').text = "Alpha"
+ ... etree.SubElement(results1, 'result').text = "Beta"
+ ...
+ ... results2 = etree.Element('results2')
+ ... etree.SubElement(results2, 'result').text = "Gamma"
+ ... etree.SubElement(results2, 'result').text = "Delta"
+ ...
+ ... results3 = etree.SubElement(results2, 'subresult')
+ ... return [results1, results2, results3]
+
+ >>> ns['new-node-set'] = returnsNodeSet
+
+ >>> e = etree.XPathEvaluator(doc)
+
+ >>> r = e("new-node-set()/result")
+ >>> print([ t.text for t in r ])
+ ['Alpha', 'Beta', 'Gamma', 'Delta']
+
+ >>> r = e("new-node-set()")
+ >>> print([ t.tag for t in r ])
+ ['results1', 'results2', 'subresult']
+ >>> print([ len(t) for t in r ])
+ [2, 3, 0]
+ >>> r[0][0].text
+ 'Alpha'
+
+ >>> etree.tostring(r[0])
+ b'<results1><result>Alpha</result><result>Beta</result></results1>'
+
+ >>> etree.tostring(r[1])
+ b'<results2><result>Gamma</result><result>Delta</result><subresult/></results2>'
+
+ >>> etree.tostring(r[2])
+ b'<subresult/>'
+
+The current implementation deep-copies newly created elements in node-sets.
+Only the elements and their children are passed on; no outlying parents or
+tail texts will be available in the result. This also means that, in the above
+example, the `subresult` elements in `results2` and `results3` are no longer
+identical within the node-set; they belong to independent trees:
+
+.. sourcecode:: pycon
+
+ >>> print("%s - %s" % (r[1][-1].tag, r[2].tag))
+ subresult - subresult
+ >>> print(r[1][-1] == r[2])
+ False
+ >>> print(r[1][-1].getparent().tag)
+ results2
+ >>> print(r[2].getparent())
+ None
+
+This is an implementation detail that you should be aware of, but you should
+avoid relying on it in your code. Note that elements taken from the source
+document (the most common case) do not suffer from this restriction. They
+will always be passed unchanged.
+
+
+XSLT extension elements
+=======================
+
+Just like the XPath extension functions described above, lxml supports
+custom extension *elements* in XSLT. This means you can write XSLT
+code like this:
+
+.. sourcecode:: xml
+
+ <xsl:template match="*">
+ <my:python-extension>
+ <some-content />
+ </my:python-extension>
+ </xsl:template>
+
+And then you can implement the element in Python like this:
+
+.. sourcecode:: pycon
+
+ >>> class MyExtElement(etree.XSLTExtension):
+ ... def execute(self, context, self_node, input_node, output_parent):
+ ... print("Hello from XSLT!")
+ ... output_parent.text = "I did it!"
+ ... # just copy our own element content to the output
+ ... output_parent.extend( list(self_node) )
+
+The arguments passed to the ``.execute()`` method are
+
+context
+ The opaque evaluation context. You need this when calling back
+ into the XSLT processor.
+
+self_node
+ A read-only Element object that represents the extension element
+ in the stylesheet.
+
+input_node
+ The current context Element in the input document (also read-only).
+
+output_parent
+ The current insertion point in the output document. You can
+ append elements or set the text value (not the tail). Apart from
+ that, the Element is read-only.
+
+
+Declaring extension elements
+----------------------------
+
+In XSLT, extension elements can be used like any other XSLT element,
+except that they must be declared as extensions using the standard
+XSLT ``extension-element-prefixes`` option:
+
+.. sourcecode:: pycon
+
+ >>> xslt_ext_tree = etree.XML('''
+ ... <xsl:stylesheet version="1.0"
+ ... xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ ... xmlns:my="testns"
+ ... extension-element-prefixes="my">
+ ... <xsl:template match="/">
+ ... <foo><my:ext><child>XYZ</child></my:ext></foo>
+ ... </xsl:template>
+ ... <xsl:template match="child">
+ ... <CHILD>--xyz--</CHILD>
+ ... </xsl:template>
+ ... </xsl:stylesheet>''')
+
+To register the extension, add its namespace and name to the extension
+mapping of the XSLT object:
+
+.. sourcecode:: pycon
+
+ >>> my_extension = MyExtElement()
+ >>> extensions = { ('testns', 'ext') : my_extension }
+ >>> transform = etree.XSLT(xslt_ext_tree, extensions = extensions)
+
+Note how we pass an instance here, not the class of the extension.
+Now we can run the transformation and see how our extension is
+called:
+
+.. sourcecode:: pycon
+
+ >>> root = etree.XML('<dummy/>')
+ >>> result = transform(root)
+ Hello from XSLT!
+ >>> str(result)
+ '<?xml version="1.0"?>\n<foo>I did it!<child>XYZ</child></foo>\n'
+
+
+Applying XSL templates
+----------------------
+
+XSLT extensions are a very powerful feature that allows you to
+interact directly with the XSLT processor. You have full read-only
+access to the input document and the stylesheet, and you can even call
+back into the XSLT processor to process templates. Here is an example
+that passes an Element into the ``.apply_templates()`` method of the
+``XSLTExtension`` instance:
+
+.. sourcecode:: pycon
+
+ >>> class MyExtElement(etree.XSLTExtension):
+ ... def execute(self, context, self_node, input_node, output_parent):
+ ... child = self_node[0]
+ ... results = self.apply_templates(context, child)
+ ... output_parent.append(results[0])
+
+ >>> my_extension = MyExtElement()
+ >>> extensions = { ('testns', 'ext') : my_extension }
+ >>> transform = etree.XSLT(xslt_ext_tree, extensions = extensions)
+
+ >>> root = etree.XML('<dummy/>')
+ >>> result = transform(root)
+ >>> str(result)
+ '<?xml version="1.0"?>\n<foo><CHILD>--xyz--</CHILD></foo>\n'
+
+Here, we applied the templates to a child of the extension element
+itself, i.e. to an element inside the stylesheet instead of an element
+of the input document.
+
+The return value of ``.apply_templates()`` is always a list. It may
+contain a mix of elements and strings, collected from the XSLT processing
+result. If you want to append these values to the output parent, be aware
+that you cannot use the ``.append()`` method to add strings. In many
+cases, you would only be interested in elements anyway, so you can discard
+strings (e.g. formatting whitespace) and append the rest.
+
+If you want to include string results in the output, you can either build
+an appropriate tree yourself and append that, or you can manually add the
+string values to the current output tree, e.g. by concatenating them with
+the ``.tail`` of the last element that was appended.
+
+Note that you can also let lxml build the result tree for you by passing
+the ``output_parent`` into the ``.apply_templates()`` method. In this
+case, the result will be None and all content found by applying templates
+will be appended to the output parent.
+
+If you do not care about string results at all, e.g. because you already
+know that they will only contain whitespace, you can pass the option
+``elements_only=True`` to the ``.apply_templates()`` method, or pass
+``remove_blank_text=True`` to remove only those strings that consist
+entirely of whitespace.
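+
+For illustration, here is a small sketch that combines both options. It is
+not part of the original examples: the ``AppendingExtension`` name is made
+up, it assumes that ``output_parent`` can be passed as the third argument to
+``.apply_templates()``, and the expected output assumes that the applied
+template produces no additional text nodes. It reuses the ``xslt_ext_tree``
+stylesheet from above:
+
+.. sourcecode:: pycon
+
+ >>> class AppendingExtension(etree.XSLTExtension):
+ ...     def execute(self, context, self_node, input_node, output_parent):
+ ...         # let lxml append the template results directly to the output,
+ ...         # dropping any whitespace-only strings along the way
+ ...         self.apply_templates(context, self_node[0], output_parent,
+ ...                              remove_blank_text=True)
+
+ >>> my_extension = AppendingExtension()
+ >>> extensions = { ('testns', 'ext') : my_extension }
+ >>> transform = etree.XSLT(xslt_ext_tree, extensions = extensions)
+
+ >>> result = transform(etree.XML('<dummy/>'))
+ >>> str(result)
+ '<?xml version="1.0"?>\n<foo><CHILD>--xyz--</CHILD></foo>\n'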
+
+
+Working with read-only elements
+-------------------------------
+
+There is one important thing to keep in mind: all Elements that the
+``execute()`` method gets to deal with are read-only Elements, so you
+cannot modify them. They also will not easily work in the API. For
+example, you cannot pass them to the ``tostring()`` function or wrap
+them in an ``ElementTree``.
+
+What you can do, however, is to deepcopy them to make them normal
+Elements, and then modify them using the normal etree API. So this
+will work:
+
+.. sourcecode:: pycon
+
+ >>> from copy import deepcopy
+ >>> class MyExtElement(etree.XSLTExtension):
+ ... def execute(self, context, self_node, input_node, output_parent):
+ ... child = deepcopy(self_node[0])
+ ... child.text = "NEW TEXT"
+ ... output_parent.append(child)
+
+ >>> my_extension = MyExtElement()
+ >>> extensions = { ('testns', 'ext') : my_extension }
+ >>> transform = etree.XSLT(xslt_ext_tree, extensions = extensions)
+
+ >>> root = etree.XML('<dummy/>')
+ >>> result = transform(root)
+ >>> str(result)
+ '<?xml version="1.0"?>\n<foo><child>NEW TEXT</child></foo>\n'
diff --git a/doc/html/flattr-badge-large.png b/doc/html/flattr-badge-large.png
new file mode 100644
index 0000000..1105305
--- /dev/null
+++ b/doc/html/flattr-badge-large.png
Binary files differ
diff --git a/doc/html/paypal_btn_donateCC_LG.gif b/doc/html/paypal_btn_donateCC_LG.gif
new file mode 100644
index 0000000..ab78d8b
--- /dev/null
+++ b/doc/html/paypal_btn_donateCC_LG.gif
Binary files differ
diff --git a/doc/html/paypal_btn_donateCC_LG.png b/doc/html/paypal_btn_donateCC_LG.png
new file mode 100644
index 0000000..46f7b89
--- /dev/null
+++ b/doc/html/paypal_btn_donateCC_LG.png
Binary files differ
diff --git a/doc/html/proxies.png b/doc/html/proxies.png
new file mode 100644
index 0000000..c00ccbc
--- /dev/null
+++ b/doc/html/proxies.png
Binary files differ
diff --git a/doc/html/python-xml-title.png b/doc/html/python-xml-title.png
new file mode 100644
index 0000000..1233952
--- /dev/null
+++ b/doc/html/python-xml-title.png
Binary files differ
diff --git a/doc/html/python-xml.png b/doc/html/python-xml.png
new file mode 100644
index 0000000..2a630df
--- /dev/null
+++ b/doc/html/python-xml.png
Binary files differ
diff --git a/doc/html/style.css b/doc/html/style.css
new file mode 100644
index 0000000..4cc454a
--- /dev/null
+++ b/doc/html/style.css
@@ -0,0 +1,399 @@
+body {
+ font: 11pt Arial, Verdana, Helvetica, sans-serif;
+ text-align: center;
+}
+
+@media screen and (min-width: 960px) and (min-height: 720px) and (max-resolution: 150dpi),
+ screen and (min-width: 1280px) and (min-height: 720px) {
+ body {
+ padding: 1em 3em 1em 26em;
+ }
+}
+
+@media screen {
+ div.document, div.footer {
+ max-width: 45em;
+ background-color: white;
+ }
+}
+
+@media print {
+ div.document, div.footer {
+ width: auto;
+ padding-left: 0;
+ }
+
+ div.sidemenu {
+ display: none;
+ }
+}
+
+div.document, div.footer {
+ margin: 1em auto 1em auto;
+ color: #222;
+}
+
+div.document {
+ text-align: left;
+}
+
+div.footer {
+ text-align: center;
+ font-size: 70%;
+}
+
+.center {
+ text-align: center;
+}
+
+/*** TOC ***/
+
+div.contents.topic ul {
+ margin-top: 0;
+}
+
+div.contents.topic ul > li {
+ text-decoration: none;
+ line-height: 1.2em;
+}
+
+div.contents.topic > p > a {
+ text-decoration: none;
+}
+
+/*** side menu ***/
+
+@media (min-width: 1280px) and (min-height: 720px),
+ (min-width: 960px) and (min-height: 720px) and (max-resolution: 150dpi) {
+ div.sidemenu .menutrigger {
+ display: none;
+ }
+
+ div.sidemenu {
+ position: absolute;
+ top: 0;
+ left: 0;
+ width: 26em;
+ font-size: 8pt;
+ text-align: left;
+ border-right: groove gray;
+ border-bottom: groove gray;
+ padding-right: 1ex;
+ background: #FFFAFA /* url(python-xml.png) no-repeat top right */ ;
+ }
+
+ html > body div.sidemenu {
+ /* ignored by IE -> everyone else knows 'fixed', right? */
+ position: fixed;
+ }
+}
+
+@media (max-width: 959px), (max-height: 719px), (max-width: 1279px) and (min-resolution: 151dpi) {
+ div.sidemenu > div.menutrigger {
+ display: block;
+ border: solid darkgreen 2px;
+ padding: 2px;
+ text-align: center;
+ width: 6ex;
+ }
+
+ div.sidemenu > div.menu {
+ display: none;
+ position: absolute;
+ z-index: 999;
+ font-size: 9pt;
+ text-align: left;
+ border: groove gray;
+ padding-right: 1ex;
+ background: #FFFAFA /* url(python-xml.png) no-repeat top right */ ;
+ }
+
+ div.sidemenu:hover > div.menu,
+ div.sidemenu.visible > div.menu {
+ display: block;
+ }
+}
+
+div.sidemenu > div.menu span.section.title {
+ line-height: 1.2em;
+ font-size: 130%;
+}
+
+div.sidemenu > div.menu ul.menu.current li {
+ color: #CC0000;
+ font-size: 105%;
+}
+
+div.sidemenu > div.menu ul.menu.current > li > a {
+ color: #CC0000;
+}
+
+div.sidemenu > div.menu ul.menu.current ul.submenu {
+ display: block;
+}
+
+div.sidemenu > div.menu ul.menu.foreign ul.submenu li {
+ padding-top: 2px;
+ padding-bottom: 2px;
+}
+
+div.sidemenu > div.menu ul.menu.foreign li.menu:hover ul.submenu {
+ display: block;
+ position: absolute;
+ border: groove #990000;
+ padding: 1ex 1ex 1ex 3ex;
+ margin-top: 0;
+ margin-left: 4em;
+ margin-right: -20em;
+ color: #990000;
+ background-color: white;
+}
+
+div.sidemenu > div.menu ul.submenu {
+ display: none;
+}
+
+div.sidemenu > div.menu ul {
+ line-height: 1em;
+ margin: 1ex;
+ padding-left: 1em;
+}
+
+div.banner {
+ font-size: 133%;
+ border: 2px solid darkred;
+ color: darkgreen;
+ line-height: 1em;
+ margin: 1ex;
+ padding: 3pt;
+}
+
+div.banner_link > a {
+ color: darkgreen;
+}
+
+div.banner_image img {
+ max-height: 3em;
+ max-width: 60pt;
+ float: right;
+}
+
+div.document > div.banner {
+ text-align: center;
+}
+
+@media (min-width: 480pt) {
+ div.document > div.banner br.first {
+ display: none;
+ }
+ div.document > div.banner img {
+ max-height: 2em;
+ }
+}
+
+/*** headings ***/
+
+h1.title {
+ background: url(python-xml-title.png) no-repeat;
+ padding: 20px 0 0 160px;
+ min-height: 60px;
+ font-size: 200%;
+}
+
+h1.title, h1 a, h2 a, h3 a {
+ color: #666;
+ font-weight: bold;
+ font-family: Helvetica, sans-serif;
+}
+
+@media screen {
+ div.section > h1:before {
+ margin-left: -2ex;
+ color: #CC0000;
+ content: "\00BB" " ";
+ }
+}
+
+h1 {
+ font-size: 150%;
+}
+
+h2 {
+ font-size: 130%;
+}
+
+h3 {
+ font-size: 110%;
+}
+
+/*** content ***/
+
+a, a:visited {
+ background-color: transparent;
+ font-weight: bold;
+ color: Black;
+ text-decoration: none;
+}
+
+p a:active, ul a:active {
+ color: Red;
+}
+
+p a:hover, ul a:hover {
+ text-decoration: underline;
+}
+
+p {
+ /*margin: 0.5em 0em 1em 0em;*/
+ text-align: justify;
+ line-height: 1.5em;
+ margin: 0.5em 0 0 0;
+}
+
+th.docinfo-name {
+ padding-left: 3ex;
+ text-align: right;
+ font-weight: bold;
+}
+
+hr {
+ clear: both;
+ height: 1px;
+ color: #8CACBB;
+ background-color: transparent;
+}
+
+dt {
+ line-height: 1.5em;
+ margin-left: 1em;
+}
+
+dt:before {
+ content: "\00BB" " ";
+}
+
+div.section ul {
+ line-height: 1.5em;
+ padding-left: 1ex;
+ margin-left: 1.5em;
+}
+
+div.section li {
+ padding-left: 0;
+}
+
+ol {
+ line-height: 1.5em;
+ margin-left: 0;
+}
+
+blockquote {
+ font-family: Times, "Times New Roman", serif;
+ font-style: italic;
+}
+
+div.eyecatcher, p.eyecatcher {
+ font-family: Times, "Times New Roman", serif;
+ text-align: center;
+ font-size: 140%;
+ line-height: 1.2em;
+ margin-left: 12%;
+ margin-right: 12%;
+}
+
+div.pagequote {
+ position: absolute;
+ top: 0;
+ right: 0;
+ padding: 10px 10px 0 0;
+ margin-left: 6em;
+ text-align: right;
+ font-size: 80%;
+ color: #990000;
+}
+
+div.pagequote .reference {
+ font-size: 140%;
+}
+
+html > .pagequote {
+ /* ignored by IE -> everyone else knows 'fixed', right? */
+ position: fixed;
+}
+
+code {
+ color: Black;
+ background-color: #f0f0f0;
+ font-family: "Courier New", Courier, monospace;
+}
+
+pre {
+ padding: 0.5em;
+ border: 1px solid #8cacbb;
+ color: Black;
+ background-color: #f0f0f0;
+ font-family: "Courier New", Courier, monospace;
+}
+
+/* Syntax highlighting */
+
+.syntax { background: #f0f0f0; }
+.syntax .c { color: #60a0b0; font-style: italic } /* Comment */
+.syntax .err { border: 1px solid #FF0000 } /* Error */
+.syntax .k { color: #007020; font-weight: bold } /* Keyword */
+.syntax .o { color: #666666 } /* Operator */
+.syntax .cm { color: #60a0b0; font-style: italic } /* Comment.Multiline */
+.syntax .cp { color: #007020 } /* Comment.Preproc */
+.syntax .c1 { color: #60a0b0; font-style: italic } /* Comment.Single */
+.syntax .cs { color: #60a0b0; background-color: #fff0f0 } /* Comment.Special */
+.syntax .gd { color: #A00000 } /* Generic.Deleted */
+.syntax .ge { font-style: italic } /* Generic.Emph */
+.syntax .gr { color: #FF0000 } /* Generic.Error */
+.syntax .gh { color: #000080; font-weight: bold } /* Generic.Heading */
+.syntax .gi { color: #00A000 } /* Generic.Inserted */
+.syntax .go { color: #404040 } /* Generic.Output */
+.syntax .gp { color: #c65d09; font-weight: bold } /* Generic.Prompt */
+.syntax .gs { font-weight: bold } /* Generic.Strong */
+.syntax .gu { color: #800080; font-weight: bold } /* Generic.Subheading */
+.syntax .gt { color: #0040D0 } /* Generic.Traceback */
+.syntax .kc { color: #007020; font-weight: bold } /* Keyword.Constant */
+.syntax .kd { color: #007020; font-weight: bold } /* Keyword.Declaration */
+.syntax .kp { color: #007020 } /* Keyword.Pseudo */
+.syntax .kr { color: #007020; font-weight: bold } /* Keyword.Reserved */
+.syntax .kt { color: #902000 } /* Keyword.Type */
+.syntax .m { color: #40a070 } /* Literal.Number */
+.syntax .s { color: #4070a0 } /* Literal.String */
+.syntax .na { color: #4070a0 } /* Name.Attribute */
+.syntax .nb { color: #007020 } /* Name.Builtin */
+.syntax .nc { color: #0e84b5; font-weight: bold } /* Name.Class */
+.syntax .no { color: #60add5 } /* Name.Constant */
+.syntax .nd { color: #555555; font-weight: bold } /* Name.Decorator */
+.syntax .ni { color: #d55537; font-weight: bold } /* Name.Entity */
+.syntax .ne { color: #007020 } /* Name.Exception */
+.syntax .nf { color: #06287e } /* Name.Function */
+.syntax .nl { color: #002070; font-weight: bold } /* Name.Label */
+.syntax .nn { color: #0e84b5; font-weight: bold } /* Name.Namespace */
+.syntax .nt { color: #062873; font-weight: bold } /* Name.Tag */
+.syntax .nv { color: #bb60d5 } /* Name.Variable */
+.syntax .ow { color: #007020; font-weight: bold } /* Operator.Word */
+.syntax .w { color: #bbbbbb } /* Text.Whitespace */
+.syntax .mf { color: #40a070 } /* Literal.Number.Float */
+.syntax .mh { color: #40a070 } /* Literal.Number.Hex */
+.syntax .mi { color: #40a070 } /* Literal.Number.Integer */
+.syntax .mo { color: #40a070 } /* Literal.Number.Oct */
+.syntax .sb { color: #4070a0 } /* Literal.String.Backtick */
+.syntax .sc { color: #4070a0 } /* Literal.String.Char */
+.syntax .sd { color: #4070a0; font-style: italic } /* Literal.String.Doc */
+.syntax .s2 { color: #4070a0 } /* Literal.String.Double */
+.syntax .se { color: #4070a0; font-weight: bold } /* Literal.String.Escape */
+.syntax .sh { color: #4070a0 } /* Literal.String.Heredoc */
+.syntax .si { color: #70a0d0; font-style: italic } /* Literal.String.Interpol */
+.syntax .sx { color: #c65d09 } /* Literal.String.Other */
+.syntax .sr { color: #235388 } /* Literal.String.Regex */
+.syntax .s1 { color: #4070a0 } /* Literal.String.Single */
+.syntax .ss { color: #517918 } /* Literal.String.Symbol */
+.syntax .bp { color: #007020 } /* Name.Builtin.Pseudo */
+.syntax .vc { color: #bb60d5 } /* Name.Variable.Class */
+.syntax .vg { color: #bb60d5 } /* Name.Variable.Global */
+.syntax .vi { color: #bb60d5 } /* Name.Variable.Instance */
+.syntax .il { color: #40a070 } /* Literal.Number.Integer.Long */
diff --git a/doc/html/tagpython-big.png b/doc/html/tagpython-big.png
new file mode 100644
index 0000000..8eeb96a
--- /dev/null
+++ b/doc/html/tagpython-big.png
Binary files differ
diff --git a/doc/html5parser.txt b/doc/html5parser.txt
new file mode 100644
index 0000000..b03bb3a
--- /dev/null
+++ b/doc/html5parser.txt
@@ -0,0 +1,80 @@
+===============
+html5lib Parser
+===============
+
+`html5lib`_ is a Python package that implements the HTML5 parsing algorithm,
+which is heavily influenced by current browsers and based on the `WHATWG
+HTML5 specification`_.
+
+.. _html5lib: http://code.google.com/p/html5lib/
+.. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/
+.. _WHATWG HTML5 specification: http://www.whatwg.org/specs/web-apps/current-work/
+
+lxml can benefit from the parsing capabilities of `html5lib` through
+the ``lxml.html.html5parser`` module. It offers an interface similar
+to that of the ``lxml.html`` module, providing ``fromstring()``,
+``parse()``, ``document_fromstring()``, ``fragment_fromstring()`` and
+``fragments_fromstring()``, which work like the regular HTML parsing
+functions.
+
+
+Differences to regular HTML parsing
+===================================
+
+The returned tree differs in a few ways from the one produced by the
+regular HTML parsing functions in ``lxml.html``. html5lib normalizes
+some elements and element structures to a common format. For example,
+even if a table does not have a `tbody`, html5lib will inject one
+automatically:
+
+.. sourcecode:: pycon
+
+ >>> from lxml.html import tostring, html5parser
+ >>> tostring(html5parser.fromstring("<table><td>foo"))
+ '<table><tbody><tr><td>foo</td></tr></tbody></table>'
+
+The parameters that the functions accept also differ.
+
+
+Function Reference
+==================
+
+``parse(filename_url_or_file)``:
+ Parses the named file or URL, or, if the object has a ``.read()``
+ method, parses from that.
+
+``document_fromstring(html, guess_charset=True)``:
+ Parses a document from the given string. This always creates a
+ correct HTML document, which means the parent node is ``<html>``,
+ and there is a body and possibly a head.
+
+ If a bytestring is passed and ``guess_charset`` is true, the chardet
+ library (if installed) will guess the charset if ambiguities exist.
+
+``fragment_fromstring(string, create_parent=False, guess_charset=False)``:
+ Returns an HTML fragment from a string. The fragment must contain
+ just a single element, unless ``create_parent`` is given;
+ e.g., ``fragment_fromstring(string, create_parent='div')`` will
+ wrap the element in a ``<div>``. If ``create_parent`` is true, the
+ default parent tag (div) is used.
+
+ If a bytestring is passed and ``guess_charset`` is true, the chardet
+ library (if installed) will guess the charset if ambiguities exist.
+
+``fragments_fromstring(string, no_leading_text=False, parser=None)``:
+ Returns a list of the elements found in the fragment. The first item in
+ the list may be a string. If ``no_leading_text`` is true, then it will
+ be an error if there is leading text, and it will always be a list of
+ only elements.
+
+ If a bytestring is passed and ``guess_charset`` is true, the chardet
+ library (if installed) will guess the charset if ambiguities exist.
+
+``fromstring(string)``:
+ Returns ``document_fromstring`` or ``fragment_fromstring``, based
+ on whether the string looks like a full document, or just a
+ fragment.
+
+Additionally, all parsing functions accept a ``parser`` keyword argument
+that can be set to a custom parser instance. To create custom parsers,
+you can subclass ``HTMLParser`` and ``XHTMLParser`` from the same
+module. Note that these are the parser classes provided by html5lib.
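+
+To illustrate (this short session is not part of the original reference; it
+assumes that html5lib is installed, and return values are not shown because
+the exact element tags can depend on the installed html5lib version):
+
+.. sourcecode:: pycon
+
+ >>> from lxml.html import html5parser
+
+ >>> # a complete document: an <html> root with head and body is created
+ >>> doc = html5parser.document_fromstring("<title>Hello</title><p>some text")
+
+ >>> # a single fragment element
+ >>> el = html5parser.fragment_fromstring("<p>a <b>short</b> paragraph</p>")
+
+ >>> # sibling fragments; any leading text is kept as the first list item
+ >>> parts = html5parser.fragments_fromstring("leading text <p>and an element")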
diff --git a/doc/intro.txt b/doc/intro.txt
new file mode 100644
index 0000000..584c2f2
--- /dev/null
+++ b/doc/intro.txt
@@ -0,0 +1,82 @@
+Why lxml?
+=========
+
+.. contents::
+..
+ 1 Motto
+ 2 Aims
+
+
+Motto
+-----
+
+"the thrills without the strangeness"
+
+To explain the motto:
+
+"Programming with libxml2 is like the thrilling embrace of an exotic stranger.
+It seems to have the potential to fulfill your wildest dreams, but there's a
+nagging voice somewhere in your head warning you that you're about to get
+screwed in the worst way." (`a quote by Mark Pilgrim`_)
+
+Mark Pilgrim was describing in particular the experience a Python programmer
+has when dealing with libxml2. The default Python bindings of libxml2 are
+fast, thrilling, powerful, and your code might fail in some horrible way that
+you really shouldn't have to worry about when writing Python code. lxml
+combines the power of libxml2 with the ease of use of Python.
+
+.. _`a quote by Mark Pilgrim`: https://web.archive.org/web/20110902041836/http://diveintomark.org/archives/2004/02/18/libxml2
+
+
+Aims
+----
+
+The C libraries libxml2_ and libxslt_ have huge benefits:
+
+* Standards-compliant XML support.
+
+* Support for (broken) HTML.
+
+* Full-featured.
+
+* Actively maintained by XML experts.
+
+* fast. fast! FAST!
+
+.. _libxml2: http://www.xmlsoft.org
+
+.. _libxslt: http://xmlsoft.org/XSLT
+
+
+These libraries already ship with Python bindings, but these Python bindings
+mimic the C-level interface. This yields a number of problems:
+
+* very low level and C-ish (not Pythonic).
+
+* underdocumented and huge; you get lost in them.
+
+* UTF-8 in API, instead of Python unicode strings.
+
+* Can easily cause segfaults from Python.
+
+* Require manual memory management!
+
+
+lxml is a new Python binding for libxml2 and libxslt, completely independent
+from these existing Python bindings. Its aims:
+
+* Pythonic API.
+
+* Documented.
+
+* Use Python unicode strings in API.
+
+* Safe (no segfaults).
+
+* No manual memory management!
+
+lxml aims to provide a Pythonic API by following as much as possible the
+`ElementTree API`_. We're trying to avoid inventing too many new APIs, or you
+having to learn new things -- XML is complicated enough.
+
+.. _`ElementTree API`: http://effbot.org/zone/element-index.htm
diff --git a/doc/licenses/BSD.txt b/doc/licenses/BSD.txt
new file mode 100644
index 0000000..a76d0ed
--- /dev/null
+++ b/doc/licenses/BSD.txt
@@ -0,0 +1,29 @@
+Copyright (c) 2004 Infrae. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+ 1. Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ 2. Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in
+ the documentation and/or other materials provided with the
+ distribution.
+
+ 3. Neither the name of Infrae nor the names of its contributors may
+ be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INFRAE OR
+CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/doc/licenses/GPL.txt b/doc/licenses/GPL.txt
new file mode 100644
index 0000000..d60c31a
--- /dev/null
+++ b/doc/licenses/GPL.txt
@@ -0,0 +1,340 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Library General
+Public License instead of this License.
diff --git a/doc/licenses/ZopePublicLicense.txt b/doc/licenses/ZopePublicLicense.txt
new file mode 100644
index 0000000..44e0648
--- /dev/null
+++ b/doc/licenses/ZopePublicLicense.txt
@@ -0,0 +1,59 @@
+Zope Public License (ZPL) Version 2.0
+-----------------------------------------------
+
+This software is Copyright (c) Zope Corporation (tm) and
+Contributors. All rights reserved.
+
+This license has been certified as open source. It has also
+been designated as GPL compatible by the Free Software
+Foundation (FSF).
+
+Redistribution and use in source and binary forms, with or
+without modification, are permitted provided that the
+following conditions are met:
+
+1. Redistributions in source code must retain the above
+ copyright notice, this list of conditions, and the following
+ disclaimer.
+
+2. Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions, and the following
+ disclaimer in the documentation and/or other materials
+ provided with the distribution.
+
+3. The name Zope Corporation (tm) must not be used to
+ endorse or promote products derived from this software
+ without prior written permission from Zope Corporation.
+
+4. The right to distribute this software or to use it for
+ any purpose does not give you the right to use Servicemarks
+ (sm) or Trademarks (tm) of Zope Corporation. Use of them is
+ covered in a separate agreement (see
+ http://www.zope.com/Marks).
+
+5. If any files are modified, you must cause the modified
+ files to carry prominent notices stating that you changed
+ the files and the date of any change.
+
+Disclaimer
+
+ THIS SOFTWARE IS PROVIDED BY ZOPE CORPORATION ``AS IS''
+ AND ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT
+ NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
+ NO EVENT SHALL ZOPE CORPORATION OR ITS CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
+ DAMAGE.
+
+
+This software consists of contributions made by Zope
+Corporation and many individuals on behalf of Zope
+Corporation. Specific attributions are listed in the
+accompanying credits file.
diff --git a/doc/licenses/elementtree.txt b/doc/licenses/elementtree.txt
new file mode 100644
index 0000000..dde28f1
--- /dev/null
+++ b/doc/licenses/elementtree.txt
@@ -0,0 +1,25 @@
+The ElementTree / XML Toys Library is
+
+Copyright (c) 1999-2003 by Secret Labs AB
+Copyright (c) 1999-2003 by Fredrik Lundh
+
+By obtaining, using, and/or copying this software and/or its
+associated documentation, you agree that you have read, understood,
+and will comply with the following terms and conditions:
+
+Permission to use, copy, modify, and distribute this software and its
+associated documentation for any purpose and without fee is hereby
+granted, provided that the above copyright notice appears in all
+copies, and that both that copyright notice and this permission notice
+appear in supporting documentation, and that the name of Secret Labs
+AB or the author not be used in advertising or publicity pertaining to
+distribution of the software without specific, written prior
+permission.
+
+SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
+THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
+ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
diff --git a/doc/lxml-source-howto.txt b/doc/lxml-source-howto.txt
new file mode 100644
index 0000000..327eae8
--- /dev/null
+++ b/doc/lxml-source-howto.txt
@@ -0,0 +1,313 @@
+==============================
+How to read the source of lxml
+==============================
+
+:Author:
+ Stefan Behnel
+
+.. meta::
+ :description: How to read and work on the source code of lxml
+ :keywords: lxml, XML, Cython, source code, develop, comprehend, understand
+
+This document describes how to read the source code of lxml_ and how
+to start working on it. You might also be interested in the companion
+document that describes `how to build lxml from sources`_.
+
+.. _lxml: http://lxml.de/
+.. _`how to build lxml from sources`: build.html
+.. _`ReStructured Text`: http://docutils.sourceforge.net/rst.html
+.. _epydoc: http://epydoc.sourceforge.net/
+.. _docutils: http://docutils.sourceforge.net/
+.. _`C-level API`: capi.html
+
+.. contents::
+..
+ 1 What is Cython?
+ 2 Where to start?
+ 2.1 Concepts
+ 2.2 The documentation
+ 3 lxml.etree
+ 4 lxml.objectify
+ 5 lxml.html
+
+
+What is Cython?
+===============
+
+.. _Cython: http://cython.org/
+.. _Pyrex: http://www.cosc.canterbury.ac.nz/~greg/python/Pyrex/
+
+Cython_ is the language that lxml is written in. It is a very
+Python-like language that was specifically designed for writing Python
+extension modules.
+
+The reason why Cython (or actually its predecessor Pyrex_ at the time)
+was chosen as an implementation language for lxml is that it makes it
+very easy to interface with both the Python world and external C code.
+Cython generates all the necessary glue code for the Python API,
+including Python types, calling conventions and reference counting.
+On the other side of the table, calling into C code is no more than
+declaring the signature of the function and maybe some variables as
+being C types, pointers or structs, and then calling it. The rest of
+the code is just plain Python code.
+
+The Cython language is so close to Python that the Cython compiler can
+actually compile many, many Python programs to C without major
+modifications. But the real speed gains of a C compilation come from
+type annotations that were added to the language and that allow Cython
+to generate very efficient C code.
+
+Even if you are not familiar with Cython, you should keep in mind that
+a slow implementation of a feature is better than none. So, if you
+want to contribute and have an idea of what code you want to write, feel
+free to start with a pure Python implementation. Chances are, if you
+get the change officially accepted and integrated, others will take
+the time to optimise it so that it runs fast in Cython.
+
+
+Where to start?
+===============
+
+First of all, read `how to build lxml from sources`_ to learn how to
+retrieve the source code from the GitHub repository and how to
+build it. The source code lives in the subdirectory ``src`` of the
+checkout.
+
+The main extension modules in lxml are ``lxml.etree`` and
+``lxml.objectify``. All main modules have the file extension
+``.pyx``, which shows their descent from Pyrex. As usual in Python,
+the main files start with a short description and a couple of imports.
+Cython distinguishes between the run-time ``import`` statement (as
+known from Python) and the compile-time ``cimport`` statement, which
+imports C declarations, either from external libraries or from other
+Cython modules.
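+
+For example, the two forms could appear next to each other like this (a
+generic illustration, not taken from the lxml sources)::
+
+    import re                          # normal run-time Python import
+    from libc.string cimport strlen    # compile-time cimport of a C declaration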
+
+
+Concepts
+--------
+
+lxml's tree API is based on proxy objects. That means, every Element
+object (or rather ``_Element`` object) is a proxy for a libxml2 node
+structure. The class declaration is (mainly)::
+
+ cdef class _Element:
+ cdef _Document _doc
+ cdef xmlNode* _c_node
+
+By convention, C variables and C level class members that are passed
+into libxml2 carry a ``c_`` prefix (they are commonly libxml2 struct
+pointers), and C level class members are prefixed with an underscore.
+So you will often see names like ``c_doc`` for an ``xmlDoc*`` variable
+(or ``c_node`` for an ``xmlNode*``), or the above ``_c_node`` for a
+class member that points to an ``xmlNode`` struct (or ``_c_doc`` for
+an ``xmlDoc*``).
+
+It is important to know that every proxy in lxml has a factory
+function that properly sets up C level members. Proxy objects must
+*never* be instantiated outside of that factory. For example, to
+instantiate an _Element object or its subclasses, you must always call
+its factory function::
+
+ cdef xmlNode* c_node
+ cdef _Document doc
+ cdef _Element element
+ ...
+ element = _elementFactory(doc, c_node)
+
+A good place to see how this factory is used is the implementation of
+the Element methods ``getparent()``, ``getnext()`` and
+``getprevious()``.
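+
+For illustration, such a method looks roughly like this (a simplified
+sketch, not the literal lxml code; the helper names are
+approximations)::
+
+    def getparent(self):
+        # navigate on the C level first
+        cdef xmlNode* c_parent = self._c_node.parent
+        if c_parent is NULL or not _isElement(c_parent):
+            return None
+        # then wrap the C node in a proxy through the factory
+        return _elementFactory(self._doc, c_parent)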
+
+
+The documentation
+-----------------
+
+An important part of lxml is the documentation that lives in the
+``doc`` directory. It describes a large part of the API and comprises
+a lot of example code in the form of doctests.
+
+The documentation is written in the `ReStructured Text`_ format, a
+very powerful text markup language that looks almost like plain text.
+It is part of the docutils_ package.
+
+The project web site of lxml_ is completely generated from these text
+documents. Even the side menu is just collected from the table of
+contents that the ReST processor writes into each HTML page.
+Obviously, we use lxml for this.
+
+The easiest way to generate the HTML pages is by calling::
+
+ make html
+
+This will call the script ``doc/mkhtml.py`` to run the ReST processor
+on the files. After generating an HTML page the script parses it back
+in to build the side menu, and injects the complete menu into each
+page at the very end.
+
+Running the ``make`` command will also generate the API documentation
+if you have epydoc_ installed. The epydoc package will import and
+introspect the extension modules and also introspect and parse the
+Python modules of lxml. The aggregated information will then be
+written out into an HTML documentation site.
+
+
+lxml.etree
+==========
+
+The main module, ``lxml.etree``, is in the file `lxml.etree.pyx
+<https://github.com/lxml/lxml/blob/master/src/lxml/etree.pyx>`_. It
+implements the main functions and types of the ElementTree API, as
+well as all the factory functions for proxies. It is the best place
+to start if you want to find out how a specific feature is
+implemented.
+
+At the very end of the file, it contains a series of ``include``
+statements that merge the rest of the implementation into the
+generated C code. Yes, you read right: no importing, no source file
+namespacing, just plain good old include and a huge C code result of
+more than 100,000 lines that we throw right into the C compiler.
+
+The main include files are:
+
+apihelpers.pxi
+ Private C helper functions. Except for the factory functions,
+ most of the little functions that are used all over the place are
+ defined here. This includes things like reading out the text
+ content of a libxml2 tree node, checking input from the API level,
+ creating a new Element node or handling attribute values. If you
+ want to work on the lxml code, you should keep these functions in
+ the back of your head, as they will definitely make your life
+ easier.
+
+classlookup.pxi
+ Element class lookup mechanisms. The main API and engines for
+ those who want to define custom Element classes and inject them
+ into lxml.
+
+docloader.pxi
+ Support for custom document loaders. Base class and registry for
+ custom document resolvers.
+
+extensions.pxi
+ Infrastructure for extension functions in XPath/XSLT, including
+ XPath value conversion and function registration.
+
+iterparse.pxi
+ Incremental XML parsing. An iterator class that builds iterparse
+ events while parsing.
+
+nsclasses.pxi
+ Namespace implementation and registry. The registry and engine
+ for Element classes that use the ElementNamespaceClassLookup
+ scheme.
+
+parser.pxi
+ Parsers for XML and HTML. This is the main parser engine. It's
+ the reason why you can parse a document from various sources in
+ two lines of Python code. It's definitely not the right place to
+ start reading lxml's source code.
+
+parsertarget.pxi
+ An ElementTree compatible parser target implementation based on
+ the SAX2 interface of libxml2.
+
+proxy.pxi
+ Very low-level functions for memory allocation/deallocation
+ and Element proxy handling. Ignoring this at the beginning will
+ save your head from exploding.
+
+public-api.pxi
+ The set of C functions that are exported to other extension
+ modules at the C level. For example, ``lxml.objectify`` makes use
+ of these. See the `C-level API`_ documentation.
+
+readonlytree.pxi
+ A separate read-only implementation of the Element API. This is
+ used in places where non-intrusive access to a tree is required,
+ such as the ``PythonElementClassLookup`` or XSLT extension
+ elements.
+
+saxparser.pxi
+ SAX-like parser interfaces as known from ElementTree's TreeBuilder.
+
+serializer.pxi
+ XML output functions. Basically everything that creates byte
+ sequences from XML trees.
+
+xinclude.pxi
+ XInclude support.
+
+xmlerror.pxi
+ Error log handling. All error messages that libxml2 generates
+ internally walk through the code in this file to end up in lxml's
+ Python level error logs.
+
+ At the end of the file, you will find a long list of named error
+ codes. It is generated from the libxml2 HTML documentation (using
+ lxml, of course). See the script ``update-error-constants.py``
+ for this.
+
+xmlid.pxi
+ XMLID and IDDict, a dictionary-like way to find Elements by their
+ XML-ID attribute.
+
+xpath.pxi
+ XPath evaluators.
+
+xslt.pxi
+ XSL transformations, including the ``XSLT`` class, document lookup
+ handling and access control.
+
+The different schema languages (DTD, RelaxNG, XML Schema and
+Schematron) are implemented in the following include files:
+
+* dtd.pxi
+* relaxng.pxi
+* schematron.pxi
+* xmlschema.pxi
+
+
+Python modules
+==============
+
+The ``lxml`` package also contains a number of pure Python modules:
+
+builder.py
+ The E-factory and the ``ElementMaker`` class. These provide a
+ simple interface to XML tree generation.
+
+cssselect.py
+ A CSS selector implementation based on XPath. The main class is
+ called ``CSSSelector``.
+
+doctestcompare.py
+ A relaxed comparison scheme for XML/HTML markup in doctest.
+
+ElementInclude.py
+ XInclude-like document inclusion, compatible with ElementTree.
+
+_elementpath.py
+ XPath-like path language, compatible with ElementTree.
+
+sax.py
+ SAX2 compatible interfaces to copy lxml trees from/to SAX compatible
+ tools.
+
+usedoctest.py
+ Wrapper module for ``doctestcompare.py`` that simplifies its usage
+ from inside a doctest.
+
+
+lxml.objectify
+==============
+
+A Cython implemented extension module that uses the public C-API of
+lxml.etree. It provides a Python object-like interface to XML trees.
+The implementation resides in the file `lxml.objectify.pyx
+<https://github.com/lxml/lxml/blob/master/src/lxml/objectify.pyx>`_.
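+
+A small doctest-style example gives an idea of what that means (the
+exact API is described in the objectify documentation)::
+
+    >>> from lxml import objectify
+    >>> root = objectify.fromstring("<root><price>20</price></root>")
+    >>> root.price + 1.5
+    21.5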
+
+
+lxml.html
+=========
+
+A specialised toolkit for HTML handling, based on lxml.etree. This is
+implemented in pure Python.
diff --git a/doc/lxml.mgp b/doc/lxml.mgp
new file mode 100644
index 0000000..1b2386f
--- /dev/null
+++ b/doc/lxml.mgp
@@ -0,0 +1,122 @@
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%deffont "standard" xfont "helvetica-medium-r"
+%deffont "thick" xfont "helvetica-bold-r"
+%deffont "typewriter" xfont "courier-medium-r"
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%%
+%% Default settings per each line numbers.
+%%
+%default 1 area 90 90, leftfill, size 2, fore "gray20", back "white", font "standard", hgap 0
+%default 2 size 7, vgap 10, prefix " ", ccolor "blue"
+%default 3 size 2, bar "gray70", vgap 10
+%default 4 size 5, fore "gray20", vgap 30, prefix " ", font "standard"
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%%
+%% Default settings that are applied to TAB-indented lines.
+%%
+%tab 1 size 5, vgap 40, prefix " ", icon box "red" 50
+%tab 2 size 4, vgap 40, prefix " ", icon arc "yellow" 50
+%tab 3 size 3, vgap 40, prefix " ", icon delta3 "white" 40
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%page
+
+lxml - a sane Python wrapper for libxml
+
+
+
+%center
+Martijn Faassen, Infrae
+faassen@infrae.com
+
+%page
+
+The C library libxml has huge benefits
+
+
+ Standards-compliant XML support
+
+ full-featured
+
+ actively maintained by XML experts
+
+ fast. fast! FAST!
+
+%page
+
+Features of libxml
+
+
+ Parsing
+
+ Tree based (DOM-ish) XML structure
+
+ XPath support
+
+ XSLT support (libxslt)
+
+ Relax NG (schema) support
+
+ And more
+
+%page
+
+But libxml already has Python bindings!
+
+
+ very low level and C-ish (not Pythonic)
+
+ underdocumented. huge, you get lost in them
+
+ works with UTF-8, not native Python unicode
+
+ can cause segfaults from Python
+
+ have to do manual memory management!
+
+%page
+
+lxml is a new Python binding for libxml
+
+Aims (read: TODOS)
+
+ Pythonic API
+
+ Documented
+
+ Use Python unicode strings in API
+
+ Safe (no segfaults)
+
+ No manual memory management!
+
+%page
+
+Tradeoffs
+
+
+ Slower because of better wrapping.
+
+ But libxml is so fast this likely doesn't matter much.
+
+ Not all features of libxml exposed (unless you help)
+
+%page
+
+What is there now - Proof of concept
+
+
+ Automatic destruction of documents (refcounted)
+
+ Start of ElementTree style API for tree
+
+%page
+
+Future
+
+
+ Fix bugs, add features
+
+ Moving into svn repository on codespeak.net
+
+ Help!
+
diff --git a/doc/lxml2.txt b/doc/lxml2.txt
new file mode 100644
index 0000000..8ee5345
--- /dev/null
+++ b/doc/lxml2.txt
@@ -0,0 +1,269 @@
+=======================
+What's new in lxml 2.0?
+=======================
+
+.. contents::
+..
+ 1 Changes in etree and objectify
+ 1.1 Incompatible changes
+ 1.2 Enhancements
+ 1.3 Deprecation
+ 2 New modules
+ 2.1 lxml.usedoctest
+ 2.2 lxml.html
+ 2.3 lxml.cssselect
+
+
+During the development of the lxml 1.x series, a couple of quirks were
+discovered in the design that made the API less obvious and its future
+extensions harder than necessary. lxml 2.0 is a soft evolution of lxml 1.x
+towards a simpler, more consistent and more powerful API - with some major
+extensions. Wherever possible, lxml 1.3 comes close to the semantics of lxml
+2.0, so that migrating should be easier for code that currently runs with 1.3.
+
+One of the important internal changes was the switch from the Pyrex_
+compiler to Cython_, which provides better optimisation and improved
+support for newer Python language features. This allows the code of
+lxml to become more Python-like again, while the performance improves
+as Cython continues its own development. The code simplification,
+which will continue throughout the 2.x series, will hopefully make it
+even easier for users to contribute.
+
+.. _Cython: http://www.cython.org/
+.. _Pyrex: http://www.cosc.canterbury.ac.nz/~greg/python/Pyrex/
+
+
+Changes in etree and objectify
+==============================
+
+A move towards a more consistent API cannot happen without a certain
+number of incompatible changes.  The following is a list of the
+differences that applications need to take into account when migrating
+from lxml 1.x to lxml 2.0.
+
+Incompatible changes
+--------------------
+
+* lxml 0.9 introduced a feature called `namespace implementation`_. The
+ global ``Namespace`` factory was added to register custom element classes
+ and have lxml.etree look them up automatically. However, the later
+ development of further class lookup mechanisms made it appear less and less
+ adequate to register this mapping at a global level, so lxml 1.1 first
+ removed the namespace based lookup from the default setup and lxml 2.0
+ finally removes the global namespace registry completely. Like all
+ other lookup mechanisms, the namespace lookup is now local to a
+ parser, including the registry itself. Applications that use a
+ module-level parser can easily
+ map its ``get_namespace()`` method to a global ``Namespace`` function to
+ mimic the old behaviour.
+
+ .. _`namespace implementation`: element_classes.html#implementing-namespaces
+
+* Some API functions now require passing options as keyword arguments,
+ as opposed to positional arguments. This restriction was introduced
+ to make the API usage independent of future extensions such as the
+ addition of new positional arguments. Users should not rely on the
+ position of an optional argument in function signatures and instead
+ pass it explicitly named. This also improves code readability - it
+ is common good practice to pass options in a consistent way
+ independent of their position, so many people may not even notice
+ the change in their code. Another important reason is compatibility
+ with cElementTree, which also enforces keyword-only arguments in a
+ couple of places.
+
+* XML tag names are validated when creating an Element. This does not
+ apply to HTML tags, where only HTML special characters are
+ forbidden. The distinction is made by the ``SubElement()`` factory,
+ which tests if the tree it works on is an HTML tree, and by the
+ ``.makeelement()`` methods of parsers, which behave differently for
+ the ``XMLParser()`` and the ``HTMLParser()``.
+
+* XPath now raises exceptions specific to the part of the execution that
+ failed: ``XPathSyntaxError`` for parser errors and ``XPathEvalError`` for
+ errors that occurred during the evaluation. Note that the distinction only
+ works for the ``XPath()`` class. The other two evaluators only have a
+ single evaluation call that includes the parsing step, and will therefore
+ only raise an ``XPathEvalError``. Applications can catch both exceptions
+ through the common base class ``XPathError`` (which also exists in earlier
+ lxml versions). A short example follows this list of changes.
+
+* Network access in parsers is now disabled by default, i.e. the
+ ``no_network`` option defaults to True. Due to a somewhat 'interesting'
+ implementation in libxml2, this does not affect the first document (i.e. the
+ URL that is parsed), but only subsequent documents, such as a DTD when
+ parsing with validation. This means that you will have to check the URL you
+ pass, instead of relying on lxml to prevent *any* access to external
+ resources. As this can be helpful in some use cases, lxml does not work
+ around it.
+
+* The type annotations in lxml.objectify (the ``pytype`` attribute) now use
+ ``NoneType`` for the None value as this is the correct Python type name.
+ Previously, lxml 1.x used a lower case ``none``.
+
+* Another change in objectify regards the way it deals with ambiguous types.
+ Previously, setting a value like the string ``"3"`` through normal attribute
+ access would let it come back as an integer when reading the object
+ attribute. lxml 2.0 prevents this by always setting the ``pytype``
+ attribute to the type the user passed in, so ``"3"`` will come back as a
+ string, while the number ``3`` will come back as a number. To remove the
+ type annotation on serialisation, you can use the ``deannotate()`` function.
+
+* The C-API function ``findOrBuildNodeNs()`` was replaced by the more generic
+ ``findOrBuildNodeNsPrefix()`` that accepts an additional default prefix.
+
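+As an illustration of the new XPath exception classes mentioned above
+(a small sketch in the Python 2 doctest style used throughout these
+documents):
+
+.. sourcecode:: pycon
+
+    >>> from lxml import etree
+    >>> try:
+    ...     etree.XPath("//a[")        # malformed expression
+    ... except etree.XPathSyntaxError:
+    ...     print "invalid XPath syntax"
+    invalid XPath syntax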
+
+Enhancements
+------------
+
+Most of the enhancements of lxml 2.0 were made under the hood. Most people
+won't even notice them, but they make the maintenance of lxml easier and thus
+facilitate further enhancements and an improved integration between lxml's
+features.
+
+* lxml.objectify now has its own implementation of the `E factory`_. It uses
+ the built-in type lookup mechanism of lxml.objectify, thus removing the need
+ for an additional type registry mechanism (as previously available through
+ the ``typemap`` parameter).
+
+* XML entities are supported through the ``Entity()`` factory, an Entity
+ element class and a parser option ``resolve_entities`` that allows
+ keeping entities in the element tree when set to False.
+ report undefined entities as errors if it needs to resolve them (which is
+ still the default, as in lxml 1.x).
+
+* A major part of the XPath code was rewritten and can now benefit from a
+ bigger overlap with the XSLT code. The main benefits are improved thread
+ safety in the XPath evaluators and Python RegExp support in standard XPath.
+
+* The string results of an XPath evaluation have become 'smart' string
+ subclasses. Formerly, there was no easy way to find out where a
+ string originated from. In lxml 2.0, you can call its
+ ``getparent()`` method to `find the Element that carries it`_. This
+ works for attributes (``//@attribute``) and for ``text()`` nodes,
+ i.e. Element text and tails. Strings that were constructed in the
+ path expression, e.g. by the ``string()`` function or extension
+ functions, will return None as their parent. (A short example
+ follows this list.)
+
+* Setting a ``QName`` object as the value of the ``.text`` property or
+ as an attribute value will resolve its prefix in the respective
+ context.
+
+* Following ElementTree 1.3, the ``iterfind()`` method supports
+ efficient iteration based on XPath-like expressions.
+
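+As a small illustration of the 'smart' string results described above:
+
+.. sourcecode:: pycon
+
+    >>> from lxml import etree
+    >>> root = etree.XML("<root><a>TEXT</a></root>")
+    >>> text = root.xpath("//a/text()")[0]
+    >>> print text
+    TEXT
+    >>> print text.getparent().tag
+    a
+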
+The parsers also received some major enhancements:
+
+* ``iterparse()`` can parse HTML when passing the boolean ``html``
+ keyword.
+
+* Parse-time XML Schema validation is available by passing an
+ ``XMLSchema`` object to the ``schema`` keyword argument of a parser.
+
+* Support for a ``target`` object that implements ElementTree's
+ `TreeBuilder interface`_.
+
+* The ``encoding`` keyword allows overriding the document encoding.
+
+
+.. _`E factory`: objectify.html#tree-generation-with-the-e-factory
+.. _`find the Element that carries it`: tutorial.html#using-xpath-to-find-text
+.. _`TreeBuilder interface`: http://effbot.org/elementtree/elementtree-treebuilder.htm
+
+
+Deprecation
+-----------
+
+The following functions and methods are now deprecated. They are
+still available in lxml 2.0 and will be removed in lxml 2.1:
+
+* The ``tounicode()`` function was replaced by the call
+ ``tostring(encoding='unicode')``.
+
+* CamelCase-named module functions and methods were renamed to their
+ underscore equivalents to follow the `PEP 8`_ naming conventions.
+
+ - ``etree.clearErrorLog()``, use ``etree.clear_error_log()``
+
+ - ``etree.useGlobalPythonLog()``, use
+ ``etree.use_global_python_log()``
+
+ - ``etree.ElementClassLookup.setFallback()``, use
+ ``etree.ElementClassLookup.set_fallback()``
+
+ - ``etree.getDefaultParser()``, use ``etree.get_default_parser()``
+
+ - ``etree.setDefaultParser()``, use ``etree.set_default_parser()``
+
+ - ``etree.setElementClassLookup()``, use
+ ``etree.set_element_class_lookup()``
+
+ - ``XMLParser.setElementClassLookup()``, use ``.set_element_class_lookup()``
+
+ - ``HTMLParser.setElementClassLookup()``, use ``.set_element_class_lookup()``
+
+ Note that ``parser.setElementClassLookup()`` has not been removed
+ yet, although ``parser.set_element_class_lookup()`` should be used
+ instead.
+
+ - ``xpath_evaluator.registerNamespace()``, use
+ ``xpath_evaluator.register_namespace()``
+
+ - ``xpath_evaluator.registerNamespaces()``, use
+ ``xpath_evaluator.register_namespaces()``
+
+ - ``objectify.setPytypeAttributeTag``, use
+ ``objectify.set_pytype_attribute_tag``
+
+ - ``objectify.setDefaultParser()``, use
+ ``objectify.set_default_parser()``
+
+* The ``.getiterator()`` method on Elements and ElementTrees was
+ renamed to ``.iter()`` to follow ElementTree 1.3.
+
+.. _`PEP 8`: http://www.python.org/dev/peps/pep-0008/
+
+
+New modules
+===========
+
+The most visible changes in lxml 2.0 regard the new modules that were added.
+
+
+lxml.usedoctest
+---------------
+
+A very useful module for doctests based on XML or HTML is
+``lxml.doctestcompare``. It provides a relaxed comparison mechanism
+for XML and HTML in doctests. Using it for XML comparisons is as
+simple as:
+
+.. sourcecode:: pycon
+
+ >>> import lxml.usedoctest
+
+and for HTML comparisons:
+
+.. sourcecode:: pycon
+
+ >>> import lxml.html.usedoctest
+
+
+lxml.html
+---------
+
+The largest new package that was added to lxml 2.0 is `lxml.html`_. It
+contains various tools and modules for HTML handling. The major features
+include support for cleaning up HTML (removing unwanted content), a readable
+HTML diff and various tools for working with links.
+
+.. _`lxml.html`: lxmlhtml.html
+
+
+lxml.cssselect
+--------------
+
+Cascading Style Sheets (CSS_) define a very short and generic path
+language for pointing at elements in XML/HTML trees (`CSS selectors`_). The module
+lxml.cssselect_ provides an implementation based on XPath.
+
+.. _lxml.cssselect: cssselect.html
+.. _CSS: http://www.w3.org/Style/CSS/
+.. _`CSS selectors`: http://www.w3.org/TR/CSS21/selector.html
diff --git a/doc/lxmlhtml.txt b/doc/lxmlhtml.txt
new file mode 100644
index 0000000..9827ed9
--- /dev/null
+++ b/doc/lxmlhtml.txt
@@ -0,0 +1,766 @@
+=========
+lxml.html
+=========
+
+:Author:
+ Ian Bicking
+
+Since version 2.0, lxml comes with a dedicated Python package for
+dealing with HTML: ``lxml.html``. It is based on lxml's HTML parser,
+but provides a special Element API for HTML elements, as well as a
+number of utilities for common HTML processing tasks.
+
+.. contents::
+..
+ 1 Parsing HTML
+ 1.1 Parsing HTML fragments
+ 1.2 Really broken pages
+ 2 HTML Element Methods
+ 3 Running HTML doctests
+ 4 Creating HTML with the E-factory
+ 4.1 Viewing your HTML
+ 5 Working with links
+ 5.1 Functions
+ 6 Forms
+ 6.1 Form Filling Example
+ 6.2 Form Submission
+ 7 Cleaning up HTML
+ 7.1 autolink
+ 7.2 wordwrap
+ 8 HTML Diff
+ 9 Examples
+ 9.1 Microformat Example
+
+The main API is based on the `lxml.etree`_ API, and thus, on the ElementTree_
+API.
+
+.. _`lxml.etree`: tutorial.html
+.. _ElementTree: http://effbot.org/zone/element-index.htm
+
+
+Parsing HTML
+============
+
+Parsing HTML fragments
+----------------------
+
+There are several functions available to parse HTML (a short usage
+example follows this list):
+
+``parse(filename_url_or_file)``:
+ Parses the named file or url, or if the object has a ``.read()``
+ method, parses from that.
+
+ If you give a URL, or if the object has a ``.geturl()`` method (as
+ file-like objects from ``urllib.urlopen()`` have), then that URL
+ is used as the base URL. You can also provide an explicit
+ ``base_url`` keyword argument.
+
+``document_fromstring(string)``:
+ Parses a document from the given string. This always creates a
+ correct HTML document, which means the parent node is ``<html>``,
+ and there is a body and possibly a head.
+
+``fragment_fromstring(string, create_parent=False)``:
+ Returns an HTML fragment from a string. The fragment must contain
+ just a single element, unless ``create_parent`` is given;
+ e.g., ``fragment_fromstring(string, create_parent='div')`` will
+ wrap the element in a ``<div>``.
+
+``fragments_fromstring(string)``:
+ Returns a list of the elements found in the fragment.
+
+``fromstring(string)``:
+ Returns ``document_fromstring`` or ``fragment_fromstring``, based
+ on whether the string looks like a full document, or just a
+ fragment.
+
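+For illustration, a short usage example of two of these functions:
+
+.. sourcecode:: pycon
+
+    >>> from lxml import html
+    >>> doc = html.document_fromstring('<p>Hello <b>world</b>!</p>')
+    >>> doc.tag
+    'html'
+    >>> p = html.fragment_fromstring('<p>Hello <b>world</b>!</p>')
+    >>> p.tag
+    'p'
+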
+Really broken pages
+-------------------
+
+The normal HTML parser is capable of handling broken HTML, but for
+pages that are far enough from HTML to call them 'tag soup', it may
+still fail to parse the page in a useful way. A way to deal with this
+is ElementSoup_, which deploys the well-known BeautifulSoup_ parser to
+build an lxml HTML tree.
+
+.. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/
+.. _ElementSoup: elementsoup.html
+
+However, note that the most common problem with web pages is the lack
+of (or the existence of incorrect) encoding declarations. It is
+therefore often sufficient to only use the encoding detection of
+BeautifulSoup, called UnicodeDammit, and to leave the rest to lxml's
+own HTML parser, which is several times faster.
+
+
+HTML Element Methods
+====================
+
+HTML elements have all the methods that come with ElementTree, but
+also include some extra methods:
+
+``.drop_tree()``:
+ Drops the element and all its children. Unlike
+ ``el.getparent().remove(el)`` this does *not* remove the tail
+ text; with ``drop_tree`` the tail text is merged with the previous
+ element.
+
+``.drop_tag()``:
+ Drops the tag, but keeps its children and text.
+
+``.find_class(class_name)``:
+ Returns a list of all the elements with the given CSS class name.
+ Note that class names are space separated in HTML, so
+ ``doc.find_class('highlight')`` will find an element like
+ ``<div class="sidebar highlight">``. Class names *are* case
+ sensitive.
+
+``.find_rel_links(rel)``:
+ Returns a list of all the ``<a rel="{rel}">`` elements. E.g.,
+ ``doc.find_rel_links('tag')`` returns all the links `marked as
+ tags <http://microformats.org/wiki/rel-tag>`_.
+
+``.get_element_by_id(id, default=None)``:
+ Return the element with the given ``id``, or the ``default`` if
+ none is found. If there are multiple elements with the same id
+ (which there shouldn't be, but there often is), this returns only
+ the first.
+
+``.text_content()``:
+ Returns the text content of the element, including the text
+ content of its children, with no markup.
+
+``.cssselect(expr)``:
+ Select elements from this element and its children, using a CSS
+ selector expression. (Note that ``.xpath(expr)`` is also
+ available, as it is on all lxml elements.)
+
+``.label``:
+ Returns the corresponding ``<label>`` element for this element, if
+ any exists (None if there is none). Label elements have a
+ ``label.for_element`` attribute that points back to the element.
+
+``.base_url``:
+ The base URL for this element, if one was saved from the parsing.
+ This attribute is not settable. It is None when no base URL was
+ saved.
+
+``.classes``:
+ Returns a set-like object that allows accessing and modifying the
+ names in the 'class' attribute of the element. (New in lxml 3.5).
+
+``.set(key, value=None)``:
+ Sets an HTML attribute. If no value is given, or if the value is
+ ``None``, it creates a boolean attribute like ``<form novalidate></form>``
+ or ``<div custom-attribute></div>``. In XML, attributes must
+ have at least the empty string as their value like ``<form
+ novalidate=""></form>``, but HTML boolean attributes can also be
+ just present or absent from an element without having a value. (A
+ short example follows this list.)
+
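+As a small illustration of the boolean attribute handling described
+for ``.set()`` above (a sketch; the exact serialisation may differ
+slightly):
+
+.. sourcecode:: pycon
+
+    >>> from lxml.html import fromstring, tostring
+    >>> form = fromstring('<form></form>')
+    >>> form.set('novalidate', None)
+    >>> print tostring(form)
+    <form novalidate></form>
+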
+Running HTML doctests
+=====================
+
+One of the interesting modules in the ``lxml.html`` package deals with
+doctests. It can be hard to compare two HTML pages for equality, as
+whitespace differences aren't meaningful and the structural formatting
+can differ. This is even more a problem in doctests, where output is
+tested for equality and small differences in whitespace or the order
+of attributes can let a test fail. And given the verbosity of
+tag-based languages, it may take more than a quick look to find the
+actual differences in the doctest output.
+
+Luckily, lxml provides the ``lxml.doctestcompare`` module that
+supports relaxed comparison of XML and HTML pages and provides a
+readable diff in the output when a test fails. The HTML comparison is
+most easily used by importing the ``usedoctest`` module in a doctest:
+
+.. sourcecode:: pycon
+
+ >>> import lxml.html.usedoctest
+
+Now, if you have an HTML document and want to compare it to an expected result
+document in a doctest, you can do the following:
+
+.. sourcecode:: pycon
+
+ >>> import lxml.html
+ >>> html = lxml.html.fromstring('''\
+ ... <html><body onload="" color="white">
+ ... <p>Hi !</p>
+ ... </body></html>
+ ... ''')
+
+ >>> print lxml.html.tostring(html)
+ <html><body onload="" color="white"><p>Hi !</p></body></html>
+
+ >>> print lxml.html.tostring(html)
+ <html> <body color="white" onload=""> <p>Hi !</p> </body> </html>
+
+ >>> print lxml.html.tostring(html)
+ <html>
+ <body color="white" onload="">
+ <p>Hi !</p>
+ </body>
+ </html>
+
+In documentation, you would likely prefer the pretty printed HTML output, as
+it is the most readable. However, the three documents are equivalent from the
+point of view of an HTML tool, so the doctest will silently accept any of the
+above. This allows you to concentrate on readability in your doctests, even
+if the real output is a straight ugly HTML one-liner.
+
+Note that there is also an ``lxml.usedoctest`` module which you can
+import for XML comparisons. The HTML parser notably ignores
+namespaces and some other XMLisms.
+
+
+Creating HTML with the E-factory
+================================
+
+.. _`E-factory`: http://online.effbot.org/2006_11_01_archive.htm#et-builder
+
+lxml.html comes with a predefined HTML vocabulary for the `E-factory`_,
+originally written by Fredrik Lundh. This allows you to quickly generate HTML
+pages and fragments:
+
+.. sourcecode:: pycon
+
+ >>> from lxml.html import builder as E
+ >>> from lxml.html import usedoctest
+ >>> html = E.HTML(
+ ... E.HEAD(
+ ... E.LINK(rel="stylesheet", href="great.css", type="text/css"),
+ ... E.TITLE("Best Page Ever")
+ ... ),
+ ... E.BODY(
+ ... E.H1(E.CLASS("heading"), "Top News"),
+ ... E.P("World News only on this page", style="font-size: 200%"),
+ ... "Ah, and here's some more text, by the way.",
+ ... lxml.html.fromstring("<p>... and this is a parsed fragment ...</p>")
+ ... )
+ ... )
+
+ >>> print lxml.html.tostring(html)
+ <html>
+ <head>
+ <link href="great.css" rel="stylesheet" type="text/css">
+ <title>Best Page Ever</title>
+ </head>
+ <body>
+ <h1 class="heading">Top News</h1>
+ <p style="font-size: 200%">World News only on this page</p>
+ Ah, and here's some more text, by the way.
+ <p>... and this is a parsed fragment ...</p>
+ </body>
+ </html>
+
+Note that you should use ``lxml.html.tostring`` and **not**
+``lxml.etree.tostring``. ``lxml.etree.tostring(doc)`` will return the
+XML representation of the document, which is not valid HTML. In
+particular, things like ``<script src="..."></script>`` will be
+serialized as ``<script src="..." />``, which completely confuses
+browsers.
+
+Viewing your HTML
+-----------------
+
+A handy method for viewing your HTML:
+``lxml.html.open_in_browser(lxml_doc)`` will write the document to
+disk and open it in a browser (with the `webbrowser module
+<http://python.org/doc/current/lib/module-webbrowser.html>`_).
+
+Working with links
+==================
+
+There are several methods on elements that allow you to see and modify
+the links in a document.
+
+``.iterlinks()``:
+ This yields ``(element, attribute, link, pos)`` for every link in
+ the document. ``attribute`` may be None if the link is in the
+ text (as will be the case with a ``<style>`` tag with
+ ``@import``).
+
+ This finds any link in an ``action``, ``archive``, ``background``,
+ ``cite``, ``classid``, ``codebase``, ``data``, ``href``,
+ ``longdesc``, ``profile``, ``src``, ``usemap``, ``dynsrc``, or
+ ``lowsrc`` attribute. It also searches ``style`` attributes for
+ ``url(link)``, and ``<style>`` tags for ``@import`` and ``url()``.
+
+ This function does *not* pay attention to ``<base href>``.
+
+``.resolve_base_href()``:
+ This function will modify the document in-place to take account of
+ ``<base href>`` if the document contains that tag. In the process
+ it will also remove that tag from the document.
+
+``.make_links_absolute(base_href, resolve_base_href=True)``:
+ This makes all links in the document absolute, assuming that
+ ``base_href`` is the URL of the document. So if you pass
+ ``base_href="http://localhost/foo/bar.html"`` and there is a link
+ to ``baz.html`` that will be rewritten as
+ ``http://localhost/foo/baz.html``.
+
+ If ``resolve_base_href`` is true, then any ``<base href>`` tag
+ will be taken into account (just calling
+ ``self.resolve_base_href()``).
+
+``.rewrite_links(link_repl_func, resolve_base_href=True, base_href=None)``:
+ This rewrites all the links in the document using your given link
+ replacement function. If you give a ``base_href`` value, all
+ links will be passed in after they are joined with this URL.
+
+ For each link ``link_repl_func(link)`` is called. That function
+ then returns the new link, or None to remove the attribute or tag
+ that contains the link. Note that all links will be passed in,
+ including links like ``"#anchor"`` (which is purely internal), and
+ things like ``"mailto:bob@example.com"`` (or ``javascript:...``).
+
+ If you want access to the context of the link, you should use
+ ``.iterlinks()`` instead.
+
+Functions
+---------
+
+In addition to these methods, there are corresponding functions:
+
+* ``iterlinks(html)``
+* ``make_links_absolute(html, base_href, ...)``
+* ``rewrite_links(html, link_repl_func, ...)``
+* ``resolve_base_href(html)``
+
+These functions will parse ``html`` if it is a string, then return the new
+HTML as a string. If you pass in a document, the document will be copied
+(except for ``iterlinks()``), the method performed, and the new document
+returned.
+
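+For illustration, the string-based variant can be used like this (a
+sketch; the exact serialisation of the result may differ slightly):
+
+.. sourcecode:: pycon
+
+    >>> from lxml.html import make_links_absolute
+    >>> html = '<html><body><a href="baz.html">link</a></body></html>'
+    >>> print make_links_absolute(html, "http://localhost/foo/bar.html")
+    <html><body><a href="http://localhost/foo/baz.html">link</a></body></html>
+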
+Forms
+=====
+
+Any ``<form>`` elements in a document are available through
+the list ``doc.forms`` (e.g., ``doc.forms[0]``). Form, input, select,
+and textarea elements each have special methods.
+
+Input elements (including ``<select>`` and ``<textarea>``) have these
+attributes:
+
+``.name``:
+ The name of the element.
+
+``.value``:
+ The value of an input, the content of a textarea, the selected
+ option(s) of a select. This attribute can be set.
+
+ In the case of a select that takes multiple options (``<select
+ multiple>``) this will be a set of the selected options; you can
+ add or remove items from it to select and deselect options.
+
+Select attributes:
+
+``.value_options``:
+ For select elements, this is all the *possible* values (the values
+ of all the options).
+
+``.multiple``:
+ For select elements, true if this is a ``<select multiple>``
+ element.
+
+Input attributes:
+
+``.type``:
+ The type attribute in ``<input>`` elements.
+
+``.checkable``:
+ True if this can be checked (i.e., true for type=radio and
+ type=checkbox).
+
+``.checked``:
+ If this element is checkable, the checked state. Raises
+ AttributeError on non-checkable inputs.
+
+The form itself has these attributes:
+
+``.inputs``:
+ A dictionary-like object that can be used to access input elements
+ by name. When there are multiple input elements with the same
+ name, this returns list-like structures that can also be used to
+ access the options and their values as a group.
+
+``.fields``:
+ A dictionary-like object used to access *values* by their name.
+ ``form.inputs`` returns elements; this only returns values.
+ Setting values in this dictionary will affect the form inputs.
+ Basically ``form.fields[x]`` is equivalent to
+ ``form.inputs[x].value`` and ``form.fields[x] = y`` is equivalent
+ to ``form.inputs[x].value = y``. (Note that sometimes
+ ``form.inputs[x]`` returns a compound object, but these objects
+ also have ``.value`` attributes.)
+
+ If you set this attribute, it is equivalent to
+ ``form.fields.clear(); form.fields.update(new_value)``
+
+``.form_values()``:
+ Returns a list of ``[(name, value), ...]``, suitable to be passed
+ to ``urllib.urlencode()`` for form submission.
+
+``.action``:
+ The ``action`` attribute. This is resolved to an absolute URL if
+ possible.
+
+``.method``:
+ The ``method`` attribute, which defaults to ``GET``.
+
+Form Filling Example
+--------------------
+
+Note that you can change any of these attributes (values, method,
+action, etc) and then serialize the form to see the updated values.
+You can, for instance, do:
+
+.. sourcecode:: pycon
+
+ >>> from lxml.html import fromstring, tostring
+ >>> form_page = fromstring('''<html><body><form>
+ ... Your name: <input type="text" name="name"> <br>
+ ... Your phone: <input type="text" name="phone"> <br>
+ ... Your favorite pets: <br>
+ ... Dogs: <input type="checkbox" name="interest" value="dogs"> <br>
+ ... Cats: <input type="checkbox" name="interest" value="cats"> <br>
+ ... Llamas: <input type="checkbox" name="interest" value="llamas"> <br>
+ ... <input type="submit"></form></body></html>''')
+ >>> form = form_page.forms[0]
+ >>> form.fields = dict(
+ ... name='John Smith',
+ ... phone='555-555-3949',
+ ... interest=set(['cats', 'llamas']))
+ >>> print tostring(form)
+ <html>
+ <body>
+ <form>
+ Your name:
+ <input name="name" type="text" value="John Smith">
+ <br>Your phone:
+ <input name="phone" type="text" value="555-555-3949">
+ <br>Your favorite pets:
+ <br>Dogs:
+ <input name="interest" type="checkbox" value="dogs">
+ <br>Cats:
+ <input checked name="interest" type="checkbox" value="cats">
+ <br>Llamas:
+ <input checked name="interest" type="checkbox" value="llamas">
+ <br>
+ <input type="submit">
+ </form>
+ </body>
+ </html>
+
+
+Form Submission
+---------------
+
+You can submit a form with ``lxml.html.submit_form(form_element)``.
+This will return a file-like object (the result of
+``urllib.urlopen()``).
+
+If you have extra input values you want to pass you can use the
+keyword argument ``extra_values``, like ``extra_values={'submit':
+'Yes!'}``. This is the only way to get submit values into the form,
+as there is no state of "submitted" for these elements.
+
+You can pass in an alternate opener with the ``open_http`` keyword
+argument, which is a function with the signature ``open_http(method,
+url, values)``.
+
+Example:
+
+.. sourcecode:: pycon
+
+ >>> from lxml.html import parse, submit_form
+ >>> page = parse('http://tinyurl.com').getroot()
+ >>> page.forms[0].fields['url'] = 'http://lxml.de/'
+ >>> result = parse(submit_form(page.forms[0])).getroot()
+ >>> [a.attrib['href'] for a in result.xpath("//a[@target='_blank']")]
+ ['http://tinyurl.com/2xae8s', 'http://preview.tinyurl.com/2xae8s']
+
+Cleaning up HTML
+================
+
+The module ``lxml.html.clean`` provides a ``Cleaner`` class for cleaning up
+HTML pages. It supports removing embedded or script content, special tags,
+CSS style annotations and much more.
+
+Say, you have an evil web page from an untrusted source that contains lots of
+content that upsets browsers and tries to run evil code on the client side:
+
+.. sourcecode:: pycon
+
+ >>> html = '''\
+ ... <html>
+ ... <head>
+ ... <script type="text/javascript" src="evil-site"></script>
+ ... <link rel="alternate" type="text/rss" src="evil-rss">
+ ... <style>
+ ... body {background-image: url(javascript:do_evil)};
+ ... div {color: expression(evil)};
+ ... </style>
+ ... </head>
+ ... <body onload="evil_function()">
+ ... <!-- I am interpreted for EVIL! -->
+ ... <a href="javascript:evil_function()">a link</a>
+ ... <a href="#" onclick="evil_function()">another link</a>
+ ... <p onclick="evil_function()">a paragraph</p>
+ ... <div style="display: none">secret EVIL!</div>
+ ... <object> of EVIL! </object>
+ ... <iframe src="evil-site"></iframe>
+ ... <form action="evil-site">
+ ... Password: <input type="password" name="password">
+ ... </form>
+ ... <blink>annoying EVIL!</blink>
+ ... <a href="evil-site">spam spam SPAM!</a>
+ ... <image src="evil!">
+ ... </body>
+ ... </html>'''
+
+To remove all suspicious content from this unparsed document, use the
+``clean_html`` function:
+
+.. sourcecode:: pycon
+
+ >>> from lxml.html.clean import clean_html
+ >>> print clean_html(html)
+ <div><style>/* deleted */</style><body>
+
+ <a href="">a link</a>
+ <a href="#">another link</a>
+ <p>a paragraph</p>
+ <div>secret EVIL!</div>
+ of EVIL!
+
+
+ Password:
+ annoying EVIL!<a href="evil-site">spam spam SPAM!</a>
+ <img src="evil!"></body></div>
+
+The ``Cleaner`` class supports several keyword arguments to control exactly
+which content is removed:
+
+.. sourcecode:: pycon
+
+ >>> from lxml.html.clean import Cleaner
+
+ >>> cleaner = Cleaner(page_structure=False, links=False)
+ >>> print cleaner.clean_html(html)
+ <html>
+ <head>
+ <link rel="alternate" src="evil-rss" type="text/rss">
+ <style>/* deleted */</style>
+ </head>
+ <body>
+ <a href="">a link</a>
+ <a href="#">another link</a>
+ <p>a paragraph</p>
+ <div>secret EVIL!</div>
+ of EVIL!
+ Password:
+ annoying EVIL!
+ <a href="evil-site">spam spam SPAM!</a>
+ <img src="evil!">
+ </body>
+ </html>
+
+ >>> cleaner = Cleaner(style=True, links=True, add_nofollow=True,
+ ... page_structure=False, safe_attrs_only=False)
+
+ >>> print cleaner.clean_html(html)
+ <html>
+ <head>
+ </head>
+ <body>
+ <a href="">a link</a>
+ <a href="#">another link</a>
+ <p>a paragraph</p>
+ <div>secret EVIL!</div>
+ of EVIL!
+ Password:
+ annoying EVIL!
+ <a href="evil-site" rel="nofollow">spam spam SPAM!</a>
+ <img src="evil!">
+ </body>
+ </html>
+
+You can also whitelist some otherwise dangerous content with
+``Cleaner(host_whitelist=['www.youtube.com'])``, which would allow
+embedded media from YouTube, while still filtering out embedded media
+from other sites.
+
+See the docstring of ``Cleaner`` for the details of what can be
+cleaned.
+
+
+autolink
+--------
+
+In addition to cleaning up malicious HTML, ``lxml.html.clean``
+contains functions to do other things to your HTML. This includes
+autolinking::
+
+ autolink(doc, ...)
+
+ autolink_html(html, ...)
+
+This finds anything that looks like a link (e.g.,
+``http://example.com``) in the *text* of an HTML document, and
+turns it into an anchor. It avoids making bad links.
+
+Links are not created inside the elements ``<textarea>``, ``<pre>``,
+``<code>``, or anything in the head of the document. You can pass in
+a list of elements to avoid in ``avoid_elements=['textarea', ...]``.
+
+Links to some hosts can be avoided. By default links to
+``localhost*``, ``example.*`` and ``127.0.0.1`` are not
+autolinked. Pass in ``avoid_hosts=[list_of_regexes]`` to control
+this.
+
+Elements with the ``nolink`` CSS class are not autolinked. Pass
+in ``avoid_classes=['code', ...]`` to control this.
+
+The ``autolink_html()`` version of the function parses the HTML
+string first, and returns a string.
+
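+For illustration, the string-based function can be used roughly like
+this (a sketch, not verbatim output):
+
+.. sourcecode:: python
+
+    from lxml.html.clean import autolink_html
+
+    html = '<p>Read the news at http://lxml.de/ today</p>'
+    # the plain URL in the text becomes an <a href="http://lxml.de/"> link
+    linked = autolink_html(html)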
+
+wordwrap
+--------
+
+You can also wrap long words in your HTML::
+
+ word_break(doc, max_width=40, ...)
+
+ word_break_html(html, ...)
+
+This finds any long words in the text of the document and inserts
+``&#8203;`` (the Unicode zero-width space) into them.
+
+This avoids the elements ``<pre>``, ``<textarea>``, and ``<code>``.
+You can control this with ``avoid_elements=['textarea', ...]``.
+
+It also avoids elements with the CSS class ``nobreak``. You can
+control this with ``avoid_classes=['code', ...]``.
+
+Lastly you can control the character that is inserted with
+``break_character=u'\u200b'``. However, you cannot insert markup,
+only text.
+
+``word_break_html(html)`` parses the HTML document and returns a
+string.
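+
+For illustration, the string-based function can be used roughly like
+this (a sketch):
+
+.. sourcecode:: python
+
+    from lxml.html.clean import word_break_html
+
+    html = '<p>Supercalifragilisticexpialidocious</p>'
+    # words longer than max_width characters get zero-width spaces inserted
+    wrapped = word_break_html(html, max_width=10)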
+
+HTML Diff
+=========
+
+The module ``lxml.html.diff`` offers some ways to visualize
+differences in HTML documents. These differences are *content*
+oriented. That is, changes in markup are largely ignored; only
+changes in the content itself are highlighted.
+
+There are two ways to view differences: ``htmldiff`` and
+``html_annotate``. One shows differences with ``<ins>`` and
+``<del>``, while the other annotates a set of changes similar to ``svn
+blame``. Both these functions operate on text, and work best with
+content fragments (only what goes in ``<body>``), not complete
+documents.
+
+Example of ``htmldiff``:
+
+.. sourcecode:: pycon
+
+ >>> from lxml.html.diff import htmldiff, html_annotate
+ >>> doc1 = '''<p>Here is some text.</p>'''
+ >>> doc2 = '''<p>Here is <b>a lot</b> of <i>text</i>.</p>'''
+ >>> doc3 = '''<p>Here is <b>a little</b> <i>text</i>.</p>'''
+ >>> print htmldiff(doc1, doc2)
+ <p>Here is <ins><b>a lot</b> of <i>text</i>.</ins> <del>some text.</del> </p>
+ >>> print html_annotate([(doc1, 'author1'), (doc2, 'author2'),
+ ... (doc3, 'author3')])
+ <p><span title="author1">Here is</span>
+ <b><span title="author2">a</span>
+ <span title="author3">little</span></b>
+ <i><span title="author2">text</span></i>
+ <span title="author2">.</span></p>
+
+As you can see, it is imperfect as such things tend to be. On larger
+tracts of text with larger edits it will generally do better.
+
+The ``html_annotate`` function can also take an optional second
+argument, ``markup``. This is a function like ``markup(text,
+version)`` that returns the given text marked up with the given
+version. The default version, the output of which you see in the
+example, looks like:
+
+.. sourcecode:: python
+
+ def default_markup(text, version):
+ return '<span title="%s">%s</span>' % (
+ cgi.escape(unicode(version), 1), text)
+
+Examples
+========
+
+Microformat Example
+-------------------
+
+This example parses the `hCard <http://microformats.org/wiki/hcard>`_
+microformat.
+
+First we get the page:
+
+.. sourcecode:: pycon
+
+ >>> import urllib
+ >>> from lxml.html import fromstring
+ >>> url = 'http://microformats.org/'
+ >>> content = urllib.urlopen(url).read()
+ >>> doc = fromstring(content)
+ >>> doc.make_links_absolute(url)
+
+Then we create some objects to put the information in:
+
+.. sourcecode:: pycon
+
+ >>> class Card(object):
+ ... def __init__(self, **kw):
+ ... for name, value in kw.items():
+ ... setattr(self, name, value)
+ >>> class Phone(object):
+ ... def __init__(self, phone, types=()):
+ ... self.phone, self.types = phone, types
+
+And some generally handy functions for microformats:
+
+.. sourcecode:: pycon
+
+ >>> def get_text(el, class_name):
+ ... els = el.find_class(class_name)
+ ... if els:
+ ... return els[0].text_content()
+ ... else:
+ ... return ''
+ >>> def get_value(el):
+ ... return get_text(el, 'value') or el.text_content()
+ >>> def get_all_texts(el, class_name):
+ ... return [e.text_content() for e in el.find_class(class_name)]
+ >>> def parse_addresses(el):
+ ... # Ideally this would parse street, etc.
+ ... return el.find_class('adr')
+
+Then the parsing:
+
+.. sourcecode:: pycon
+
+ >>> for el in doc.find_class('hcard'):
+ ... card = Card()
+ ... card.el = el
+ ... card.fn = get_text(el, 'fn')
+ ... card.tels = []
+ ... for tel_el in el.find_class('tel'):
+ ... card.tels.append(Phone(get_value(tel_el),
+ ... get_all_texts(tel_el, 'type')))
+ ... card.addresses = parse_addresses(el)
diff --git a/doc/main.txt b/doc/main.txt
new file mode 100644
index 0000000..ead457d
--- /dev/null
+++ b/doc/main.txt
@@ -0,0 +1,307 @@
+lxml
+====
+
+.. meta::
+ :description: lxml - the most feature-rich and easy-to-use library for processing XML and HTML in the Python language
+ :keywords: Python XML, XML processing, HTML, lxml, simple XML, ElementTree, etree, lxml.etree, objectify, XML parsing, XML validation, XPath, XSLT
+
+.. class:: pagequote
+
+| `» lxml takes all the pain out of XML. « <https://mailman-mail5.webfaction.com/pipermail/lxml/20080131/019119.html>`_
+| Stephan Richter
+
+.. class:: eyecatcher
+
+ lxml is the most feature-rich
+ and easy-to-use library
+ for processing XML and HTML
+ in the Python language.
+
+..
+ 1 Introduction
+ 2 Documentation
+ 3 Download
+ 4 Mailing list
+ 5 Bug tracker
+ 6 License
+ 7 Old Versions
+
+
+Introduction
+------------
+
+The lxml XML toolkit is a Pythonic binding for the C libraries
+libxml2_ and libxslt_. It is unique in that it combines the speed and
+XML feature completeness of these libraries with the simplicity of a
+native Python API, mostly compatible but superior to the well-known
+ElementTree_ API. The latest release works with all CPython versions
+from 2.7 to 3.9. See the introduction_ for more information about
+background and goals of the lxml project. Some common questions are
+answered in the FAQ_.
+
+.. _libxml2: http://xmlsoft.org/
+.. _libxslt: http://xmlsoft.org/XSLT/
+
+.. _introduction: intro.html
+.. _FAQ: FAQ.html
+
+
+Documentation
+-------------
+
+The complete lxml documentation is available for download as `PDF
+documentation`_. The HTML documentation from this web site is part of
+the normal `source download <#download>`_.
+
+* Tutorials:
+
+ * the `lxml.etree tutorial for XML processing`_
+
+ * John Shipman's tutorial on `Python XML processing with lxml`_
+
+ * Fredrik Lundh's `tutorial for ElementTree`_
+
+* ElementTree:
+
+ * `ElementTree API`_
+
+ * compatibility_ and differences of lxml.etree
+
+ * `ElementTree performance`_ characteristics and comparison
+
+* lxml.etree:
+
+ * `lxml.etree specific API`_ documentation
+
+ * the `generated API documentation`_ as a reference
+
+ * parsing_ and validating_ XML
+
+ * `XPath and XSLT`_ support
+
+ * Python `XPath extension functions`_ for XPath and XSLT
+
+ * `custom XML element classes`_ for custom XML APIs (see `EuroPython 2008 talk`_)
+
+ * a `SAX compliant API`_ for interfacing with other XML tools
+
+ * a `C-level API`_ for interfacing with external C/Cython modules
+
+* lxml.objectify:
+
+ * `lxml.objectify`_ API documentation
+
+ * a brief comparison of `objectify and etree`_
+
+lxml.etree follows the ElementTree_ API as much as possible, building
+it on top of the native libxml2 tree. If you are new to ElementTree,
+start with the `lxml.etree tutorial for XML processing`_. See also the
+ElementTree compatibility_ overview and the `ElementTree performance`_
+page comparing lxml to the original ElementTree_ and cElementTree_
+implementations.
+
+Right after the `lxml.etree tutorial for XML processing`_ and the
+ElementTree_ documentation, the next place to look is the `lxml.etree
+specific API`_ documentation. It describes how lxml extends the
+ElementTree API to expose libxml2 and libxslt specific XML
+functionality, such as XPath_, `Relax NG`_, `XML Schema`_, XSLT_, and
+`c14n`_ (including `c14n 2.0`_).
+Python code can be called from XPath expressions and XSLT
+stylesheets through the use of `XPath extension functions`_. lxml
+also offers a `SAX compliant API`_, that works with the SAX support in
+the standard library.
+
+There is a separate module `lxml.objectify`_ that implements a data-binding
+API on top of lxml.etree. See the `objectify and etree`_ FAQ entry for a
+comparison.
+
+In addition to the ElementTree API, lxml also features a sophisticated
+API for `custom XML element classes`_. This is a simple way to write
+arbitrary XML driven APIs on top of lxml. lxml.etree also has a
+`C-level API`_ that can be used to efficiently extend lxml.etree in
+external C modules, including fast custom element class support.
+
+.. _ElementTree: http://effbot.org/zone/element-index.htm
+.. _`ElementTree API`: http://effbot.org/zone/element-index.htm#documentation
+.. _cElementTree: http://effbot.org/zone/celementtree.htm
+
+.. _`tutorial for ElementTree`: http://effbot.org/zone/element.htm
+.. _`lxml.etree tutorial for XML processing`: tutorial.html
+.. _`Python XML processing with lxml`: http://www.nmt.edu/tcc/help/pubs/pylxml/
+.. _`generated API documentation`: api/index.html
+.. _`ElementTree performance`: performance.html
+.. _`compatibility`: compatibility.html
+.. _`lxml.etree specific API`: api.html
+.. _`parsing`: parsing.html
+.. _`validating`: validation.html
+.. _`XPath and XSLT`: xpathxslt.html
+.. _`XPath extension functions`: extensions.html
+.. _`custom XML element classes`: element_classes.html
+.. _`SAX compliant API`: sax.html
+.. _`C-level API`: capi.html
+.. _`lxml.objectify`: objectify.html
+.. _`objectify and etree`: FAQ.html#what-is-the-difference-between-lxml-etree-and-lxml-objectify
+.. _`EuroPython 2008 talk`: s5/lxml-ep2008.html
+
+.. _XPath: https://www.w3.org/TR/xpath/
+.. _`Relax NG`: https://relaxng.org/
+.. _`XML Schema`: https://www.w3.org/XML/Schema
+.. _`XSLT`: https://www.w3.org/TR/xslt
+.. _`c14n`: https://www.w3.org/TR/xml-c14n
+.. _`c14n 2.0`: https://www.w3.org/TR/xml-c14n2
+
+
+Download
+--------
+
+The best way to download lxml is to visit `lxml at the Python Package
+Index <http://pypi.python.org/pypi/lxml/>`_ (PyPI). It has the source
+that compiles on various platforms. The source distribution is signed
+with `this key <pubkey.asc>`_.
+
+The latest version is `lxml 4.6.3`_, released 2021-03-21
+(`changes for 4.6.3`_). `Older versions <#old-versions>`_
+are listed below.
+
+Please take a look at the
+`installation instructions <installation.html>`_!
+
+This complete web site (including the generated API documentation) is
+part of the source distribution, so if you want to download the
+documentation for offline use, take the source archive and copy the
+``doc/html`` directory out of the source tree, or use the
+`PDF documentation`_.
+
+The latest `installable developer sources <https://github.com/lxml/lxml/archive/master.zip>`_
+are available from Github. It's also possible to check out
+the latest development version of lxml from Github directly, using a command
+like this (assuming you use hg and have hg-git installed)::
+
+ hg clone git+ssh://git@github.com/lxml/lxml.git lxml
+
+Alternatively, if you use git, this should work as well::
+
+ git clone https://github.com/lxml/lxml.git lxml
+
+You can browse the `source repository`_ and its history through
+the web. Please read `how to build lxml from source <build.html>`_
+first. The `latest CHANGES`_ of the developer version are also
+accessible. You can check there if a bug you found has been fixed
+or a feature you want has been implemented in the latest trunk version.
+
+.. _`source repository`: https://github.com/lxml/lxml/
+.. _`latest CHANGES`: https://github.com/lxml/lxml/blob/master/CHANGES.txt
+
+
+Mailing list
+------------
+
+Questions? Suggestions? Code to contribute? We have a `mailing list`_.
+
+You can search the archive with Gmane_ or Google_.
+
+.. _`mailing list`: http://lxml.de/mailinglist/
+.. _Gmane: http://blog.gmane.org/gmane.comp.python.lxml.devel
+.. _Google: http://www.google.com/webhp?q=site:comments.gmane.org%2Fgmane.comp.python.lxml.devel+
+
+
+Bug tracker
+-----------
+
+lxml uses the `launchpad bug tracker`_. If you are sure you found a
+bug in lxml, please file a bug report there. If you are not sure
+whether some unexpected behaviour of lxml is a bug or not, please
+check the documentation and ask on the `mailing list`_ first. Do not
+forget to search the archive (e.g. with Gmane_)!
+
+.. _`launchpad bug tracker`: https://launchpad.net/lxml/
+
+
+License
+-------
+
+The lxml library is shipped under a `BSD license`_. libxml2 and libxslt
+themselves are shipped under the `MIT license`_. There should therefore be no
+obstacle to using lxml in your codebase.
+
+.. _`BSD license`: https://github.com/lxml/lxml/blob/master/doc/licenses/BSD.txt
+.. _`MIT license`: http://www.opensource.org/licenses/mit-license.html
+
+
+Old Versions
+------------
+
+See the websites of lxml
+`4.5 <http://lxml.de/4.5/>`_,
+`4.4 <http://lxml.de/4.4/>`_,
+`4.3 <http://lxml.de/4.3/>`_,
+`4.2 <http://lxml.de/4.2/>`_,
+`4.1 <http://lxml.de/4.1/>`_,
+`4.0 <http://lxml.de/4.0/>`_,
+`3.8 <http://lxml.de/3.8/>`_,
+`3.7 <http://lxml.de/3.7/>`_,
+`3.6 <http://lxml.de/3.6/>`_,
+`3.5 <http://lxml.de/3.5/>`_,
+`3.4 <http://lxml.de/3.4/>`_,
+`3.3 <http://lxml.de/3.3/>`_,
+`3.2 <http://lxml.de/3.2/>`_,
+`3.1 <http://lxml.de/3.1/>`_,
+`3.0 <http://lxml.de/3.0/>`_,
+`2.3 <http://lxml.de/2.3/>`_,
+`2.2 <http://lxml.de/2.2/>`_,
+`2.1 <http://lxml.de/2.1/>`_,
+`2.0 <http://lxml.de/2.0/>`_,
+`1.3 <http://lxml.de/1.3/>`_
+
+..
+ and the `latest in-development version <http://lxml.de/dev/>`_.
+
+.. _`PDF documentation`: lxmldoc-4.6.3.pdf
+
+* `lxml 4.6.3`_, released 2021-03-21 (`changes for 4.6.3`_)
+
+* `lxml 4.6.2`_, released 2020-11-26 (`changes for 4.6.2`_)
+
+* `lxml 4.6.1`_, released 2020-10-18 (`changes for 4.6.1`_)
+
+* `lxml 4.6.0`_, released 2020-10-17 (`changes for 4.6.0`_)
+
+* `lxml 4.5.2`_, released 2020-07-09 (`changes for 4.5.2`_)
+
+* `lxml 4.5.1`_, released 2020-05-19 (`changes for 4.5.1`_)
+
+* `lxml 4.5.0`_, released 2020-01-29 (`changes for 4.5.0`_)
+
+* `lxml 4.4.3`_, released 2020-01-28 (`changes for 4.4.3`_)
+
+* `lxml 4.4.2`_, released 2019-11-25 (`changes for 4.4.2`_)
+
+* `lxml 4.4.1`_, released 2019-08-11 (`changes for 4.4.1`_)
+
+* `lxml 4.4.0`_, released 2019-07-27 (`changes for 4.4.0`_)
+
+* `older releases <http://lxml.de/4.3/#old-versions>`_
+
+.. _`lxml 4.6.3`: /files/lxml-4.6.3.tgz
+.. _`lxml 4.6.2`: /files/lxml-4.6.2.tgz
+.. _`lxml 4.6.1`: /files/lxml-4.6.1.tgz
+.. _`lxml 4.6.0`: /files/lxml-4.6.0.tgz
+.. _`lxml 4.5.2`: /files/lxml-4.5.2.tgz
+.. _`lxml 4.5.1`: /files/lxml-4.5.1.tgz
+.. _`lxml 4.5.0`: /files/lxml-4.5.0.tgz
+.. _`lxml 4.4.3`: /files/lxml-4.4.3.tgz
+.. _`lxml 4.4.2`: /files/lxml-4.4.2.tgz
+.. _`lxml 4.4.1`: /files/lxml-4.4.1.tgz
+.. _`lxml 4.4.0`: /files/lxml-4.4.0.tgz
+
+.. _`changes for 4.6.3`: /changes-4.6.3.html
+.. _`changes for 4.6.2`: /changes-4.6.2.html
+.. _`changes for 4.6.1`: /changes-4.6.1.html
+.. _`changes for 4.6.0`: /changes-4.6.0.html
+.. _`changes for 4.5.2`: /changes-4.5.2.html
+.. _`changes for 4.5.1`: /changes-4.5.1.html
+.. _`changes for 4.5.0`: /changes-4.5.0.html
+.. _`changes for 4.4.3`: /changes-4.4.3.html
+.. _`changes for 4.4.2`: /changes-4.4.2.html
+.. _`changes for 4.4.1`: /changes-4.4.1.html
+.. _`changes for 4.4.0`: /changes-4.4.0.html
diff --git a/doc/memorymanagement.txt b/doc/memorymanagement.txt
new file mode 100644
index 0000000..f6f1dec
--- /dev/null
+++ b/doc/memorymanagement.txt
@@ -0,0 +1,83 @@
+Memory management
+=================
+
+There are two types of nodes:
+
+* nodes connected to an existing tree
+
+* unconnected nodes; these may be the top node of a tree
+
+Nodes consist of a C-level libxml2 node, Node for short, and
+optionally a Python-level proxy node, Proxy. Zero, one or more Proxies can
+exist for a single Node.
+
+Proxies are garbage collected automatically by Python. Nodes are not
+garbage collected at all. Instead, explicit mechanisms are needed to
+free Nodes and the tree they may be the top of.
+
+A Node can be safely freed when:
+
+* no Proxy is connected to this Node
+
+* no Proxy can be created for this Node
+
+A Proxy cannot be created for a Node when:
+
+* no Proxy exists for any node that is connected to that Node
+
+This is the case when:
+
+* the Node is in a tree that has no Proxy connected to any of the nodes.
+
+This means that the whole tree in such a condition can be freed.
+
+Detecting whether a Node is in a tree that has no Proxies connected to
+it can be done by relying on Python's garbage collection
+algorithm. Each Proxy can have a reference to the Proxy that points to
+the top of the tree. In case of a document tree, this reference is to
+the Document Proxy. When no more references to the top Proxy exist in
+the system, no more Proxies exist that point into the Node tree that
+the top Proxy is the top of. If this Node tree is unconnected, i.e. it
+is not a subtree, the tree can be safely garbage collected.
+
+A special case exists for document references. Each Proxy will always
+have a reference to the Document Proxy, as any Node will have such a
+reference to the Document Node. This means that a Document Node can
+only be garbage collected when no Proxies at all exist that refer to
+the Document. This is a separate system from the top-Node references,
+even though the top Node will in many cases be the Document. This is
+because there is no way to get from a Document Proxy to a node that is
+not connected to the Document tree.
+
+This approach requires a system that can keep track of the top of the
+tree in any case. Usually this is simple: when a Proxy gets connected,
+the tree top becomes the tree top of whatever node it is connected
+to.
+
+Sometimes this is more difficult: a Proxy may exist pointing to a node
+in a subtree that just got connected. The top reference cannot be
+updated. This is a problem in the following case:
+
+          a                 h
+        /   \             /   \
+       b     c           i     j
+      / \   / \          |
+     d   e f   g         k
+
+Now imagine we have a proxy K to node k, and a proxy I to node i. They
+both have a pointer to proxy H.
+
+Now imagine i gets moved under g through proxy I. Proxy I will have an
+updated pointer to proxy A. However, proxy K cannot be updated and still
+points to H, from which it is now in fact disconnected.
+
+Proxy H cannot be removed now until proxy K is removed. In addition,
+proxy A has a refcount that is too low, because proxy K should point
+to it but does not.
+
+Another strategy involves keeping a reference count on the underlying
+nodes, one count per proxy. A node can only be freed if no
+descendant-or-self has a refcount higher than 0. When no more Python
+references to a node exist, these refcounts are checked first.
+The drawback of this is potentially heavy tree-walking each time a
+proxy is deallocated.
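+
+A rough sketch of this strategy in Python-like pseudo code (the real
+implementation lives at the C level; all names below are made up purely
+for illustration)::
+
+    def node_can_be_freed(node):
+        # a node may only be freed if no node in its subtree
+        # (including the node itself) is still referenced by a proxy
+        return all(n.proxy_refcount == 0
+                   for n in node.iter_descendants_or_self())
+
+    def on_proxy_deallocated(proxy):
+        node = proxy.node
+        node.proxy_refcount -= 1
+        top = node.tree_top()
+        if node_can_be_freed(top):   # potentially walks the whole tree
+            free_subtree(top)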
diff --git a/doc/mkhtml.py b/doc/mkhtml.py
new file mode 100644
index 0000000..c652335
--- /dev/null
+++ b/doc/mkhtml.py
@@ -0,0 +1,327 @@
+from __future__ import absolute_import
+
+from docstructure import SITE_STRUCTURE, HREF_MAP, BASENAME_MAP
+from lxml.etree import (parse, fromstring, ElementTree,
+ Element, SubElement, XPath, XML)
+import glob
+import hashlib
+import os
+import re
+import sys
+import copy
+import shutil
+import textwrap
+import subprocess
+
+from io import open as open_file
+
+RST2HTML_OPTIONS = " ".join([
+ '--no-toc-backlinks',
+ '--strip-comments',
+ '--language en',
+ '--date',
+ ])
+
+XHTML_NS = 'http://www.w3.org/1999/xhtml'
+htmlnsmap = {"h": XHTML_NS}
+
+find_head = XPath("/h:html/h:head[1]", namespaces=htmlnsmap)
+find_body = XPath("/h:html/h:body[1]", namespaces=htmlnsmap)
+find_title = XPath("/h:html/h:head/h:title/text()", namespaces=htmlnsmap)
+find_title_tag = XPath("/h:html/h:head/h:title", namespaces=htmlnsmap)
+find_headings = XPath("//h:h1[not(@class)]//text()", namespaces=htmlnsmap)
+find_heading_tag = XPath("//h:h1[@class = 'title'][1]", namespaces=htmlnsmap)
+find_menu = XPath("//h:ul[@id=$name]", namespaces=htmlnsmap)
+find_page_end = XPath("/h:html/h:body/h:div[last()]", namespaces=htmlnsmap)
+
+find_words = re.compile(r'(\w+)').findall
+replace_invalid = re.compile(r'[-_/.\s\\]').sub
+
+
+def make_menu_section_head(section, menuroot):
+ section_id = section + '-section'
+ section_head = menuroot.xpath("//ul[@id=$section]/li", section=section_id)
+ if not section_head:
+ ul = SubElement(menuroot, "ul", id=section_id)
+ section_head = SubElement(ul, "li")
+ title = SubElement(section_head, "span", {"class":"section title"})
+ title.text = section
+ else:
+ section_head = section_head[0]
+ return section_head
+
+
+def build_menu(tree, basename, section_head):
+ page_title = find_title(tree)
+ if page_title:
+ page_title = page_title[0]
+ else:
+ page_title = replace_invalid('', basename.capitalize())
+ build_menu_entry(page_title, basename+".html", section_head,
+ headings=find_headings(tree))
+
+
+def build_menu_entry(page_title, url, section_head, headings=None):
+ page_id = replace_invalid(' ', os.path.splitext(url)[0]) + '-menu'
+ ul = SubElement(section_head, "ul", {"class":"menu foreign", "id":page_id})
+
+ title = SubElement(ul, "li", {"class":"menu title"})
+ a = SubElement(title, "a", href=url)
+ a.text = page_title
+
+ if headings:
+ subul = SubElement(title, "ul", {"class":"submenu"})
+ for heading in headings:
+ li = SubElement(subul, "li", {"class":"menu item"})
+ try:
+ ref = heading.getparent().getparent().get('id')
+ except AttributeError:
+ ref = None
+ if ref is None:
+ ref = '-'.join(find_words(replace_invalid(' ', heading.lower())))
+ a = SubElement(li, "a", href=url+'#'+ref)
+ a.text = heading
+
+
+def merge_menu(tree, menu, name):
+ menu_root = copy.deepcopy(menu)
+ tree.getroot()[1][0].insert(0, menu_root) # html->body->div[class=document]
+ for el in menu_root.iter():
+ tag = el.tag
+ if tag[0] != '{':
+ el.tag = "{http://www.w3.org/1999/xhtml}" + tag
+ current_menu = find_menu(
+ menu_root, name=replace_invalid(' ', name) + '-menu')
+ if not current_menu:
+ current_menu = find_menu(
+ menu_root, name=replace_invalid('-', name) + '-menu')
+ if current_menu:
+ for submenu in current_menu:
+ submenu.set("class", submenu.get("class", "").
+ replace("foreign", "current"))
+ return tree
+
+
+def inject_flatter_button(tree):
+ head = tree.xpath('h:head[1]', namespaces=htmlnsmap)[0]
+ script = SubElement(head, '{%s}script' % XHTML_NS, type='text/javascript')
+ script.text = """
+ (function() {
+ var s = document.createElement('script');
+ var t = document.getElementsByTagName('script')[0];
+ s.type = 'text/javascript';
+ s.async = true;
+ s.src = 'http://api.flattr.com/js/0.6/load.js?mode=auto';
+ t.parentNode.insertBefore(s, t);
+ })();
+"""
+ script.tail = '\n'
+ intro_div = tree.xpath('h:body//h:div[@id = "introduction"][1]', namespaces=htmlnsmap)[0]
+ intro_div.insert(-1, XML(
+ '<p style="text-align: center;">Like working with lxml? '
+ 'Happy about the time that it just saved you? <br />'
+ 'Show your appreciation with <a href="http://flattr.com/thing/268156/lxml-The-Python-XML-Toolkit">Flattr</a>.<br />'
+ '<a class="FlattrButton" style="display:none;" rev="flattr;button:compact;" href="http://lxml.de/"></a>'
+ '</p>'
+ ))
+
+
+def inject_donate_buttons(lxml_path, rst2html_script, tree):
+ command = ([sys.executable, rst2html_script]
+ + RST2HTML_OPTIONS.split() + [os.path.join(lxml_path, 'README.rst')])
+ rst2html = subprocess.Popen(command, stdout=subprocess.PIPE)
+ stdout, _ = rst2html.communicate()
+ readme = fromstring(stdout)
+
+ intro_div = tree.xpath('h:body//h:div[@id = "introduction"][1]',
+ namespaces=htmlnsmap)[0]
+ support_div = readme.xpath('h:body//h:div[@id = "support-the-project"][1]',
+ namespaces=htmlnsmap)[0]
+ intro_div.append(support_div)
+
+ finance_div = readme.xpath('h:body//h:div[@id = "project-income-report"][1]',
+ namespaces=htmlnsmap)[0]
+ legal = readme.xpath('h:body//h:div[@id = "legal-notice-for-donations"][1]',
+ namespaces=htmlnsmap)[0]
+ last_div = tree.xpath('h:body//h:div//h:div', namespaces=htmlnsmap)[-1]
+ last_div.addnext(finance_div)
+ finance_div.addnext(legal)
+
+
+def inject_banner(parent):
+ banner = parent.makeelement('div', {'class': 'banner'})
+ parent.insert(0, banner)
+
+ banner_image = SubElement(banner, 'div', {'class': "banner_image"})
+ SubElement(banner_image, 'img', src="python-xml-title.png")
+
+ banner_text = SubElement(banner, 'div', {'class': "banner_link"})
+ banner_link = SubElement(banner_text, 'a', href="index.html#support-the-project")
+ banner_link.text = "Like the tool? "
+ SubElement(banner_link, 'br', {'class': "first"}).tail = "Help making it better! "
+ SubElement(banner_link, 'br', {'class': "second"}).tail = "Your donation helps!"
+
+
+def rest2html(script, source_path, dest_path, stylesheet_url):
+ command = ('%s %s %s --stylesheet=%s --link-stylesheet %s > %s' %
+ (sys.executable, script, RST2HTML_OPTIONS,
+ stylesheet_url, source_path, dest_path))
+ subprocess.call(command, shell=True)
+
+
+def convert_changelog(lxml_path, changelog_file_path, rst2html_script, stylesheet_url):
+ f = open_file(os.path.join(lxml_path, 'CHANGES.txt'), 'r', encoding='utf-8')
+ try:
+ content = f.read()
+ finally:
+ f.close()
+
+ links = dict(LP='`%s <https://bugs.launchpad.net/lxml/+bug/%s>`_',
+ GH='`%s <https://github.com/lxml/lxml/issues/%s>`_')
+ replace_tracker_links = re.compile('((LP|GH)#([0-9]+))').sub
+ def insert_link(match):
+ text, ref_type, ref_id = match.groups()
+ return links[ref_type] % (text, ref_id)
+ content = replace_tracker_links(insert_link, content)
+
+ command = [sys.executable, rst2html_script] + RST2HTML_OPTIONS.split() + [
+ '--link-stylesheet', '--stylesheet', stylesheet_url ]
+ out_file = open(changelog_file_path, 'wb')
+ try:
+ rst2html = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=out_file)
+ rst2html.communicate(content.encode('utf8'))
+ finally:
+ out_file.close()
+
+
+def publish(dirname, lxml_path, release):
+ if not os.path.exists(dirname):
+ os.mkdir(dirname)
+
+ doc_dir = os.path.join(lxml_path, 'doc')
+ script = os.path.join(doc_dir, 'rest2html.py')
+ pubkey = os.path.join(doc_dir, 'pubkey.asc')
+ stylesheet_file = 'style.css'
+
+ shutil.copy(pubkey, dirname)
+ # FIXME: find a way to make hashed filenames work both locally and in the versioned directories.
+ stylesheet_url = stylesheet_file
+ """
+ style_file_pattern = "style_%s.css"
+ for old_stylesheet in glob.iglob(os.path.join(dirname, style_file_pattern % "*")):
+ os.unlink(old_stylesheet)
+ with open(os.path.join(dirname, stylesheet_file), 'rb') as f:
+ css = f.read()
+ checksum = hashlib.sha256(css).hexdigest()[:32]
+
+ stylesheet_url = style_file_pattern % checksum
+ with open(os.path.join(dirname, stylesheet_url), 'wb') as out:
+ out.write(css)
+ """
+
+ href_map = HREF_MAP.copy()
+ changelog_basename = 'changes-%s' % release
+ href_map['Release Changelog'] = changelog_basename + '.html'
+
+ menu_js = textwrap.dedent('''
+ function trigger_menu(event) {
+ var sidemenu = document.getElementById("sidemenu");
+ var classes = sidemenu.getAttribute("class");
+ classes = (classes.indexOf(" visible") === -1) ? classes + " visible" : classes.replace(" visible", "");
+ sidemenu.setAttribute("class", classes);
+ event.preventDefault();
+ event.stopPropagation();
+ }
+ function hide_menu() {
+ var sidemenu = document.getElementById("sidemenu");
+ var classes = sidemenu.getAttribute("class");
+ if (classes.indexOf(" visible") !== -1) {
+ sidemenu.setAttribute("class", classes.replace(" visible", ""));
+ }
+ }
+ ''')
+
+ trees = {}
+ menu = Element("div", {'class': 'sidemenu', 'id': 'sidemenu'})
+ SubElement(menu, 'div', {'class': 'menutrigger', 'onclick': 'trigger_menu(event)'}).text = "Menu"
+ menu_div = SubElement(menu, 'div', {'class': 'menu'})
+ inject_banner(menu_div)
+
+ # build HTML pages and parse them back
+ for section, text_files in SITE_STRUCTURE:
+ section_head = make_menu_section_head(section, menu_div)
+ for filename in text_files:
+ if filename.startswith('@'):
+ # special menu entry
+ page_title = filename[1:]
+ url = href_map[page_title]
+ build_menu_entry(page_title, url, section_head)
+ else:
+ path = os.path.join(doc_dir, filename)
+ basename = os.path.splitext(os.path.basename(filename))[0]
+ basename = BASENAME_MAP.get(basename, basename)
+ outname = basename + '.html'
+ outpath = os.path.join(dirname, outname)
+
+ rest2html(script, path, outpath, stylesheet_url)
+ tree = parse(outpath)
+
+ page_div = tree.getroot()[1][0] # html->body->div[class=document]
+ inject_banner(page_div)
+
+ if filename == 'main.txt':
+ # inject donation buttons
+ #inject_flatter_button(tree)
+ inject_donate_buttons(lxml_path, script, tree)
+
+ trees[filename] = (tree, basename, outpath)
+ build_menu(tree, basename, section_head)
+
+ # also convert CHANGES.txt
+ convert_changelog(lxml_path, os.path.join(dirname, 'changes-%s.html' % release),
+ script, stylesheet_url)
+
+ # generate sitemap from menu
+ sitemap = XML(textwrap.dedent('''\
+ <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+ <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+ <head>
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
+ <title>Sitemap of lxml.de - Processing XML and HTML with Python</title>
+ <meta content="lxml - the most feature-rich and easy-to-use library for processing XML and HTML in the Python language"
+ name="description" />
+ <meta content="Python XML, XML, XML processing, HTML, lxml, simple XML, ElementTree, etree, lxml.etree, objectify, XML parsing, XML validation, XPath, XSLT"
+ name="keywords" />
+ </head>
+ <body>
+ <h1>Sitemap of lxml.de - Processing XML and HTML with Python</h1>
+ </body>
+ </html>
+ '''))
+ sitemap_menu = copy.deepcopy(menu)
+ SubElement(SubElement(sitemap_menu[-1], 'li'), 'a', href='http://lxml.de/files/').text = 'Download files'
+ sitemap[-1].append(sitemap_menu) # append to body
+ ElementTree(sitemap).write(os.path.join(dirname, 'sitemap.html'))
+
+ # integrate sitemap into the menu
+ SubElement(SubElement(menu_div[-1], 'li'), 'a', href='/sitemap.html').text = 'Sitemap'
+
+ # integrate menu into web pages
+ for tree, basename, outpath in trees.values():
+ head = find_head(tree)[0]
+ SubElement(head, 'script', type='text/javascript').text = menu_js
+ SubElement(head, 'meta', name='viewport', content="width=device-width, initial-scale=1")
+ find_body(tree)[0].set('onclick', 'hide_menu()')
+
+ new_tree = merge_menu(tree, menu, basename)
+ title = find_title_tag(new_tree)
+ if title and title[0].text == 'lxml':
+ title[0].text = "lxml - Processing XML and HTML with Python"
+ heading = find_heading_tag(new_tree)
+ if heading:
+ heading[0].text = "lxml - XML and HTML with Python"
+ new_tree.write(outpath)
+
+
+if __name__ == '__main__':
+ publish(sys.argv[1], sys.argv[2], sys.argv[3])
diff --git a/doc/mklatex.py b/doc/mklatex.py
new file mode 100644
index 0000000..2bb73b7
--- /dev/null
+++ b/doc/mklatex.py
@@ -0,0 +1,334 @@
+# The script builds the LaTeX documentation.
+# Testing:
+# python mklatex.py latex .. 1.0
+
+from docstructure import SITE_STRUCTURE, BASENAME_MAP
+import os, shutil, re, sys, datetime
+
+TARGET_FILE = "lxmldoc.tex"
+
+RST2LATEX_OPTIONS = " ".join([
+# "--no-toc-backlinks",
+ "--strip-comments",
+ "--language en",
+# "--date",
+# "--use-latex-footnotes",
+ "--use-latex-citations",
+ "--use-latex-toc",
+ "--font-encoding=T1",
+ "--output-encoding=utf-8",
+ "--input-encoding=utf-8",
+ "--graphicx-option=pdftex",
+ ])
+
+htmlnsmap = {"h" : "http://www.w3.org/1999/xhtml"}
+
+replace_invalid = re.compile(r'[-_/.\s\\]').sub
+replace_content = re.compile(r"\{[^\}]*\}").sub
+
+replace_epydoc_macros = re.compile(r'(,\s*amssymb|dvips\s*,\s*)').sub
+replace_rst_macros = re.compile(r'(\\usepackage\{color}|\\usepackage\[[^]]*]\{hyperref})').sub
+
+BASENAME_MAP = BASENAME_MAP.copy()
+BASENAME_MAP.update({'api' : 'lxmlapi'})
+
+# LaTeX snippets
+
+DOCUMENT_CLASS = r"""
+\documentclass[10pt,english]{report}
+\usepackage[a4paper]{geometry}
+\usepackage{tabularx}
+\usepackage{ifthen}
+\usepackage[pdftex]{graphicx}
+\parindent0pt
+\parskip1ex
+
+%%% Fallback definitions for Docutils-specific commands
+
+% providelength (provide a length variable and set default, if it is new)
+\providecommand*{\DUprovidelength}[2]{
+ \ifthenelse{\isundefined{#1}}{\newlength{#1}\setlength{#1}{#2}}{}
+}
+
+% docinfo (width of docinfo table)
+\DUprovidelength{\DUdocinfowidth}{0.9\textwidth}
+
+% titlereference role
+\providecommand*{\DUroletitlereference}[1]{\textsl{#1}}
+
+"""
+
+PYGMENTS_IMPORT = r"""
+\usepackage{fancyvrb}
+\input{_part_pygments.tex}
+"""
+
+EPYDOC_IMPORT = r"""
+\input{_part_epydoc.tex}
+"""
+
+def write_chapter(master, title, filename):
+ filename = os.path.join(os.path.dirname(filename),
+ "_part_%s" % os.path.basename(filename))
+ master.write(r"""
+\chapter{%s}
+\label{%s}
+\input{%s}
+""" % (title, filename, filename))
+
+
+# the program ----
+
+def rest2latex(script, source_path, dest_path):
+ command = ('%s %s %s %s > %s' %
+ (sys.executable, script, RST2LATEX_OPTIONS,
+ source_path, dest_path))
+ os.system(command)
+
+def build_pygments_macros(filename):
+ from pygments.formatters import LatexFormatter
+ text = LatexFormatter().get_style_defs()
+ with open(filename, "w") as f:
+ f.write(text)
+ f.write('\n')
+
+def copy_epydoc_macros(src, dest, existing_header_lines):
+ doc = open(src, 'r')
+ out = open(dest, "w")
+ for line in doc:
+ if line.startswith('%% generator') \
+ or line.startswith('% generated by ') \
+ or '\\begin{document}' in line \
+ or '\\makeindex' in line:
+ break
+ if line.startswith('%') or \
+ r'\documentclass' in line or \
+ r'\makeindex' in line or \
+ r'{inputenc}' in line:
+ continue
+ if line.startswith(r'\usepackage'):
+ if line in existing_header_lines:
+ continue
+ if '{hyperref}' in line:
+ line = line.replace('black', 'blue')
+ out.write( replace_epydoc_macros('', line) )
+ out.close()
+ doc.close()
+
+def noop(input):
+ return input
+
+counter_no = 0
+
+def tex_postprocess(src_path, dest_path, want_header=False, process_line=noop):
+ """
+ Postprocessing of the LaTeX file generated from ReST.
+
+ Reads file src_path and saves to dest_path only the true content
+ (without the document header and final) - so it is suitable
+ to be used as part of the longer document.
+
+ Returns the title of document
+
+ If want_header is set, returns also the document header (as
+ the list of lines).
+ """
+ header = []
+ add_header_line = header.append
+ global counter_no
+ counter_no = counter_no + 1
+ counter_text = "listcnt%d" % counter_no
+
+ search_title = re.compile(r'\\title{([^{}]*(?:{[^}]*})*)}').search
+ skipping = re.compile(r'(\\end{document}|\\tableofcontents|^%)').search
+
+ with open(src_path) as src:
+ src_text = src.read()
+
+ dest = open(dest_path, "w")
+
+ title = search_title(src_text)
+ if title:
+ # remove any commands from the title
+ title = re.sub(r'\\\w+({[^}]*})?', '', title.group(1))
+
+ iter_lines = iter(src_text.splitlines())
+ for l in iter_lines:
+ l = process_line(l)
+ if not l:
+ continue
+ if want_header:
+ add_header_line(replace_rst_macros('', l))
+ if l.startswith("\\maketitle"):
+ break
+
+ for l in iter_lines:
+ l = process_line(l)
+ if skipping(l):
+ # To-Do minitoc instead of tableofcontents
+ continue
+        elif r"\hypertarget{old-versions}" in l:
+ break
+ elif "listcnt0" in l:
+ l = l.replace("listcnt0", counter_text)
+ dest.write(l + '\n')
+
+ dest.close()
+
+ if not title:
+ raise Exception("Bueee, no title in %s" % src_path)
+ return title, header
+
+def publish(dirname, lxml_path, release):
+ if not os.path.exists(dirname):
+ os.mkdir(dirname)
+
+ book_title = "lxml %s" % release
+
+ doc_dir = os.path.join(lxml_path, 'doc')
+ script = os.path.join(doc_dir, 'rest2latex.py')
+ pubkey = os.path.join(doc_dir, 'pubkey.asc')
+
+ shutil.copy(pubkey, dirname)
+
+ # build pygments macros
+ build_pygments_macros(os.path.join(dirname, '_part_pygments.tex'))
+
+ # Used in postprocessing of generated LaTeX files
+ header = []
+ titles = {}
+
+ replace_interdoc_hyperrefs = re.compile(
+ r'\\href\{([^/}]+)[.]([^./}]+)\}').sub
+ replace_docinternal_hyperrefs = re.compile(
+ r'\\href\{\\#([^}]+)\}').sub
+ replace_image_paths = re.compile(
+ r'^(\\includegraphics{)').sub
+ def build_hyperref(match):
+ basename, extension = match.groups()
+ outname = BASENAME_MAP.get(basename, basename)
+ if '#' in extension:
+ anchor = extension.split('#')[-1]
+ return r"\hyperref[%s]" % anchor
+ elif extension != 'html':
+ return r'\href{http://lxml.de/%s.%s}' % (
+ outname, extension)
+ else:
+ return r"\hyperref[_part_%s.tex]" % outname
+ def fix_relative_hyperrefs(line):
+ line = replace_image_paths(r'\1../html/', line)
+ if r'\href' not in line:
+ return line
+ line = replace_interdoc_hyperrefs(build_hyperref, line)
+ return replace_docinternal_hyperrefs(r'\\hyperref[\1]', line)
+
+ # Building pages
+ for section, text_files in SITE_STRUCTURE:
+ for filename in text_files:
+ if filename.startswith('@'):
+ continue
+ #page_title = filename[1:]
+ #url = href_map[page_title]
+ #build_menu_entry(page_title, url, section_head)
+
+ basename = os.path.splitext(os.path.basename(filename))[0]
+ basename = BASENAME_MAP.get(basename, basename)
+ outname = basename + '.tex'
+ outpath = os.path.join(dirname, outname)
+ path = os.path.join(doc_dir, filename)
+
+ print("Creating %s" % outname)
+ rest2latex(script, path, outpath)
+
+ final_name = os.path.join(dirname, os.path.dirname(outname),
+ "_part_%s" % os.path.basename(outname))
+
+ title, hd = tex_postprocess(outpath, final_name,
+ want_header = not header,
+ process_line=fix_relative_hyperrefs)
+ if not header:
+ header = hd
+ titles[outname] = title
+
+ # integrate generated API docs
+
+ print("Integrating API docs")
+ apidocsname = 'api.tex'
+ apipath = os.path.join(dirname, apidocsname)
+ tex_postprocess(apipath, os.path.join(dirname, "_part_%s" % apidocsname),
+ process_line=fix_relative_hyperrefs)
+ copy_epydoc_macros(apipath, os.path.join(dirname, '_part_epydoc.tex'),
+ set(header))
+
+ # convert CHANGES.txt
+
+ print("Integrating ChangeLog")
+ find_version_title = re.compile(
+ r'(.*\\section\{)([0-9][^\} ]*)\s+\(([^)]+)\)(\}.*)').search
+ def fix_changelog(line):
+ m = find_version_title(line)
+ if m:
+ line = "%sChanges in version %s, released %s%s" % m.groups()
+ else:
+ line = line.replace(r'\subsection{', r'\subsection*{')
+ return line
+
+ chgname = 'changes-%s.tex' % release
+ chgpath = os.path.join(dirname, chgname)
+ rest2latex(script,
+ os.path.join(lxml_path, 'CHANGES.txt'),
+ chgpath)
+ tex_postprocess(chgpath, os.path.join(dirname, "_part_%s" % chgname),
+ process_line=fix_changelog)
+
+ # Writing a master file
+ print("Building %s\n" % TARGET_FILE)
+ master = open( os.path.join(dirname, TARGET_FILE), "w")
+ for hln in header:
+ if hln.startswith(r"\documentclass"):
+ #hln = hln.replace('article', 'book')
+ hln = DOCUMENT_CLASS + EPYDOC_IMPORT
+ elif hln.startswith(r"\begin{document}"):
+ # pygments and epydoc support
+ master.write(PYGMENTS_IMPORT)
+ elif hln.startswith(r"\title{"):
+ hln = replace_content(
+ r'{%s\\\\\\vspace{1cm}\\includegraphics[width=2.5cm]{../html/tagpython-big.png}}' % book_title, hln)
+ elif hln.startswith(r"\date{"):
+ hln = replace_content(
+ r'{%s}' % datetime.date.today().isoformat(), hln)
+ elif hln.startswith("pdftitle"):
+ hln = replace_content(
+ r'{%s}' % book_title, hln)
+ master.write(hln + '\n')
+
+ master.write("\\setcounter{page}{2}\n")
+ master.write("\\tableofcontents\n")
+
+ for section, text_files in SITE_STRUCTURE:
+ master.write("\n\n\\part{%s}\n" % section)
+ for filename in text_files:
+ if filename.startswith('@'):
+ continue
+ #print "Not yet implemented: %s" % filename[1:]
+ #page_title = filename[1:]
+ #url = href_map[page_title]
+ #build_menu_entry(page_title, url, section_head)
+ else:
+ basename = os.path.splitext(os.path.basename(filename))[0]
+ basename = BASENAME_MAP.get(basename, basename)
+ outname = basename + '.tex'
+ write_chapter(master, titles[outname], outname)
+
+ master.write("\\appendix\n")
+ master.write("\\begin{appendix}\n")
+
+ write_chapter(master, "Changes", chgname)
+ write_chapter(master, "Generated API documentation", apidocsname)
+
+ master.write("\\end{appendix}\n")
+ master.write("\\end{document}\n")
+
+
+if __name__ == '__main__':
+ publish(sys.argv[1], sys.argv[2], sys.argv[3])
diff --git a/doc/objectify.txt b/doc/objectify.txt
new file mode 100644
index 0000000..f490f90
--- /dev/null
+++ b/doc/objectify.txt
@@ -0,0 +1,1409 @@
+==============
+lxml.objectify
+==============
+
+:Authors:
+ Stefan Behnel, Holger Joukl
+
+lxml supports an alternative API similar to the Amara_ bindery or
+gnosis.xml.objectify_ through a `custom Element implementation`_. The main idea
+is to hide the usage of XML behind normal Python objects, sometimes referred
+to as data-binding. It allows you to use XML as if you were dealing with a
+normal Python object hierarchy.
+
+Accessing the children of an XML element deploys object attribute access. If
+there are multiple children with the same name, slicing and indexing can be
+used. Python data types are extracted from XML content automatically and made
+available to the normal Python operators.
+
+.. contents::
+..
+ 1 The lxml.objectify API
+ 1.1 Element access through object attributes
+ 1.2 Creating objectify trees
+ 1.3 Tree generation with the E-factory
+ 1.4 Namespace handling
+ 2 Asserting a Schema
+ 3 ObjectPath
+ 4 Python data types
+ 4.1 Recursive tree dump
+ 4.2 Recursive string representation of elements
+ 5 How data types are matched
+ 5.1 Type annotations
+ 5.2 XML Schema datatype annotation
+ 5.3 The DataElement factory
+ 5.4 Defining additional data classes
+ 5.5 Advanced element class lookup
+ 6 What is different from lxml.etree?
+
+.. _Amara: http://uche.ogbuji.net/tech/4suite/amara/
+.. _gnosis.xml.objectify: http://gnosis.cx/download/
+.. _`benchmark page`: performance.html#lxml-objectify
+.. _`custom Element implementation`: element_classes.html
+
+To set up and use ``objectify``, you need both the ``lxml.etree``
+module and ``lxml.objectify``:
+
+.. sourcecode:: pycon
+
+ >>> from lxml import etree
+ >>> from lxml import objectify
+
+The objectify API is very different from the ElementTree API. If it
+is used, it should not be mixed with other element implementations
+(such as trees parsed with ``lxml.etree``), to avoid non-obvious
+behaviour.
+
+The `benchmark page`_ has some hints on performance optimisation of
+code using lxml.objectify.
+
+To make the doctests in this document look a little nicer, we also use
+this:
+
+.. sourcecode:: pycon
+
+ >>> import lxml.usedoctest
+
+Imported from within a doctest, this relieves us from caring about the exact
+formatting of XML output.
+
+..
+ >>> try: from StringIO import StringIO
+ ... except ImportError:
+ ... from io import BytesIO # Python 3
+ ... def StringIO(s):
+ ... if isinstance(s, str): s = s.encode('UTF-8')
+ ... return BytesIO(s)
+
+..
+ >>> import sys
+ >>> from lxml import etree as _etree
+ >>> if sys.version_info[0] >= 3:
+ ... class etree_mock(object):
+ ... def __getattr__(self, name): return getattr(_etree, name)
+ ... def tostring(self, *args, **kwargs):
+ ... s = _etree.tostring(*args, **kwargs)
+ ... if isinstance(s, bytes) and bytes([10]) in s: s = s.decode("utf-8") # CR
+ ... if s[-1] == '\n': s = s[:-1]
+ ... return s
+ ... else:
+ ... class etree_mock(object):
+ ... def __getattr__(self, name): return getattr(_etree, name)
+ ... def tostring(self, *args, **kwargs):
+ ... s = _etree.tostring(*args, **kwargs)
+ ... if s[-1] == '\n': s = s[:-1]
+ ... return s
+ >>> etree = etree_mock()
+
+
+The lxml.objectify API
+======================
+
+In ``lxml.objectify``, element trees provide an API that models the behaviour
+of normal Python object trees as closely as possible.
+
+
+Element access through object attributes
+----------------------------------------
+
+The main idea behind the ``objectify`` API is to hide XML element access
+behind the usual object attribute access pattern. Asking an element for an
+attribute will return the sequence of children with corresponding tag names:
+
+.. sourcecode:: pycon
+
+ >>> root = objectify.Element("root")
+ >>> b = objectify.SubElement(root, "b")
+ >>> print(root.b[0].tag)
+ b
+ >>> root.index(root.b[0])
+ 0
+ >>> b = objectify.SubElement(root, "b")
+ >>> print(root.b[0].tag)
+ b
+ >>> print(root.b[1].tag)
+ b
+ >>> root.index(root.b[1])
+ 1
+
+For convenience, you can omit the index '0' to access the first child:
+
+.. sourcecode:: pycon
+
+ >>> print(root.b.tag)
+ b
+ >>> root.index(root.b)
+ 0
+ >>> del root.b
+
+Iteration and slicing also obey the requested tag:
+
+.. sourcecode:: pycon
+
+ >>> x1 = objectify.SubElement(root, "x")
+ >>> x2 = objectify.SubElement(root, "x")
+ >>> x3 = objectify.SubElement(root, "x")
+
+ >>> [ el.tag for el in root.x ]
+ ['x', 'x', 'x']
+
+ >>> [ el.tag for el in root.x[1:3] ]
+ ['x', 'x']
+
+ >>> [ el.tag for el in root.x[-1:] ]
+ ['x']
+
+ >>> del root.x[1:2]
+ >>> [ el.tag for el in root.x ]
+ ['x', 'x']
+
+If you want to iterate over all children or need to provide a specific
+namespace for the tag, use the ``iterchildren()`` method. Like the other
+methods for iteration, it supports an optional tag keyword argument:
+
+.. sourcecode:: pycon
+
+ >>> [ el.tag for el in root.iterchildren() ]
+ ['b', 'x', 'x']
+
+ >>> [ el.tag for el in root.iterchildren(tag='b') ]
+ ['b']
+
+ >>> [ el.tag for el in root.b ]
+ ['b']
+
+XML attributes are accessed as in the normal ElementTree API:
+
+.. sourcecode:: pycon
+
+ >>> c = objectify.SubElement(root, "c", myattr="someval")
+ >>> print(root.c.get("myattr"))
+ someval
+
+ >>> root.c.set("c", "oh-oh")
+ >>> print(root.c.get("c"))
+ oh-oh
+
+In addition to the normal ElementTree API for appending elements to trees,
+subtrees can also be added by assigning them to object attributes. In this
+case, the subtree is automatically deep copied and the tag name of its root is
+updated to match the attribute name:
+
+.. sourcecode:: pycon
+
+ >>> el = objectify.Element("yet_another_child")
+ >>> root.new_child = el
+ >>> print(root.new_child.tag)
+ new_child
+ >>> print(el.tag)
+ yet_another_child
+
+ >>> root.y = [ objectify.Element("y"), objectify.Element("y") ]
+ >>> [ el.tag for el in root.y ]
+ ['y', 'y']
+
+The latter is a short form for operations on the full slice:
+
+.. sourcecode:: pycon
+
+ >>> root.y[:] = [ objectify.Element("y") ]
+ >>> [ el.tag for el in root.y ]
+ ['y']
+
+You can also replace children that way:
+
+.. sourcecode:: pycon
+
+ >>> child1 = objectify.SubElement(root, "child")
+ >>> child2 = objectify.SubElement(root, "child")
+ >>> child3 = objectify.SubElement(root, "child")
+
+ >>> el = objectify.Element("new_child")
+ >>> subel = objectify.SubElement(el, "sub")
+
+ >>> root.child = el
+ >>> print(root.child.sub.tag)
+ sub
+
+ >>> root.child[2] = el
+ >>> print(root.child[2].sub.tag)
+ sub
+
+Note that special care must be taken when changing the tag name of an element:
+
+.. sourcecode:: pycon
+
+ >>> print(root.b.tag)
+ b
+ >>> root.b.tag = "notB"
+ >>> root.b
+ Traceback (most recent call last):
+ ...
+ AttributeError: no such child: b
+ >>> print(root.notB.tag)
+ notB
+
+
+Creating objectify trees
+------------------------
+
+As with ``lxml.etree``, you can either create an ``objectify`` tree by
+parsing an XML document or by building one from scratch. To parse a
+document, just use the ``parse()`` or ``fromstring()`` functions of
+the module:
+
+.. sourcecode:: pycon
+
+ >>> fileobject = StringIO('<test/>')
+
+ >>> tree = objectify.parse(fileobject)
+ >>> print(isinstance(tree.getroot(), objectify.ObjectifiedElement))
+ True
+
+ >>> root = objectify.fromstring('<test/>')
+ >>> print(isinstance(root, objectify.ObjectifiedElement))
+ True
+
+To build a new tree in memory, ``objectify`` replicates the standard
+factory function ``Element()`` from ``lxml.etree``:
+
+.. sourcecode:: pycon
+
+ >>> obj_el = objectify.Element("new")
+ >>> print(isinstance(obj_el, objectify.ObjectifiedElement))
+ True
+
+After creating such an Element, you can use the `usual API`_ of
+lxml.etree to add SubElements to the tree:
+
+.. sourcecode:: pycon
+
+ >>> child = objectify.SubElement(obj_el, "newchild", attr="value")
+
+.. _`usual API`: tutorial.html#the-element-class
+
+New subelements will automatically inherit the objectify behaviour
+from their tree. However, all independent elements that you create
+through the ``Element()`` factory of lxml.etree (instead of objectify)
+will not support the ``objectify`` API by themselves:
+
+.. sourcecode:: pycon
+
+ >>> subel = objectify.SubElement(obj_el, "sub")
+ >>> print(isinstance(subel, objectify.ObjectifiedElement))
+ True
+
+ >>> independent_el = etree.Element("new")
+ >>> print(isinstance(independent_el, objectify.ObjectifiedElement))
+ False
+
+
+Tree generation with the E-factory
+----------------------------------
+
+To simplify the generation of trees even further, you can use the E-factory:
+
+.. sourcecode:: pycon
+
+ >>> E = objectify.E
+ >>> root = E.root(
+ ... E.a(5),
+ ... E.b(6.21),
+ ... E.c(True),
+ ... E.d("how", tell="me")
+ ... )
+
+ >>> print(etree.tostring(root, pretty_print=True))
+ <root xmlns:py="http://codespeak.net/lxml/objectify/pytype">
+ <a py:pytype="int">5</a>
+ <b py:pytype="float">6.21</b>
+ <c py:pytype="bool">true</c>
+ <d py:pytype="str" tell="me">how</d>
+ </root>
+
+This allows you to write up a specific language in tags:
+
+.. sourcecode:: pycon
+
+ >>> ROOT = objectify.E.root
+ >>> TITLE = objectify.E.title
+ >>> HOWMANY = getattr(objectify.E, "how-many")
+
+ >>> root = ROOT(
+ ... TITLE("The title"),
+ ... HOWMANY(5)
+ ... )
+
+ >>> print(etree.tostring(root, pretty_print=True))
+ <root xmlns:py="http://codespeak.net/lxml/objectify/pytype">
+ <title py:pytype="str">The title</title>
+ <how-many py:pytype="int">5</how-many>
+ </root>
+
+``objectify.E`` is an instance of ``objectify.ElementMaker``. By default, it
+creates pytype annotated Elements without a namespace. You can switch off the
+pytype annotation by passing False to the ``annotate`` keyword argument of the
+constructor. You can also pass a default namespace and an ``nsmap``:
+
+.. sourcecode:: pycon
+
+ >>> myE = objectify.ElementMaker(annotate=False,
+ ... namespace="http://my/ns", nsmap={None : "http://my/ns"})
+
+ >>> root = myE.root( myE.someint(2) )
+
+ >>> print(etree.tostring(root, pretty_print=True))
+ <root xmlns="http://my/ns">
+ <someint>2</someint>
+ </root>
+
+
+Namespace handling
+------------------
+
+During tag lookups, namespaces are handled mostly behind the scenes.
+If you access a child of an Element without specifying a namespace,
+the lookup will use the namespace of the parent:
+
+.. sourcecode:: pycon
+
+ >>> root = objectify.Element("{http://ns/}root")
+ >>> b = objectify.SubElement(root, "{http://ns/}b")
+ >>> c = objectify.SubElement(root, "{http://other/}c")
+
+ >>> print(root.b.tag)
+ {http://ns/}b
+
+Note that the ``SubElement()`` factory of ``lxml.etree`` does not
+inherit any namespaces when creating a new subelement. Element
+creation must be explicit about the namespace, and is simplified
+through the E-factory as described above.
+
+Lookups, however, inherit namespaces implicitly:
+
+.. sourcecode:: pycon
+
+ >>> print(root.b.tag)
+ {http://ns/}b
+
+ >>> print(root.c)
+ Traceback (most recent call last):
+ ...
+ AttributeError: no such child: {http://ns/}c
+
+To access an element in a different namespace than its parent, you can
+use ``getattr()``:
+
+.. sourcecode:: pycon
+
+ >>> c = getattr(root, "{http://other/}c")
+ >>> print(c.tag)
+ {http://other/}c
+
+For convenience, there is also a quick way through item access:
+
+.. sourcecode:: pycon
+
+ >>> c = root["{http://other/}c"]
+ >>> print(c.tag)
+ {http://other/}c
+
+The same approach must be used to access children with tag names that are not
+valid Python identifiers:
+
+.. sourcecode:: pycon
+
+ >>> el = objectify.SubElement(root, "{http://ns/}tag-name")
+ >>> print(root["tag-name"].tag)
+ {http://ns/}tag-name
+
+ >>> new_el = objectify.Element("{http://ns/}new-element")
+ >>> el = objectify.SubElement(new_el, "{http://ns/}child")
+ >>> el = objectify.SubElement(new_el, "{http://ns/}child")
+ >>> el = objectify.SubElement(new_el, "{http://ns/}child")
+
+ >>> root["tag-name"] = [ new_el, new_el ]
+ >>> print(len(root["tag-name"]))
+ 2
+ >>> print(root["tag-name"].tag)
+ {http://ns/}tag-name
+
+ >>> print(len(root["tag-name"].child))
+ 3
+ >>> print(root["tag-name"].child.tag)
+ {http://ns/}child
+ >>> print(root["tag-name"][1].child.tag)
+ {http://ns/}child
+
+or for names that have a special meaning in lxml.objectify:
+
+.. sourcecode:: pycon
+
+ >>> root = objectify.XML("<root><text>TEXT</text></root>")
+
+ >>> print(root.text.text)
+ Traceback (most recent call last):
+ ...
+ AttributeError: 'NoneType' object has no attribute 'text'
+
+ >>> print(root["text"].text)
+ TEXT
+
+
+Asserting a Schema
+==================
+
+When dealing with XML documents from different sources, you will often
+require them to follow a common schema. In lxml.objectify, this
+directly translates to enforcing a specific object tree, i.e. expected
+object attributes are ensured to be there and to have the expected
+type. This can easily be achieved through XML Schema validation at
+parse time. Also see the `documentation on validation`_ on this
+topic.
+
+.. _`documentation on validation`: validation.html
+
+First of all, we need a parser that knows our schema, so let's say we
+parse the schema from a file-like object (or file or filename):
+
+.. sourcecode:: pycon
+
+ >>> f = StringIO('''\
+ ... <xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">
+ ... <xsd:element name="a" type="AType"/>
+ ... <xsd:complexType name="AType">
+ ... <xsd:sequence>
+ ... <xsd:element name="b" type="xsd:string" />
+ ... </xsd:sequence>
+ ... </xsd:complexType>
+ ... </xsd:schema>
+ ... ''')
+ >>> schema = etree.XMLSchema(file=f)
+
+When creating the validating parser, we must make sure it `returns
+objectify trees`_. This is best done with the ``makeparser()``
+function:
+
+.. sourcecode:: pycon
+
+ >>> parser = objectify.makeparser(schema = schema)
+
+.. _`returns objectify trees`: #advanced-element-class-lookup
+
+Now we can use it to parse a valid document:
+
+.. sourcecode:: pycon
+
+ >>> xml = "<a><b>test</b></a>"
+ >>> a = objectify.fromstring(xml, parser)
+
+ >>> print(a.b)
+ test
+
+Or an invalid document:
+
+.. sourcecode:: pycon
+
+ >>> xml = b"<a><b>test</b><c/></a>"
+ >>> a = objectify.fromstring(xml, parser) # doctest: +ELLIPSIS
+ Traceback (most recent call last):
+ lxml.etree.XMLSyntaxError: Element 'c': This element is not expected...
+
+Note that the same works for parse-time DTD validation, except that
+DTDs do not support any data types by design.
+
+
+ObjectPath
+==========
+
+For both convenience and speed, objectify supports its own path language,
+represented by the ``ObjectPath`` class:
+
+.. sourcecode:: pycon
+
+ >>> root = objectify.Element("{http://ns/}root")
+ >>> b1 = objectify.SubElement(root, "{http://ns/}b")
+ >>> c = objectify.SubElement(b1, "{http://ns/}c")
+ >>> b2 = objectify.SubElement(root, "{http://ns/}b")
+ >>> d = objectify.SubElement(root, "{http://other/}d")
+
+ >>> path = objectify.ObjectPath("root.b.c")
+ >>> print(path)
+ root.b.c
+ >>> path.hasattr(root)
+ True
+ >>> print(path.find(root).tag)
+ {http://ns/}c
+
+ >>> find = objectify.ObjectPath("root.b.c")
+ >>> print(find(root).tag)
+ {http://ns/}c
+
+ >>> find = objectify.ObjectPath("root.{http://other/}d")
+ >>> print(find(root).tag)
+ {http://other/}d
+
+ >>> find = objectify.ObjectPath("root.{not}there")
+ >>> print(find(root).tag)
+ Traceback (most recent call last):
+ ...
+ AttributeError: no such child: {not}there
+
+ >>> find = objectify.ObjectPath("{not}there")
+ >>> print(find(root).tag)
+ Traceback (most recent call last):
+ ...
+ ValueError: root element does not match: need {not}there, got {http://ns/}root
+
+ >>> find = objectify.ObjectPath("root.b[1]")
+ >>> print(find(root).tag)
+ {http://ns/}b
+
+ >>> find = objectify.ObjectPath("root.{http://ns/}b[1]")
+ >>> print(find(root).tag)
+ {http://ns/}b
+
+Apart from strings, ObjectPath also accepts lists of path segments:
+
+.. sourcecode:: pycon
+
+ >>> find = objectify.ObjectPath(['root', 'b', 'c'])
+ >>> print(find(root).tag)
+ {http://ns/}c
+
+ >>> find = objectify.ObjectPath(['root', '{http://ns/}b[1]'])
+ >>> print(find(root).tag)
+ {http://ns/}b
+
+You can also use relative paths starting with a '.' to ignore the actual root
+element and only inherit its namespace:
+
+.. sourcecode:: pycon
+
+ >>> find = objectify.ObjectPath(".b[1]")
+ >>> print(find(root).tag)
+ {http://ns/}b
+
+ >>> find = objectify.ObjectPath(['', 'b[1]'])
+ >>> print(find(root).tag)
+ {http://ns/}b
+
+ >>> find = objectify.ObjectPath(".unknown[1]")
+ >>> print(find(root).tag)
+ Traceback (most recent call last):
+ ...
+ AttributeError: no such child: {http://ns/}unknown
+
+ >>> find = objectify.ObjectPath(".{http://other/}unknown[1]")
+ >>> print(find(root).tag)
+ Traceback (most recent call last):
+ ...
+ AttributeError: no such child: {http://other/}unknown
+
+For convenience, a single dot represents the empty ObjectPath (identity):
+
+.. sourcecode:: pycon
+
+ >>> find = objectify.ObjectPath(".")
+ >>> print(find(root).tag)
+ {http://ns/}root
+
+ObjectPath objects can be used to manipulate trees:
+
+.. sourcecode:: pycon
+
+ >>> root = objectify.Element("{http://ns/}root")
+
+ >>> path = objectify.ObjectPath(".some.child.{http://other/}unknown")
+ >>> path.hasattr(root)
+ False
+ >>> path.find(root)
+ Traceback (most recent call last):
+ ...
+ AttributeError: no such child: {http://ns/}some
+
+ >>> path.setattr(root, "my value") # creates children as necessary
+ >>> path.hasattr(root)
+ True
+ >>> print(path.find(root).text)
+ my value
+ >>> print(root.some.child["{http://other/}unknown"].text)
+ my value
+
+ >>> print(len( path.find(root) ))
+ 1
+ >>> path.addattr(root, "my new value")
+ >>> print(len( path.find(root) ))
+ 2
+ >>> [ el.text for el in path.find(root) ]
+ ['my value', 'my new value']
+
+As with attribute assignment, ``setattr()`` accepts lists:
+
+.. sourcecode:: pycon
+
+ >>> path.setattr(root, ["v1", "v2", "v3"])
+ >>> [ el.text for el in path.find(root) ]
+ ['v1', 'v2', 'v3']
+
+
+Note, however, that indexing is only supported in this context if the children
+already exist. Indexing of non-existing children will not extend or create a
+list of such children, but will raise an exception:
+
+.. sourcecode:: pycon
+
+ >>> path = objectify.ObjectPath(".{non}existing[1]")
+ >>> path.setattr(root, "my value")
+ Traceback (most recent call last):
+ ...
+ TypeError: creating indexed path attributes is not supported
+
+It is worth noting that ObjectPath does not depend on the ``objectify`` module
+or the ObjectifiedElement implementation. It can also be used in combination
+with Elements from the normal lxml.etree API.
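+
+For instance, an ``ObjectPath`` can be applied directly to an element parsed
+with plain ``lxml.etree`` (a small illustration with made-up XML content):
+
+.. sourcecode:: pycon
+
+    >>> plain_root = etree.fromstring("<root><a><b>42</b></a></root>")
+    >>> find = objectify.ObjectPath("root.a.b")
+    >>> print(find(plain_root).text)
+    42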
+
+
+Python data types
+=================
+
+The objectify module knows about Python data types and tries its best to let
+element content behave like them. For example, they support the normal math
+operators:
+
+.. sourcecode:: pycon
+
+ >>> root = objectify.fromstring(
+ ... "<root><a>5</a><b>11</b><c>true</c><d>hoi</d></root>")
+ >>> root.a + root.b
+ 16
+ >>> root.a += root.b
+ >>> print(root.a)
+ 16
+
+ >>> root.a = 2
+ >>> print(root.a + 2)
+ 4
+ >>> print(1 + root.a)
+ 3
+
+ >>> print(root.c)
+ True
+ >>> root.c = False
+ >>> if not root.c:
+ ... print("false!")
+ false!
+
+ >>> print(root.d + " test !")
+ hoi test !
+ >>> root.d = "%s - %s"
+ >>> print(root.d % (1234, 12345))
+ 1234 - 12345
+
+However, data elements continue to provide the objectify API. This means that
+sequence operations such as ``len()``, slicing and indexing (e.g. of strings)
+cannot behave like the corresponding Python types. Like all other tree
+elements, they show
+the normal slicing behaviour of objectify elements:
+
+.. sourcecode:: pycon
+
+ >>> root = objectify.fromstring("<root><a>test</a><b>toast</b></root>")
+ >>> print(root.a + ' me') # behaves like a string, right?
+ test me
+ >>> len(root.a) # but there's only one 'a' element!
+ 1
+ >>> [ a.tag for a in root.a ]
+ ['a']
+ >>> print(root.a[0].tag)
+ a
+
+ >>> print(root.a)
+ test
+ >>> [ str(a) for a in root.a[:1] ]
+ ['test']
+
+If you need to run sequence operations on data types, you must ask the API for
+the *real* Python value. The string value is always available through the
+normal ElementTree ``.text`` attribute. Additionally, all data classes
+provide a ``.pyval`` attribute that returns the value as plain Python type:
+
+.. sourcecode:: pycon
+
+ >>> root = objectify.fromstring("<root><a>test</a><b>5</b></root>")
+ >>> root.a.text
+ 'test'
+ >>> root.a.pyval
+ 'test'
+
+ >>> root.b.text
+ '5'
+ >>> root.b.pyval
+ 5
+
+Note, however, that both attributes are read-only in objectify. If you want
+to change values, just assign them directly to the attribute:
+
+.. sourcecode:: pycon
+
+ >>> root.a.text = "25"
+ Traceback (most recent call last):
+ ...
+ TypeError: attribute 'text' of 'StringElement' objects is not writable
+
+ >>> root.a.pyval = 25
+ Traceback (most recent call last):
+ ...
+ TypeError: attribute 'pyval' of 'StringElement' objects is not writable
+
+ >>> root.a = 25
+ >>> print(root.a)
+ 25
+ >>> print(root.a.pyval)
+ 25
+
+In other words, ``objectify`` data elements behave like immutable Python
+types. You can replace them, but not modify them.
+
+
+Recursive tree dump
+-------------------
+
+To see the data types that are currently used, you can call the module level
+``dump()`` function that returns a recursive string representation for
+elements:
+
+.. sourcecode:: pycon
+
+ >>> root = objectify.fromstring("""
+ ... <root xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+ ... <a attr1="foo" attr2="bar">1</a>
+ ... <a>1.2</a>
+ ... <b>1</b>
+ ... <b>true</b>
+ ... <c>what?</c>
+ ... <d xsi:nil="true"/>
+ ... </root>
+ ... """)
+
+ >>> print(objectify.dump(root))
+ root = None [ObjectifiedElement]
+ a = 1 [IntElement]
+ * attr1 = 'foo'
+ * attr2 = 'bar'
+ a = 1.2 [FloatElement]
+ b = 1 [IntElement]
+ b = True [BoolElement]
+ c = 'what?' [StringElement]
+ d = None [NoneElement]
+ * xsi:nil = 'true'
+
+You can freely switch between different types for the same child:
+
+.. sourcecode:: pycon
+
+ >>> root = objectify.fromstring("<root><a>5</a></root>")
+ >>> print(objectify.dump(root))
+ root = None [ObjectifiedElement]
+ a = 5 [IntElement]
+
+ >>> root.a = 'nice string!'
+ >>> print(objectify.dump(root))
+ root = None [ObjectifiedElement]
+ a = 'nice string!' [StringElement]
+ * py:pytype = 'str'
+
+ >>> root.a = True
+ >>> print(objectify.dump(root))
+ root = None [ObjectifiedElement]
+ a = True [BoolElement]
+ * py:pytype = 'bool'
+
+ >>> root.a = [1, 2, 3]
+ >>> print(objectify.dump(root))
+ root = None [ObjectifiedElement]
+ a = 1 [IntElement]
+ * py:pytype = 'int'
+ a = 2 [IntElement]
+ * py:pytype = 'int'
+ a = 3 [IntElement]
+ * py:pytype = 'int'
+
+ >>> root.a = (1, 2, 3)
+ >>> print(objectify.dump(root))
+ root = None [ObjectifiedElement]
+ a = 1 [IntElement]
+ * py:pytype = 'int'
+ a = 2 [IntElement]
+ * py:pytype = 'int'
+ a = 3 [IntElement]
+ * py:pytype = 'int'
+
+
+Recursive string representation of elements
+-------------------------------------------
+
+Normally, elements use the standard string representation for str() that is
+provided by lxml.etree. You can enable a pretty-print representation for
+objectify elements like this:
+
+.. sourcecode:: pycon
+
+ >>> objectify.enable_recursive_str()
+
+ >>> root = objectify.fromstring("""
+ ... <root xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+ ... <a attr1="foo" attr2="bar">1</a>
+ ... <a>1.2</a>
+ ... <b>1</b>
+ ... <b>true</b>
+ ... <c>what?</c>
+ ... <d xsi:nil="true"/>
+ ... </root>
+ ... """)
+
+ >>> print(str(root))
+ root = None [ObjectifiedElement]
+ a = 1 [IntElement]
+ * attr1 = 'foo'
+ * attr2 = 'bar'
+ a = 1.2 [FloatElement]
+ b = 1 [IntElement]
+ b = True [BoolElement]
+ c = 'what?' [StringElement]
+ d = None [NoneElement]
+ * xsi:nil = 'true'
+
+This behaviour can be switched off in the same way:
+
+.. sourcecode:: pycon
+
+ >>> objectify.enable_recursive_str(False)
+
+
+How data types are matched
+==========================
+
+Objectify uses two different types of Elements. Structural Elements (or tree
+Elements) represent the object tree structure. Data Elements represent the
+data containers at the leafs. You can explicitly create tree Elements with
+the ``objectify.Element()`` factory and data Elements with the
+``objectify.DataElement()`` factory.
+
+When Element objects are created, lxml.objectify must determine which
+implementation class to use for them. This is relatively easy for tree
+Elements and less so for data Elements. The algorithm is as follows:
+
+1. If an element has children, use the default tree class.
+
+2. If an element is defined as xsi:nil, use the NoneElement class.
+
+3. If a "Python type hint" attribute is given, use this to determine the element
+ class, see below.
+
+4. If an XML Schema xsi:type hint is given, use this to determine the element
+ class, see below.
+
+5. Try to determine the element class from the text content type by trial and
+ error.
+
+6. If the element is a root node then use the default tree class.
+
+7. Otherwise, use the default class for empty data classes.
+
+You can change the default classes for tree Elements and empty data Elements
+at setup time. The ``ObjectifyElementClassLookup()`` call accepts two keyword
+arguments, ``tree_class`` and ``empty_data_class``, that determine the Element
+classes used in these cases. By default, ``tree_class`` is a class called
+``ObjectifiedElement`` and ``empty_data_class`` is a ``StringElement``.
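+
+For example, such a lookup can be created and attached to a parser as follows
+(a minimal sketch that simply restates the built-in defaults):
+
+.. sourcecode:: pycon
+
+    >>> lookup = objectify.ObjectifyElementClassLookup(
+    ...     tree_class=objectify.ObjectifiedElement,
+    ...     empty_data_class=objectify.StringElement)
+    >>> parser = etree.XMLParser(remove_blank_text=True)
+    >>> parser.set_element_class_lookup(lookup)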
+
+
+Type annotations
+----------------
+
+The "type hint" mechanism deploys an XML attribute defined as
+``lxml.objectify.PYTYPE_ATTRIBUTE``. It may contain any of the following
+string values: int, long, float, str, unicode, NoneType:
+
+.. sourcecode:: pycon
+
+ >>> print(objectify.PYTYPE_ATTRIBUTE)
+ {http://codespeak.net/lxml/objectify/pytype}pytype
+ >>> ns, name = objectify.PYTYPE_ATTRIBUTE[1:].split('}')
+
+ >>> root = objectify.fromstring("""\
+ ... <root xmlns:py='%s'>
+ ... <a py:pytype='str'>5</a>
+ ... <b py:pytype='int'>5</b>
+ ... <c py:pytype='NoneType' />
+ ... </root>
+ ... """ % ns)
+
+ >>> print(root.a + 10)
+ 510
+ >>> print(root.b + 10)
+ 15
+ >>> print(root.c)
+ None
+
+Note that you can change the name and namespace used for this
+attribute through the ``set_pytype_attribute_tag(tag)`` module
+function, in case your application ever needs to. There is also a
+utility function ``annotate()`` that recursively generates this
+attribute for the elements of a tree:
+
+.. sourcecode:: pycon
+
+ >>> root = objectify.fromstring("<root><a>test</a><b>5</b></root>")
+ >>> print(objectify.dump(root))
+ root = None [ObjectifiedElement]
+ a = 'test' [StringElement]
+ b = 5 [IntElement]
+
+ >>> objectify.annotate(root)
+
+ >>> print(objectify.dump(root))
+ root = None [ObjectifiedElement]
+ a = 'test' [StringElement]
+ * py:pytype = 'str'
+ b = 5 [IntElement]
+ * py:pytype = 'int'
+
+
+XML Schema datatype annotation
+------------------------------
+
+A second way of specifying data type information uses XML Schema types as
+element annotations. Objectify knows those that can be mapped to normal
+Python types:
+
+.. sourcecode:: pycon
+
+ >>> root = objectify.fromstring('''\
+ ... <root xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ ... xmlns:xsd="http://www.w3.org/2001/XMLSchema">
+ ... <d xsi:type="xsd:double">5</d>
+ ... <i xsi:type="xsd:int" >5</i>
+ ... <s xsi:type="xsd:string">5</s>
+ ... </root>
+ ... ''')
+ >>> print(objectify.dump(root))
+ root = None [ObjectifiedElement]
+ d = 5.0 [FloatElement]
+ * xsi:type = 'xsd:double'
+ i = 5 [IntElement]
+ * xsi:type = 'xsd:int'
+ s = '5' [StringElement]
+ * xsi:type = 'xsd:string'
+
+Again, there is a utility function ``xsiannotate()`` that recursively
+generates the "xsi:type" attribute for the elements of a tree:
+
+.. sourcecode:: pycon
+
+ >>> root = objectify.fromstring('''\
+ ... <root><a>test</a><b>5</b><c>true</c></root>
+ ... ''')
+ >>> print(objectify.dump(root))
+ root = None [ObjectifiedElement]
+ a = 'test' [StringElement]
+ b = 5 [IntElement]
+ c = True [BoolElement]
+
+ >>> objectify.xsiannotate(root)
+
+ >>> print(objectify.dump(root))
+ root = None [ObjectifiedElement]
+ a = 'test' [StringElement]
+ * xsi:type = 'xsd:string'
+ b = 5 [IntElement]
+ * xsi:type = 'xsd:integer'
+ c = True [BoolElement]
+ * xsi:type = 'xsd:boolean'
+
+Note, however, that ``xsiannotate()`` will always use the first XML Schema
+datatype that is defined for any given Python type, see also
+`Defining additional data classes`_.
+
+The utility function ``deannotate()`` can be used to get rid of 'py:pytype'
+and/or 'xsi:type' information:
+
+.. sourcecode:: pycon
+
+ >>> root = objectify.fromstring('''\
+ ... <root xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ ... xmlns:xsd="http://www.w3.org/2001/XMLSchema">
+ ... <d xsi:type="xsd:double">5</d>
+ ... <i xsi:type="xsd:int" >5</i>
+ ... <s xsi:type="xsd:string">5</s>
+ ... </root>''')
+ >>> objectify.annotate(root)
+ >>> print(objectify.dump(root))
+ root = None [ObjectifiedElement]
+ d = 5.0 [FloatElement]
+ * py:pytype = 'float'
+ * xsi:type = 'xsd:double'
+ i = 5 [IntElement]
+ * py:pytype = 'int'
+ * xsi:type = 'xsd:int'
+ s = '5' [StringElement]
+ * py:pytype = 'str'
+ * xsi:type = 'xsd:string'
+ >>> objectify.deannotate(root)
+ >>> print(objectify.dump(root))
+ root = None [ObjectifiedElement]
+ d = 5 [IntElement]
+ i = 5 [IntElement]
+ s = 5 [IntElement]
+
+You can control which type attributes should be de-annotated with the keyword
+arguments 'pytype' (default: True) and 'xsi' (default: True).
+``deannotate()`` can also remove 'xsi:nil' attributes by setting 'xsi_nil=True'
+(default: False):
+
+.. sourcecode:: pycon
+
+ >>> root = objectify.fromstring('''\
+ ... <root xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ ... xmlns:xsd="http://www.w3.org/2001/XMLSchema">
+ ... <d xsi:type="xsd:double">5</d>
+ ... <i xsi:type="xsd:int" >5</i>
+ ... <s xsi:type="xsd:string">5</s>
+ ... <n xsi:nil="true"/>
+ ... </root>''')
+ >>> objectify.annotate(root)
+ >>> print(objectify.dump(root))
+ root = None [ObjectifiedElement]
+ d = 5.0 [FloatElement]
+ * py:pytype = 'float'
+ * xsi:type = 'xsd:double'
+ i = 5 [IntElement]
+ * py:pytype = 'int'
+ * xsi:type = 'xsd:int'
+ s = '5' [StringElement]
+ * py:pytype = 'str'
+ * xsi:type = 'xsd:string'
+ n = None [NoneElement]
+ * py:pytype = 'NoneType'
+ * xsi:nil = 'true'
+ >>> objectify.deannotate(root, xsi_nil=True)
+ >>> print(objectify.dump(root))
+ root = None [ObjectifiedElement]
+ d = 5 [IntElement]
+ i = 5 [IntElement]
+ s = 5 [IntElement]
+ n = u'' [StringElement]
+
+Note that ``deannotate()`` does not remove the namespace declarations
+of the ``pytype`` namespace by default. To remove them as well, and
+to generally clean up the namespace declarations in the document
+(usually when done with the whole processing), pass the option
+``cleanup_namespaces=True``. This option is new in lxml 2.3.2. In
+older versions, use the function ``lxml.etree.cleanup_namespaces()``
+instead.
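+
+A quick illustration, reusing the tree from the previous example:
+
+.. sourcecode:: pycon
+
+    >>> objectify.deannotate(root, cleanup_namespaces=True)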
+
+
+The DataElement factory
+-----------------------
+
+For convenience, the ``DataElement()`` factory creates an Element with a
+Python value in one step. You can pass the required Python type name or the
+XSI type name:
+
+.. sourcecode:: pycon
+
+ >>> root = objectify.Element("root")
+ >>> root.x = objectify.DataElement(5, _pytype="int")
+ >>> print(objectify.dump(root))
+ root = None [ObjectifiedElement]
+ x = 5 [IntElement]
+ * py:pytype = 'int'
+
+ >>> root.x = objectify.DataElement(5, _pytype="str", myattr="someval")
+ >>> print(objectify.dump(root))
+ root = None [ObjectifiedElement]
+ x = '5' [StringElement]
+ * myattr = 'someval'
+ * py:pytype = 'str'
+
+ >>> root.x = objectify.DataElement(5, _xsi="integer")
+ >>> print(objectify.dump(root))
+ root = None [ObjectifiedElement]
+ x = 5 [IntElement]
+ * py:pytype = 'int'
+ * xsi:type = 'xsd:integer'
+
+XML Schema types reside in the XML Schema namespace, so ``DataElement()``
+tries to correctly prefix the xsi:type attribute value for you:
+
+.. sourcecode:: pycon
+
+ >>> root = objectify.Element("root")
+ >>> root.s = objectify.DataElement(5, _xsi="string")
+
+ >>> objectify.deannotate(root, xsi=False)
+ >>> print(etree.tostring(root, pretty_print=True))
+ <root xmlns:py="http://codespeak.net/lxml/objectify/pytype" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+ <s xsi:type="xsd:string">5</s>
+ </root>
+
+``DataElement()`` uses a default nsmap to set these prefixes:
+
+.. sourcecode:: pycon
+
+ >>> el = objectify.DataElement('5', _xsi='string')
+ >>> namespaces = list(el.nsmap.items())
+ >>> namespaces.sort()
+ >>> for prefix, namespace in namespaces:
+ ... print("%s - %s" % (prefix, namespace))
+ py - http://codespeak.net/lxml/objectify/pytype
+ xsd - http://www.w3.org/2001/XMLSchema
+ xsi - http://www.w3.org/2001/XMLSchema-instance
+
+ >>> print(el.get("{http://www.w3.org/2001/XMLSchema-instance}type"))
+ xsd:string
+
+You can set custom namespace prefixes, but if you do, you must provide
+valid namespace information for them:
+
+.. sourcecode:: pycon
+
+ >>> el = objectify.DataElement('5', _xsi='foo:string',
+ ... nsmap={'foo': 'http://www.w3.org/2001/XMLSchema'})
+ >>> namespaces = list(el.nsmap.items())
+ >>> namespaces.sort()
+ >>> for prefix, namespace in namespaces:
+ ... print("%s - %s" % (prefix, namespace))
+ foo - http://www.w3.org/2001/XMLSchema
+ py - http://codespeak.net/lxml/objectify/pytype
+ xsi - http://www.w3.org/2001/XMLSchema-instance
+
+ >>> print(el.get("{http://www.w3.org/2001/XMLSchema-instance}type"))
+ foo:string
+
+Note how lxml chose a default prefix for the XML Schema Instance
+namespace. We can override it as in the following example:
+
+.. sourcecode:: pycon
+
+ >>> el = objectify.DataElement('5', _xsi='foo:string',
+ ... nsmap={'foo': 'http://www.w3.org/2001/XMLSchema',
+ ... 'myxsi': 'http://www.w3.org/2001/XMLSchema-instance'})
+ >>> namespaces = list(el.nsmap.items())
+ >>> namespaces.sort()
+ >>> for prefix, namespace in namespaces:
+ ... print("%s - %s" % (prefix, namespace))
+ foo - http://www.w3.org/2001/XMLSchema
+ myxsi - http://www.w3.org/2001/XMLSchema-instance
+ py - http://codespeak.net/lxml/objectify/pytype
+
+ >>> print(el.get("{http://www.w3.org/2001/XMLSchema-instance}type"))
+ foo:string
+
+Care must be taken if different namespace prefixes have been used for the same
+namespace. Namespace information gets merged to avoid duplicate definitions
+when adding a new sub-element to a tree, but this mechanism does not adapt the
+prefixes of attribute values:
+
+.. sourcecode:: pycon
+
+ >>> root = objectify.fromstring("""<root xmlns:schema="http://www.w3.org/2001/XMLSchema"/>""")
+ >>> print(etree.tostring(root, pretty_print=True))
+ <root xmlns:schema="http://www.w3.org/2001/XMLSchema"/>
+
+ >>> s = objectify.DataElement("17", _xsi="string")
+ >>> print(etree.tostring(s, pretty_print=True))
+ <value xmlns:py="http://codespeak.net/lxml/objectify/pytype" xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" py:pytype="str" xsi:type="xsd:string">17</value>
+
+ >>> root.s = s
+ >>> print(etree.tostring(root, pretty_print=True))
+ <root xmlns:schema="http://www.w3.org/2001/XMLSchema">
+ <s xmlns:py="http://codespeak.net/lxml/objectify/pytype" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" py:pytype="str" xsi:type="xsd:string">17</s>
+ </root>
+
+It is your responsibility to fix the prefixes of attribute values if you
+choose to deviate from the standard prefixes. A convenient way to do this for
+xsi:type attributes is to use the ``xsiannotate()`` utility:
+
+.. sourcecode:: pycon
+
+ >>> objectify.xsiannotate(root)
+ >>> print(etree.tostring(root, pretty_print=True))
+ <root xmlns:schema="http://www.w3.org/2001/XMLSchema">
+ <s xmlns:py="http://codespeak.net/lxml/objectify/pytype" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" py:pytype="str" xsi:type="schema:string">17</s>
+ </root>
+
+Of course, using different prefixes for one and the same namespace when
+building up an objectify tree is discouraged.
+
+
+Defining additional data classes
+--------------------------------
+
+You can plug additional data classes into objectify that will be used in
+exactly the same way as the predefined types. Data classes can either inherit
+from ``ObjectifiedDataElement`` directly or from one of the specialised
+classes like ``NumberElement`` or ``BoolElement``. The numeric types require
+an initial call to the NumberElement method ``self._setValueParser(function)``
+to set their type conversion function (string -> numeric Python type). This
+call should be placed into the element's ``_init()`` method.
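+
+For instance, a minimal sketch of a numeric data class (a hypothetical
+hexadecimal integer type, meant only to illustrate the ``_init()`` /
+``_setValueParser()`` pattern described above):
+
+.. sourcecode:: pycon
+
+ >>> class HexIntElement(objectify.NumberElement):
+ ...     def _init(self):
+ ...         # convert the element text from a hex string to a Python int
+ ...         self._setValueParser(lambda value: int(value, 16))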
+
+The registration of data classes uses the ``PyType`` class:
+
+.. sourcecode:: pycon
+
+ >>> class ChristmasDate(objectify.ObjectifiedDataElement):
+ ... def call_santa(self):
+ ... print("Ho ho ho!")
+
+ >>> def checkChristmasDate(date_string):
+ ... if not date_string.startswith('24.12.'):
+ ... raise ValueError # or TypeError
+
+ >>> xmas_type = objectify.PyType('date', checkChristmasDate, ChristmasDate)
+
+The PyType constructor takes a string type name, an (optional) callable type
+check and the custom data class. If a type check is provided it must accept a
+string as argument and raise ValueError or TypeError if it cannot handle the
+string value.
+
+PyTypes are used if an element carries a ``py:pytype`` attribute denoting its
+data type or, in the absence of such an attribute, if the given type check callable
+does not raise a ValueError/TypeError exception when applied to the element
+text.
+
+If you want, you can also register this type under an XML Schema type name:
+
+.. sourcecode:: pycon
+
+ >>> xmas_type.xmlSchemaTypes = ("date",)
+
+XML Schema types will be considered if the element has an ``xsi:type``
+attribute that specifies its data type. The line above binds the XSD type
+``date`` to the newly defined Python type. Note that this must be done
+before registering the type, which is the next step. After that, you can
+use it:
+
+.. sourcecode:: pycon
+
+ >>> xmas_type.register()
+
+ >>> root = objectify.fromstring(
+ ... "<root><a>24.12.2000</a><b>12.24.2000</b></root>")
+ >>> root.a.call_santa()
+ Ho ho ho!
+ >>> root.b.call_santa()
+ Traceback (most recent call last):
+ ...
+ AttributeError: no such child: call_santa
+
+If you need to specify dependencies between the type check functions, you can
+pass a sequence of type names through the ``before`` and ``after`` keyword
+arguments of the ``register()`` method. The PyType will then try to register
+itself before or after the respective types, as long as they are currently
+registered. Note that this only impacts the currently registered types at the
+time of registration. Types that are registered later on will not care about
+the dependencies of already registered types.
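+
+For example, to make sure the Christmas date check runs before the built-in
+'str' check, the type could be re-registered as follows (a sketch that first
+unregisters the type so that it is not registered twice):
+
+.. sourcecode:: pycon
+
+ >>> xmas_type.unregister()
+ >>> xmas_type.register(before=['str'])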
+
+If you provide XML Schema type information, this will override the type check
+function defined above:
+
+.. sourcecode:: pycon
+
+ >>> root = objectify.fromstring('''\
+ ... <root xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+ ... <a xsi:type="date">12.24.2000</a>
+ ... </root>
+ ... ''')
+ >>> print(root.a)
+ 12.24.2000
+ >>> root.a.call_santa()
+ Ho ho ho!
+
+To unregister a type, call its ``unregister()`` method:
+
+.. sourcecode:: pycon
+
+ >>> root.a.call_santa()
+ Ho ho ho!
+ >>> xmas_type.unregister()
+ >>> root.a.call_santa()
+ Traceback (most recent call last):
+ ...
+ AttributeError: no such child: call_santa
+
+Be aware, though, that this does not immediately apply to elements that
+already have a Python reference. Their Python class will only be changed
+after all references are gone and the Python object is garbage collected.
+
+
+Advanced element class lookup
+-----------------------------
+
+In some cases, the normal data class setup is not enough. Being based
+on ``lxml.etree``, however, ``lxml.objectify`` supports very
+fine-grained control over the Element classes used in a tree. All you
+have to do is configure a different `class lookup`_ mechanism (or
+write one yourself).
+
+.. _`class lookup`: element_classes.html
+
+The first step for the setup is to create a new parser that builds
+objectify documents. The objectify API is meant for data-centric XML
+(as opposed to document XML with mixed content). Therefore, we
+configure the parser to let it remove whitespace-only text from the
+parsed document if it is not enclosed by an XML element. Note that
+this alters the document infoset, so if you consider the removed
+spaces as data in your specific use case, you should go with a normal
+parser and just set the element class lookup. Most applications,
+however, will work fine with the following setup:
+
+.. sourcecode:: pycon
+
+ >>> parser = objectify.makeparser(remove_blank_text=True)
+
+Internally, this is equivalent to:
+
+.. sourcecode:: pycon
+
+ >>> parser = etree.XMLParser(remove_blank_text=True)
+
+ >>> lookup = objectify.ObjectifyElementClassLookup()
+ >>> parser.set_element_class_lookup(lookup)
+
+If you want to change the lookup scheme, say, to get additional
+support for `namespace specific classes`_, you can register the
+objectify lookup as a fallback of the namespace lookup. In this case,
+however, you have to take care that the namespace classes inherit from
+``objectify.ObjectifiedElement``, not only from the normal
+``lxml.etree.ElementBase``, so that they support the ``objectify``
+API. The above setup code then becomes:
+
+.. sourcecode:: pycon
+
+ >>> lookup = etree.ElementNamespaceClassLookup(
+ ... objectify.ObjectifyElementClassLookup() )
+ >>> parser.set_element_class_lookup(lookup)
+
+.. _`namespace specific classes`: element_classes.html#namespace-class-lookup
+
+See the documentation on `class lookup`_ schemes for more information.
+
+
+What is different from lxml.etree?
+==================================
+
+Such a different Element API naturally has some side effects on the normal
+behaviour of the rest of the API.
+
+* len(<element>) returns the sibling count, not the number of children of
+ <element>. You can retrieve the number of children with the
+ ``countchildren()`` method (see the example below).
+
+* Iteration over elements does not yield the children, but the siblings. You
+ can access all children with the ``iterchildren()`` method on elements or
+ retrieve a list by calling the ``getchildren()`` method.
+
+* The find, findall and findtext methods require a different implementation
+ based on ETXPath. In ``lxml.etree``, they use a Python implementation based
+ on the original iteration scheme. This has the disadvantage that they may
+ not be 100% backwards compatible, and the additional advantage that they now
+ support any XPath expression.
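+
+For example, a small illustration of the first point:
+
+.. sourcecode:: pycon
+
+ >>> root = objectify.fromstring("<root><a>1</a><a>2</a><b>3</b></root>")
+ >>> len(root)              # the root element has no siblings with its tag
+ 1
+ >>> root.countchildren()   # ... but it has three children
+ 3
+ >>> len(root.a)            # two 'a' siblings
+ 2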
diff --git a/doc/parsing.txt b/doc/parsing.txt
new file mode 100644
index 0000000..a271dc0
--- /dev/null
+++ b/doc/parsing.txt
@@ -0,0 +1,1062 @@
+==============================
+Parsing XML and HTML with lxml
+==============================
+
+lxml provides a very simple and powerful API for parsing XML and HTML. It
+supports one-step parsing as well as step-by-step parsing using an
+event-driven API (currently only for XML).
+
+.. contents::
+..
+ 1 Parsers
+ 1.1 Parser options
+ 1.2 Error log
+ 1.3 Parsing HTML
+ 1.4 Doctype information
+ 2 The target parser interface
+ 3 The feed parser interface
+ 4 Incremental event parsing
+ 4.1 Event types
+ 4.2 Modifying the tree
+ 4.3 Selective tag events
+ 4.4 Comments and PIs
+ 4.5 Events with custom targets
+ 5 iterparse and iterwalk
+ 5.1 iterwalk
+ 6 Python unicode strings
+ 6.1 Serialising to Unicode strings
+
+
+The usual setup procedure:
+
+.. sourcecode:: pycon
+
+ >>> from lxml import etree
+
+The following examples also use StringIO or BytesIO to show how to parse
+from files and file-like objects. Both are available in the ``io`` module:
+
+.. sourcecode:: python
+
+ from io import StringIO, BytesIO
+
+..
+ >>> from lxml import usedoctest
+
+ >>> try: from StringIO import StringIO
+ ... except ImportError:
+ ... from io import BytesIO
+ ... def StringIO(s):
+ ... if isinstance(s, str): s = s.encode("UTF-8")
+ ... return BytesIO(s)
+
+ >>> try: unicode = unicode
+ ... except NameError: unicode = str
+
+ >>> import sys
+ >>> from lxml import etree as _etree
+ >>> if sys.version_info[0] >= 3:
+ ... class etree_mock(object):
+ ... def __getattr__(self, name): return getattr(_etree, name)
+ ... def tostring(self, *args, **kwargs):
+ ... s = _etree.tostring(*args, **kwargs)
+ ... if isinstance(s, bytes) and bytes([10]) in s: s = s.decode("utf-8") # LF
+ ... if s[-1] == '\n': s = s[:-1]
+ ... return s
+ ... else:
+ ... class etree_mock(object):
+ ... def __getattr__(self, name): return getattr(_etree, name)
+ ... def tostring(self, *args, **kwargs):
+ ... s = _etree.tostring(*args, **kwargs)
+ ... if s[-1] == '\n': s = s[:-1]
+ ... return s
+ >>> etree = etree_mock()
+
+
+Parsers
+=======
+
+Parsers are represented by parser objects. There is support for parsing both
+XML and (broken) HTML. Note that XHTML is best parsed as XML; parsing it with
+the HTML parser can lead to unexpected results. Here is a simple example for
+parsing XML from an in-memory string:
+
+.. sourcecode:: pycon
+
+ >>> xml = '<a xmlns="test"><b xmlns="test"/></a>'
+
+ >>> root = etree.fromstring(xml)
+ >>> etree.tostring(root)
+ b'<a xmlns="test"><b xmlns="test"/></a>'
+
+To read from a file or file-like object, you can use the ``parse()`` function,
+which returns an ``ElementTree`` object:
+
+.. sourcecode:: pycon
+
+ >>> tree = etree.parse(StringIO(xml))
+ >>> etree.tostring(tree.getroot())
+ b'<a xmlns="test"><b xmlns="test"/></a>'
+
+Note how the ``parse()`` function reads from a file-like object here. If
+parsing is done from a real file, it is more common (and also somewhat more
+efficient) to pass a filename:
+
+.. sourcecode:: pycon
+
+ >>> tree = etree.parse("doc/test.xml")
+
+lxml can parse from a local file, an HTTP URL or an FTP URL. It also
+auto-detects and reads gzip-compressed XML files (.gz).
+
+If you want to parse from memory and still provide a base URL for the document
+(e.g. to support relative paths in an XInclude), you can pass the ``base_url``
+keyword argument:
+
+.. sourcecode:: pycon
+
+ >>> root = etree.fromstring(xml, base_url="http://where.it/is/from.xml")
+
+
+Parser options
+--------------
+
+The parsers accept a number of setup options as keyword arguments. The above
+example is easily extended to clean up namespaces during parsing:
+
+.. sourcecode:: pycon
+
+ >>> parser = etree.XMLParser(ns_clean=True)
+ >>> tree = etree.parse(StringIO(xml), parser)
+ >>> etree.tostring(tree.getroot())
+ b'<a xmlns="test"><b/></a>'
+
+The keyword arguments in the constructor are mainly based on the libxml2
+parser configuration. A DTD will also be loaded if validation or attribute
+default values are requested.
+
+Available boolean keyword arguments:
+
+* attribute_defaults - read the DTD (if referenced by the document) and add
+ the default attributes from it
+
+* dtd_validation - validate while parsing (if a DTD was referenced)
+
+* load_dtd - load and parse the DTD while parsing (no validation is performed)
+
+* no_network - prevent network access when looking up external
+ documents (on by default)
+
+* ns_clean - try to clean up redundant namespace declarations
+
+* recover - try hard to parse through broken XML
+
+* remove_blank_text - discard blank text nodes between tags, also known as
+ ignorable whitespace. This is best used together with a DTD or schema
+ (which tells data and noise apart); otherwise, a heuristic will be applied.
+
+* remove_comments - discard comments
+
+* remove_pis - discard processing instructions
+
+* strip_cdata - replace CDATA sections by normal text content (on by
+ default)
+
+* resolve_entities - replace entities by their text value (on by
+ default)
+
+* huge_tree - disable security restrictions and support very deep trees
+ and very long text content (only affects libxml2 2.7+)
+
+* compact - use compact storage for short text content (on by default)
+
+* collect_ids - collect XML IDs in a hash table while parsing (on by default).
+ Disabling this can substantially speed up parsing of documents with many
+ different IDs if the hash lookup is not used afterwards.
+
+Other keyword arguments:
+
+* encoding - override the document encoding
+
+* target - a parser target object that will receive the parse events
+ (see `The target parser interface`_)
+
+* schema - an XMLSchema to validate against (see `validation <validation.html#xmlschema>`_)
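+
+For example, a parser that discards comments and processing instructions
+while parsing (a small sketch combining two of the boolean options listed
+above):
+
+.. sourcecode:: pycon
+
+ >>> parser = etree.XMLParser(remove_comments=True, remove_pis=True)
+ >>> root = etree.fromstring(
+ ...     "<a><!-- comment --><?pi data?><b/></a>", parser)
+ >>> etree.tostring(root)
+ b'<a><b/></a>'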
+
+
+Error log
+---------
+
+Parsers have an ``error_log`` property that lists the errors and
+warnings of the last parser run:
+
+.. sourcecode:: pycon
+
+ >>> parser = etree.XMLParser()
+ >>> print(len(parser.error_log))
+ 0
+
+ >>> tree = etree.XML("<root>\n</b>", parser) # doctest: +ELLIPSIS
+ Traceback (most recent call last):
+ ...
+ lxml.etree.XMLSyntaxError: Opening and ending tag mismatch: root line 1 and b, line 2, column 5...
+
+ >>> print(len(parser.error_log))
+ 1
+
+ >>> error = parser.error_log[0]
+ >>> print(error.message)
+ Opening and ending tag mismatch: root line 1 and b
+ >>> print(error.line)
+ 2
+ >>> print(error.column)
+ 5
+
+Each entry in the log has the following properties:
+
+* ``message``: the message text
+* ``domain``: the domain ID (see the lxml.etree.ErrorDomains class)
+* ``type``: the message type ID (see the lxml.etree.ErrorTypes class)
+* ``level``: the log level ID (see the lxml.etree.ErrorLevels class)
+* ``line``: the line at which the message originated (if applicable)
+* ``column``: the character column at which the message originated (if applicable)
+* ``filename``: the name of the file in which the message originated (if applicable)
+
+For convenience, there are also three properties that provide readable
+names for the ID values:
+
+* ``domain_name``
+* ``type_name``
+* ``level_name``
+
+To filter for a specific kind of message, use the different
+``filter_*()`` methods on the error log (see the
+lxml.etree._ListErrorLog class).
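+
+For instance, a sketch that uses ``filter_from_errors()`` (one of the
+``filter_*()`` methods mentioned above) to pick out only the entries of
+level ERROR or worse from the log filled above:
+
+.. sourcecode:: pycon
+
+ >>> errors_only = parser.error_log.filter_from_errors()
+ >>> print(len(errors_only))
+ 1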
+
+
+Parsing HTML
+------------
+
+HTML parsing is similarly simple. The parsers have a ``recover``
+keyword argument that the HTMLParser sets by default. It lets libxml2
+try its best to return a valid HTML tree with all content it can
+manage to parse. It will not raise an exception on parser errors.
+You should use libxml2 version 2.6.21 or newer to take advantage of
+this feature.
+
+.. sourcecode:: pycon
+
+ >>> broken_html = "<html><head><title>test<body><h1>page title</h3>"
+
+ >>> parser = etree.HTMLParser()
+ >>> tree = etree.parse(StringIO(broken_html), parser)
+
+ >>> result = etree.tostring(tree.getroot(),
+ ... pretty_print=True, method="html")
+ >>> print(result)
+ <html>
+ <head>
+ <title>test</title>
+ </head>
+ <body>
+ <h1>page title</h1>
+ </body>
+ </html>
+
+Lxml has an HTML function, similar to the XML shortcut known from
+ElementTree:
+
+.. sourcecode:: pycon
+
+ >>> html = etree.HTML(broken_html)
+ >>> result = etree.tostring(html, pretty_print=True, method="html")
+ >>> print(result)
+ <html>
+ <head>
+ <title>test</title>
+ </head>
+ <body>
+ <h1>page title</h1>
+ </body>
+ </html>
+
+The support for parsing broken HTML depends entirely on libxml2's recovery
+algorithm. It is *not* the fault of lxml if you find documents that are so
+heavily broken that the parser cannot handle them. There is also no guarantee
+that the resulting tree will contain all data from the original document. The
+parser may have to drop seriously broken parts when struggling to keep
+parsing. Misplaced meta tags in particular can suffer from this, which may lead
+to encoding problems.
+
+Note that the result is a valid HTML tree, but it may not be a
+well-formed XML tree. For example, XML forbids double hyphens in
+comments, which the HTML parser will happily accept in recovery mode.
+Therefore, if your goal is to serialise an HTML document as an
+XML/XHTML document after parsing, you may have to apply some manual
+preprocessing first.
+
+Also note that the HTML parser is meant to parse HTML documents. For
+XHTML documents, use the XML parser, which is namespace aware.
+
+
+Doctype information
+-------------------
+
+The use of the libxml2 parsers makes some additional information available at
+the API level. Currently, ElementTree objects can access the DOCTYPE
+information provided by a parsed document, as well as the XML version and the
+original encoding. Since lxml 3.5, the doctype references are mutable.
+
+.. sourcecode:: pycon
+
+ >>> pub_id = "-//W3C//DTD XHTML 1.0 Transitional//EN"
+ >>> sys_url = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"
+ >>> doctype_string = '<!DOCTYPE html PUBLIC "%s" "%s">' % (pub_id, sys_url)
+ >>> xml_header = '<?xml version="1.0" encoding="ascii"?>'
+ >>> xhtml = xml_header + doctype_string + '<html><body></body></html>'
+
+ >>> tree = etree.parse(StringIO(xhtml))
+ >>> docinfo = tree.docinfo
+ >>> print(docinfo.public_id)
+ -//W3C//DTD XHTML 1.0 Transitional//EN
+ >>> print(docinfo.system_url)
+ http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd
+ >>> docinfo.doctype == doctype_string
+ True
+
+ >>> print(docinfo.xml_version)
+ 1.0
+ >>> print(docinfo.encoding)
+ ascii
+
+ >>> docinfo.system_url = None
+ >>> docinfo.public_id = None
+ >>> print(etree.tostring(tree))
+ <!DOCTYPE html>
+ <html><body/></html>
+
+
+The target parser interface
+===========================
+
+.. _`As in ElementTree`: http://effbot.org/elementtree/elementtree-xmlparser.htm
+
+`As in ElementTree`_, and similar to a SAX event handler, you can pass
+a target object to the parser:
+
+.. sourcecode:: pycon
+
+ >>> class EchoTarget(object):
+ ... def start(self, tag, attrib):
+ ... print("start %s %r" % (tag, dict(attrib)))
+ ... def end(self, tag):
+ ... print("end %s" % tag)
+ ... def data(self, data):
+ ... print("data %r" % data)
+ ... def comment(self, text):
+ ... print("comment %s" % text)
+ ... def close(self):
+ ... print("close")
+ ... return "closed!"
+
+ >>> parser = etree.XMLParser(target = EchoTarget())
+
+ >>> result = etree.XML("<element>some<!--comment-->text</element>",
+ ... parser)
+ start element {}
+ data u'some'
+ comment comment
+ data u'text'
+ end element
+ close
+
+ >>> print(result)
+ closed!
+
+It is important for the ``.close()`` method to reset the parser target
+to a usable state, so that you can reuse the parser as often as you
+like:
+
+.. sourcecode:: pycon
+
+ >>> result = etree.XML("<element>some<!--comment-->text</element>",
+ ... parser)
+ start element {}
+ data u'some'
+ comment comment
+ data u'text'
+ end element
+ close
+
+ >>> print(result)
+ closed!
+
+Starting with lxml 2.3, the ``.close()`` method will also be called in
+the error case. This diverges from the behaviour of ElementTree, but
+allows target objects to clean up their state in all situations, so
+that the parser can reuse them afterwards.
+
+.. sourcecode:: pycon
+
+ >>> class CollectorTarget(object):
+ ... def __init__(self):
+ ... self.events = []
+ ... def start(self, tag, attrib):
+ ... self.events.append("start %s %r" % (tag, dict(attrib)))
+ ... def end(self, tag):
+ ... self.events.append("end %s" % tag)
+ ... def data(self, data):
+ ... self.events.append("data %r" % data)
+ ... def comment(self, text):
+ ... self.events.append("comment %s" % text)
+ ... def close(self):
+ ... self.events.append("close")
+ ... return "closed!"
+
+ >>> parser = etree.XMLParser(target = CollectorTarget())
+
+ >>> result = etree.XML("<element>some</error>",
+ ... parser) # doctest: +ELLIPSIS
+ Traceback (most recent call last):
+ ...
+ lxml.etree.XMLSyntaxError: Opening and ending tag mismatch...
+
+ >>> for event in parser.target.events:
+ ... print(event)
+ start element {}
+ data u'some'
+ close
+
+Note that the parser does *not* build a tree when using a parser
+target. The result of the parser run is whatever the target object
+returns from its ``.close()`` method. If you want to return an XML
+tree here, you have to create it programmatically in the target
+object. An example for a parser target that builds a tree is the
+``TreeBuilder``:
+
+.. sourcecode:: pycon
+
+ >>> parser = etree.XMLParser(target = etree.TreeBuilder())
+
+ >>> result = etree.XML("<element>some<!--comment-->text</element>",
+ ... parser)
+
+ >>> print(result.tag)
+ element
+ >>> print(result[0].text)
+ comment
+
+
+The feed parser interface
+=========================
+
+Since lxml 2.0, the parsers have a feed parser interface that is
+compatible to the `ElementTree parsers`_. You can use it to feed data
+into the parser in a controlled step-by-step way.
+
+In lxml.etree, you can use both interfaces to a parser at the same
+time: the ``parse()`` or ``XML()`` functions, and the feed parser
+interface. Both are independent and will not conflict (except if used
+in conjunction with a parser target object as described above).
+
+.. _`ElementTree parsers`: http://effbot.org/elementtree/elementtree-xmlparser.htm
+
+To start parsing with a feed parser, just call its ``feed()`` method
+to feed it some data.
+
+.. sourcecode:: pycon
+
+ >>> parser = etree.XMLParser()
+
+ >>> for data in ('<?xml versio', 'n="1.0"?', '><roo', 't><a', '/></root>'):
+ ... parser.feed(data)
+
+When you are done parsing, you **must** call the ``close()`` method to
+retrieve the root Element of the parse result document, and to unlock the
+parser:
+
+.. sourcecode:: pycon
+
+ >>> root = parser.close()
+
+ >>> print(root.tag)
+ root
+ >>> print(root[0].tag)
+ a
+
+If you do not call ``close()``, the parser will stay locked and
+subsequent feeds will keep appending data, usually resulting in a document
+that is not well-formed and an unexpected parser error. So make sure you
+always close the parser after use, even in the exception case.
+
+Another way of achieving the same step-by-step parsing is by writing your own
+file-like object that returns a chunk of data on each ``read()`` call. Where
+the feed parser interface allows you to actively pass data chunks into the
+parser, a file-like object passively responds to ``read()`` requests of the
+parser itself. Depending on the data source, either way may be more natural.
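+
+For example, a minimal sketch of such a data source (the chunking is
+arbitrary and the ``requested_size`` argument is simply ignored here):
+
+.. sourcecode:: pycon
+
+ >>> class DataSource:
+ ...     data = [ b"<roo", b"t><a/", b"></root>" ]
+ ...     def read(self, requested_size):
+ ...         try:
+ ...             # return the next chunk, or empty bytes when done
+ ...             return self.data.pop(0)
+ ...         except IndexError:
+ ...             return b""
+
+ >>> tree = etree.parse(DataSource())
+ >>> etree.tostring(tree)
+ b'<root><a/></root>'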
+
+Note that the feed parser has its own error log called
+``feed_error_log``. Errors in the feed parser do not show up in the
+normal ``error_log`` and vice versa.
+
+You can also combine the feed parser interface with the target parser:
+
+.. sourcecode:: pycon
+
+ >>> parser = etree.XMLParser(target = EchoTarget())
+
+ >>> parser.feed("<eleme")
+ >>> parser.feed("nt>some text</elem")
+ start element {}
+ data u'some text'
+ >>> parser.feed("ent>")
+ end element
+
+ >>> result = parser.close()
+ close
+ >>> print(result)
+ closed!
+
+Again, this prevents the automatic creation of an XML tree and leaves
+all the event handling to the target object. The ``close()`` method
+of the parser forwards the return value of the target's ``close()``
+method.
+
+
+Incremental event parsing
+=========================
+
+In Python 3.4, the ``xml.etree.ElementTree`` package gained an extension
+to the feed parser interface that is implemented by the ``XMLPullParser``
+class. It additionally allows processing parse events after each
+incremental parsing step, by calling the ``.read_events()`` method and
+iterating over the result. This is most useful for non-blocking execution
+environments where data chunks arrive one after the other and should be
+processed as far as possible in each step.
+
+The same feature is available in lxml 3.3. The basic usage is as follows:
+
+.. sourcecode:: pycon
+
+ >>> parser = etree.XMLPullParser(events=('start', 'end'))
+
+ >>> def print_events(parser):
+ ... for action, element in parser.read_events():
+ ... print('%s: %s' % (action, element.tag))
+
+ >>> parser.feed('<root>some text')
+ >>> print_events(parser)
+ start: root
+ >>> print_events(parser) # well, no more events, as before ...
+
+ >>> parser.feed('<child><a />')
+ >>> print_events(parser)
+ start: child
+ start: a
+ end: a
+
+ >>> parser.feed('</child></roo')
+ >>> print_events(parser)
+ end: child
+ >>> parser.feed('t>')
+ >>> print_events(parser)
+ end: root
+
+Just like the normal feed parser, the ``XMLPullParser`` builds a tree in
+memory (and you should always call the ``.close()`` method when done with
+parsing):
+
+.. sourcecode:: pycon
+
+ >>> root = parser.close()
+ >>> etree.tostring(root)
+ b'<root>some text<child><a/></child></root>'
+
+However, since the parser provides incremental access to that tree,
+you can explicitly delete content that you no longer need once you
+have processed it. Read the section on `Modifying the tree`_ below
+to see what you can do here and what kind of modifications you should
+avoid.
+
+In lxml, it is enough to call the ``.read_events()`` method once as
+the iterator it returns can be reused when new events are available.
+
+Also, as known from other iterators in lxml, you can pass a ``tag``
+argument that selects which parse events are returned by the
+``.read_events()`` iterator.
+
+
+Event types
+-----------
+
+The parse events are tuples ``(event-type, object)``. The event types
+supported by ElementTree and lxml.etree are the strings 'start', 'end',
+'start-ns' and 'end-ns'. The 'start' and 'end' events represent opening
+and closing elements. They are accompanied by the respective Element
+instance. By default, only 'end' events are generated, whereas the
+example above requested the generation of both 'start' and 'end' events.
+
+The 'start-ns' and 'end-ns' events notify about namespace declarations.
+They do not come with Elements. Instead, the value of the 'start-ns'
+event is a tuple ``(prefix, namespaceURI)`` that designates the beginning
+of a prefix-namespace mapping. The corresponding ``end-ns`` event does
+not have a value (None). It is common practice to use a list as namespace
+stack and pop the last entry on the 'end-ns' event.
+
+.. sourcecode:: pycon
+
+ >>> def print_events(events):
+ ... for action, obj in events:
+ ... if action in ('start', 'end'):
+ ... print("%s: %s" % (action, obj.tag))
+ ... elif action == 'start-ns':
+ ... print("%s: %s" % (action, obj))
+ ... else:
+ ... print(action)
+
+ >>> event_types = ("start", "end", "start-ns", "end-ns")
+ >>> parser = etree.XMLPullParser(event_types)
+ >>> events = parser.read_events()
+
+ >>> parser.feed('<root><element>')
+ >>> print_events(events)
+ start: root
+ start: element
+ >>> parser.feed('text</element><element>text</element>')
+ >>> print_events(events)
+ end: element
+ start: element
+ end: element
+ >>> parser.feed('<empty-element xmlns="http://testns/" />')
+ >>> print_events(events)
+ start-ns: ('', 'http://testns/')
+ start: {http://testns/}empty-element
+ end: {http://testns/}empty-element
+ end-ns
+ >>> parser.feed('</root>')
+ >>> print_events(events)
+ end: root
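+
+A namespace prefix stack, as described above, could be maintained like
+this (a minimal sketch):
+
+.. sourcecode:: pycon
+
+ >>> ns_stack = []
+ >>> parser = etree.XMLPullParser(("start-ns", "end-ns"))
+ >>> parser.feed('<root xmlns="http://testns/"><child/></root>')
+ >>> for action, obj in parser.read_events():
+ ...     if action == 'start-ns':
+ ...         ns_stack.append(obj)
+ ...     else: # 'end-ns'
+ ...         ns_stack.pop()
+ >>> print(len(ns_stack))
+ 0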
+
+
+Modifying the tree
+------------------
+
+You can modify the element and its descendants when handling the
+'end' event. To save memory, for example, you can remove subtrees
+that are no longer needed:
+
+.. sourcecode:: pycon
+
+ >>> parser = etree.XMLPullParser()
+ >>> events = parser.read_events()
+
+ >>> parser.feed('<root><element key="value">text</element>')
+ >>> parser.feed('<element><child /></element>')
+ >>> for action, elem in events:
+ ... print('%s: %d' % (elem.tag, len(elem))) # processing
+ ... elem.clear(keep_tail=True) # delete children
+ element: 0
+ child: 0
+ element: 1
+ >>> parser.feed('<empty-element xmlns="http://testns/" /></root>')
+ >>> for action, elem in events:
+ ... print('%s: %d' % (elem.tag, len(elem))) # processing
+ ... elem.clear(keep_tail=True) # delete children
+ {http://testns/}empty-element: 0
+ root: 3
+
+ >>> root = parser.close()
+ >>> etree.tostring(root)
+ b'<root/>'
+
+**WARNING**: During the 'start' event, any content of the element,
+such as the descendants, following siblings or text, is not yet
+available and should not be accessed. Only attributes are guaranteed
+to be set. During the 'end' event, the element and its descendants
+can be freely modified, but its following siblings should not be
+accessed. During either of the two events, you **must not** modify or
+move the ancestors (parents) of the current element. You should also
+avoid moving or discarding the element itself. The golden rule is: do
+not touch anything that will have to be touched again by the parser
+later on.
+
+If you have elements with a long list of children in your XML file and want
+to save more memory during parsing, you can clean up the preceding siblings
+of the current element:
+
+.. sourcecode:: pycon
+
+ >>> for event, element in parser.read_events():
+ ... # ... do something with the element
+ ... element.clear(keep_tail=True) # clean up children
+ ... while element.getprevious() is not None:
+ ... del element.getparent()[0] # clean up preceding siblings
+
+The ``while`` loop deletes multiple siblings in a row. This is only necessary
+if you skipped over some of them using the ``tag`` keyword argument.
+Otherwise, a simple ``if`` should do. The more selective your tag is,
+however, the more thought you will have to put into finding the right way to
+clean up the elements that were skipped. Therefore, it is sometimes easier to
+traverse all elements and do the tag selection by hand in the event handler
+code.
+
+
+Selective tag events
+--------------------
+
+As an extension over ElementTree, lxml.etree accepts a ``tag`` keyword
+argument just like ``element.iter(tag)``. This restricts events to a
+specific tag or namespace:
+
+.. sourcecode:: pycon
+
+ >>> parser = etree.XMLPullParser(tag="element")
+
+ >>> parser.feed('<root><element key="value">text</element>')
+ >>> parser.feed('<element><child /></element>')
+ >>> parser.feed('<empty-element xmlns="http://testns/" /></root>')
+
+ >>> for action, elem in parser.read_events():
+ ... print("%s: %s" % (action, elem.tag))
+ end: element
+ end: element
+
+ >>> event_types = ("start", "end")
+ >>> parser = etree.XMLPullParser(event_types, tag="{http://testns/}*")
+
+ >>> parser.feed('<root><element key="value">text</element>')
+ >>> parser.feed('<element><child /></element>')
+ >>> parser.feed('<empty-element xmlns="http://testns/" /></root>')
+
+ >>> for action, elem in parser.read_events():
+ ... print("%s: %s" % (action, elem.tag))
+ start: {http://testns/}empty-element
+ end: {http://testns/}empty-element
+
+
+Comments and PIs
+----------------
+
+As an extension over ElementTree, the ``XMLPullParser`` in lxml.etree
+also supports the event types 'comment' and 'pi' for the respective
+XML structures.
+
+.. sourcecode:: pycon
+
+ >>> event_types = ("start", "end", "comment", "pi")
+ >>> parser = etree.XMLPullParser(event_types)
+
+ >>> parser.feed('<?some pi ?><!-- a comment --><root>')
+ >>> parser.feed('<element key="value">text</element>')
+ >>> parser.feed('<!-- another comment -->')
+ >>> parser.feed('<element>text</element>tail')
+ >>> parser.feed('<empty-element xmlns="http://testns/" />')
+ >>> parser.feed('</root>')
+
+ >>> for action, elem in parser.read_events():
+ ... if action in ('start', 'end'):
+ ... print("%s: %s" % (action, elem.tag))
+ ... elif action == 'pi':
+ ... print("%s: -%s=%s-" % (action, elem.target, elem.text))
+ ... else: # 'comment'
+ ... print("%s: -%s-" % (action, elem.text))
+ pi: -some=pi -
+ comment: - a comment -
+ start: root
+ start: element
+ end: element
+ comment: - another comment -
+ start: element
+ end: element
+ start: {http://testns/}empty-element
+ end: {http://testns/}empty-element
+ end: root
+
+ >>> root = parser.close()
+ >>> print(root.tag)
+ root
+
+
+Events with custom targets
+--------------------------
+
+You can combine the pull parser with a parser target. In that case,
+it is the target's responsibility to generate event values. Whatever
+it returns from its ``.start()`` and ``.end()`` methods will be returned
+by the pull parser as the second item of the parse events tuple.
+
+.. sourcecode:: pycon
+
+ >>> class Target(object):
+ ... def start(self, tag, attrib):
+ ... print('-> start(%s)' % tag)
+ ... return '>>START: %s<<' % tag
+ ... def end(self, tag):
+ ... print('-> end(%s)' % tag)
+ ... return '>>END: %s<<' % tag
+ ... def close(self):
+ ... print('-> close()')
+ ... return "CLOSED!"
+
+ >>> event_types = ('start', 'end')
+ >>> parser = etree.XMLPullParser(event_types, target=Target())
+
+ >>> parser.feed('<root><child1 /><child2 /></root>')
+ -> start(root)
+ -> start(child1)
+ -> end(child1)
+ -> start(child2)
+ -> end(child2)
+ -> end(root)
+
+ >>> for action, value in parser.read_events():
+ ... print('%s: %s' % (action, value))
+ start: >>START: root<<
+ start: >>START: child1<<
+ end: >>END: child1<<
+ start: >>START: child2<<
+ end: >>END: child2<<
+ end: >>END: root<<
+
+ >>> print(parser.close())
+ -> close()
+ CLOSED!
+
+As you can see, the event values do not even have to be Element objects.
+The target is generally free to decide how it wants to create an XML tree
+or whatever else it wants to make of the parser callbacks. In many cases,
+however, you will want to make your custom target inherit from the
+``TreeBuilder`` class in order to have it build a tree that you can process
+normally. The ``.start()`` and ``.end()`` methods of ``TreeBuilder`` return
+the Element object that was created, so you can override them and modify
+the input or output according to your needs. Here is an example that
+filters attributes before they are being added to the tree:
+
+.. sourcecode:: pycon
+
+ >>> class AttributeFilter(etree.TreeBuilder):
+ ... def start(self, tag, attrib):
+ ... attrib = dict(attrib)
+ ... if 'evil' in attrib:
+ ... del attrib['evil']
+ ... return super(AttributeFilter, self).start(tag, attrib)
+
+ >>> parser = etree.XMLPullParser(target=AttributeFilter())
+ >>> parser.feed('<root><child1 test="123" /><child2 evil="YES" /></root>')
+
+ >>> for action, element in parser.read_events():
+ ... print('%s: %s(%r)' % (action, element.tag, element.attrib))
+ end: child1({'test': '123'})
+ end: child2({})
+ end: root({})
+
+ >>> root = parser.close()
+
+
+iterparse and iterwalk
+======================
+
+As known from ElementTree, the ``iterparse()`` utility function
+returns an iterator that generates parser events for an XML file (or
+file-like object), while building the tree. You can think of it as
+a blocking wrapper around the ``XMLPullParser`` that automatically and
+incrementally reads data from the input file for you and provides a
+single iterator for them:
+
+.. sourcecode:: pycon
+
+ >>> xml = '''
+ ... <root>
+ ... <element key='value'>text</element>
+ ... <element>text</element>tail
+ ... <empty-element xmlns="http://testns/" />
+ ... </root>
+ ... '''
+
+ >>> context = etree.iterparse(StringIO(xml))
+ >>> for action, elem in context:
+ ... print("%s: %s" % (action, elem.tag))
+ end: element
+ end: element
+ end: {http://testns/}empty-element
+ end: root
+
+After parsing, the resulting tree is available through the ``root`` property
+of the iterator:
+
+.. sourcecode:: pycon
+
+ >>> context.root.tag
+ 'root'
+
+The other event types can be activated with the ``events`` keyword argument:
+
+.. sourcecode:: pycon
+
+ >>> events = ("start", "end")
+ >>> context = etree.iterparse(StringIO(xml), events=events)
+ >>> for action, elem in context:
+ ... print("%s: %s" % (action, elem.tag))
+ start: root
+ start: element
+ end: element
+ start: element
+ end: element
+ start: {http://testns/}empty-element
+ end: {http://testns/}empty-element
+ end: root
+
+``iterparse()`` also supports the ``tag`` argument for selective event
+iteration and several other parameters that control the parser setup.
+The ``tag`` argument can be a single tag or a sequence of tags.
+You can also use it to parse HTML input by passing ``html=True``.
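+
+For example, a small sketch that combines HTML parsing with a tag
+selection:
+
+.. sourcecode:: pycon
+
+ >>> some_html = "<html><body><p>one</p><p>two</p></body></html>"
+ >>> for _, el in etree.iterparse(StringIO(some_html), html=True, tag="p"):
+ ...     print(el.text)
+ one
+ two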
+
+
+iterwalk
+--------
+
+For convenience, lxml also provides an ``iterwalk()`` function.
+It behaves exactly like ``iterparse()``, but works on Elements and
+ElementTrees. Here is an example for a tree parsed by ``iterparse()``:
+
+.. sourcecode:: pycon
+
+ >>> f = StringIO(xml)
+ >>> context = etree.iterparse(
+ ... f, events=("start", "end"), tag="element")
+
+ >>> for action, elem in context:
+ ... print("%s: %s" % (action, elem.tag))
+ start: element
+ end: element
+ start: element
+ end: element
+
+ >>> root = context.root
+
+And now we can take the resulting in-memory tree and iterate over it
+using ``iterwalk()`` to get the exact same events without parsing the
+input again:
+
+.. sourcecode:: pycon
+
+ >>> context = etree.iterwalk(
+ ... root, events=("start", "end"), tag="element")
+
+ >>> for action, elem in context:
+ ... print("%s: %s" % (action, elem.tag))
+ start: element
+ end: element
+ start: element
+ end: element
+
+In order to avoid wasting time on uninteresting parts of the tree, the ``iterwalk``
+iterator can be instructed to skip over an entire subtree with its
+``.skip_subtree()`` method.
+
+.. sourcecode:: pycon
+
+ >>> root = etree.XML('''
+ ... <root>
+ ... <a> <b /> </a>
+ ... <c />
+ ... </root>
+ ... ''')
+
+ >>> context = etree.iterwalk(root, events=("start", "end"))
+
+ >>> for action, elem in context:
+ ... print("%s: %s" % (action, elem.tag))
+ ... if action == 'start' and elem.tag == 'a':
+ ... context.skip_subtree() # ignore <b>
+ start: root
+ start: a
+ end: a
+ start: c
+ end: c
+ end: root
+
+Note that ``.skip_subtree()`` only has an effect when handling ``start`` or
+``start-ns`` events.
+
+
+Python unicode strings
+======================
+
+lxml.etree has broader support for Python unicode strings than the ElementTree
+library. First of all, where ElementTree would raise an exception, the
+parsers in lxml.etree can handle unicode strings straight away. This is most
+helpful for XML snippets embedded in source code using the ``XML()``
+function:
+
+.. sourcecode:: pycon
+
+ >>> root = etree.XML( u'<test> \uf8d1 + \uf8d2 </test>' )
+
+This requires, however, that unicode strings do not specify a conflicting
+encoding themselves and thus lie about their real encoding:
+
+.. sourcecode:: pycon
+
+ >>> etree.XML( u'<?xml version="1.0" encoding="ASCII"?>\n' +
+ ... u'<test> \uf8d1 + \uf8d2 </test>' )
+ Traceback (most recent call last):
+ ...
+ ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.
+
+Similarly, you will get errors when you try the same with HTML data in a
+unicode string that specifies a charset in a meta tag of the header. You
+should generally avoid converting XML/HTML data to unicode before passing it
+into the parsers. It is both slower and error prone.
+
+
+Serialising to Unicode strings
+------------------------------
+
+To serialize the result, you would normally use the ``tostring()``
+module function, which serializes to plain ASCII by default or a
+number of other byte encodings if asked for:
+
+.. sourcecode:: pycon
+
+ >>> etree.tostring(root)
+ b'<test> &#63697; + &#63698; </test>'
+
+ >>> etree.tostring(root, encoding='UTF-8', xml_declaration=False)
+ b'<test> \xef\xa3\x91 + \xef\xa3\x92 </test>'
+
+As an extension, lxml.etree recognises the name 'unicode' as an argument
+to the encoding parameter to build a Python unicode representation of a tree:
+
+.. sourcecode:: pycon
+
+ >>> etree.tostring(root, encoding='unicode')
+ u'<test> \uf8d1 + \uf8d2 </test>'
+
+ >>> el = etree.Element("test")
+ >>> etree.tostring(el, encoding='unicode')
+ u'<test/>'
+
+ >>> subel = etree.SubElement(el, "subtest")
+ >>> etree.tostring(el, encoding='unicode')
+ u'<test><subtest/></test>'
+
+ >>> tree = etree.ElementTree(el)
+ >>> etree.tostring(tree, encoding='unicode')
+ u'<test><subtest/></test>'
+
+The result of ``tostring(encoding='unicode')`` can be treated like any
+other Python unicode string and then passed back into the parsers.
+However, if you want to save the result to a file or pass it over the
+network, you should use ``write()`` or ``tostring()`` with a byte
+encoding (typically UTF-8) to serialize the XML. The main reason is
+that unicode strings returned by ``tostring(encoding='unicode')`` are
+not byte streams and they never have an XML declaration to specify
+their encoding. These strings are most likely not parsable by other
+XML libraries.
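+
+For instance, the round trip mentioned above (serialise to a unicode
+string, then parse the result again) looks like this:
+
+.. sourcecode:: pycon
+
+ >>> xml_text = etree.tostring(root, encoding='unicode')
+ >>> root2 = etree.fromstring(xml_text)
+ >>> etree.tostring(root2, encoding='unicode')
+ u'<test> \uf8d1 + \uf8d2 </test>'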
+
+For normal byte encodings, the ``tostring()`` function automatically
+adds a declaration as needed that reflects the encoding of the
+returned string. This makes it possible for other parsers to
+correctly parse the XML byte stream. Note that using ``tostring()``
+with UTF-8 is also considerably faster in most cases.
diff --git a/doc/performance.txt b/doc/performance.txt
new file mode 100644
index 0000000..1a0c9ad
--- /dev/null
+++ b/doc/performance.txt
@@ -0,0 +1,863 @@
+====================
+Benchmarks and Speed
+====================
+
+:Author:
+ Stefan Behnel
+
+.. meta::
+ :description: Performance evaluation of lxml and ElementTree:
+ fast operations, common pitfalls and optimisation hints.
+ :keywords: Python XML parser performance, XML processing, performance comparison,
+ lxml performance, lxml.etree, lxml.objectify, benchmarks, ElementTree
+
+
+lxml.etree is a very fast XML library. Most of this is due to the
+speed of libxml2, e.g. the parser and serialiser, or the XPath engine.
+Other areas of lxml were specifically written for high performance in
+high-level operations, such as the tree iterators.
+
+On the other hand, the simplicity of lxml sometimes hides internal
+operations that are more costly than the API suggests. If you are not
+aware of these cases, lxml may not always perform as you expect. A
+common example in the Python world is the Python list type. New users
+often expect it to be a linked list, while it actually is implemented
+as an array, which results in a completely different complexity for
+common operations.
+
+Similarly, the tree model of libxml2 is more complex than what lxml's
+ElementTree API projects into Python space, so some operations may
+show unexpected performance. Rest assured that most lxml users will
+not notice this in real life, as lxml is very fast in absolute
+numbers. It is definitely fast enough for most applications, so lxml
+is probably somewhere between 'fast enough' and 'the best choice' for
+yours. Read some messages_ from happy_ users_ to see what we mean.
+
+.. _messages: http://permalink.gmane.org/gmane.comp.python.lxml.devel/3250
+.. _happy: http://article.gmane.org/gmane.comp.python.lxml.devel/3246
+.. _users: http://thread.gmane.org/gmane.comp.python.lxml.devel/3244/focus=3244
+
+This text describes where lxml.etree (abbreviated to 'lxe') excels, gives
+hints on some performance traps and compares the overall performance to the
+original ElementTree_ (ET) and cElementTree_ (cET) libraries by Fredrik Lundh.
+The cElementTree library is a fast C-implementation of the original
+ElementTree.
+
+.. _ElementTree: http://effbot.org/zone/element-index.htm
+.. _cElementTree: http://effbot.org/zone/celementtree.htm
+
+.. contents::
+..
+ 1 General notes
+ 2 How to read the timings
+ 3 Parsing and Serialising
+ 4 The ElementTree API
+ 5 Tree traversal
+ 6 XPath
+ 7 lxml.objectify
+
+
+General notes
+=============
+
+First thing to say: there *is* an overhead involved in having a DOM-like C
+library mimic the ElementTree API. As opposed to ElementTree, lxml has to
+generate Python representations of tree nodes on the fly when asked for them,
+and the internal tree structure of libxml2 results in a higher maintenance
+overhead than the simpler top-down structure of ElementTree. What this means
+is: the more of your code runs in Python, the less you can benefit from the
+speed of lxml and libxml2. Note, however, that this is true for most
+performance-critical Python applications. No one would implement Fourier
+transformations in pure Python when NumPy is available.
+
+The up side then is that lxml provides powerful tools like tree iterators,
+XPath and XSLT, that can handle complex operations at the speed of C. Their
+pythonic API in lxml makes them so flexible that most applications can easily
+benefit from them.
+
+
+How to read the timings
+=======================
+
+The statements made here are backed by the (micro-)benchmark scripts
+`bench_etree.py`_, `bench_xpath.py`_ and `bench_objectify.py`_ that come with
+the lxml source distribution. They are distributed under the same BSD license
+as lxml itself, and the lxml project would like to promote them as a general
+benchmarking suite for all ElementTree implementations. New benchmarks are
+very easy to add as tiny test methods, so if you write a performance test for
+a specific part of the API yourself, please consider sending it to the lxml
+mailing list.
+
+The timings presented below compare lxml 3.1.1 (with libxml2 2.9.0) to the
+latest released versions of ElementTree (with cElementTree as accelerator
+module) in the standard library of CPython 3.3.0. They were run
+single-threaded on a 2.9GHz 64bit double core Intel i7 machine under
+Ubuntu Linux 12.10 (Quantal). The C libraries were compiled with the
+same platform specific optimisation flags. The Python interpreter was
+also manually compiled for the platform. Note that many of the following
+ElementTree timings are therefore better than what a normal Python
+installation with the standard library (c)ElementTree modules would yield.
+Note also that CPython 2.7 and 3.2+ come with a newer ElementTree version,
+so older Python installations will not perform as well for (c)ElementTree,
+and sometimes substantially worse.
+
+.. _`bench_etree.py`: https://github.com/lxml/lxml/blob/master/benchmark/bench_etree.py
+.. _`bench_xpath.py`: https://github.com/lxml/lxml/blob/master/benchmark/bench_xpath.py
+.. _`bench_objectify.py`: https://github.com/lxml/lxml/blob/master/benchmark/bench_objectify.py
+
+The scripts run a number of simple tests on the different libraries, using
+different XML tree configurations: different tree sizes (T1-4), with or
+without attributes (-/A), with or without ASCII string or unicode text
+(-/S/U), and either against a tree or its serialised XML form (T/X). In the
+result extracts cited below, T1 refers to a 3-level tree with many children at
+the third level, T2 is swapped around to have many children below the root
+element, T3 is a deep tree with few children at each level and T4 is a small
+tree, slightly broader than deep. If repetition is involved, this usually
+means running the benchmark in a loop over all children of the tree root;
+otherwise, the operation is run on the root node (C/R).
+
+As an example, the character code ``(SATR T1)`` states that the benchmark was
+running for tree T1, with plain string text (S) and attributes (A). It was
+run against the root element (R) in the tree structure of the data (T).
+
+Note that very small operations are repeated in integer loops to make them
+measurable. It is therefore not always possible to compare the absolute
+timings of, say, a single access benchmark (which usually loops) and a 'get
+all in one step' benchmark, which already takes enough time to be measurable
+and is therefore measured as is. An example is the index access to a single
+child, which cannot be compared to the timings for ``getchildren()``. Take a
+look at the concrete benchmarks in the scripts to understand how the numbers
+compare.
+
+
+Parsing and Serialising
+=======================
+
+Serialisation is an area where lxml excels. The reason is that it
+executes entirely at the C level, without any interaction with Python
+code. The results are rather impressive, especially for UTF-8, which
+is native to libxml2. While 20 to 40 times faster than (c)ElementTree
+1.2 (which was part of the standard library before Python 2.7/3.2),
+lxml is still more than 10 times as fast as the much improved
+ElementTree 1.3 in recent Python versions::
+
+ lxe: tostring_utf16 (S-TR T1) 7.9958 msec/pass
+ cET: tostring_utf16 (S-TR T1) 83.1358 msec/pass
+
+ lxe: tostring_utf16 (UATR T1) 8.3222 msec/pass
+ cET: tostring_utf16 (UATR T1) 84.4688 msec/pass
+
+ lxe: tostring_utf16 (S-TR T2) 8.2297 msec/pass
+ cET: tostring_utf16 (S-TR T2) 87.3415 msec/pass
+
+ lxe: tostring_utf8 (S-TR T2) 6.5677 msec/pass
+ cET: tostring_utf8 (S-TR T2) 76.2064 msec/pass
+
+ lxe: tostring_utf8 (U-TR T3) 1.1952 msec/pass
+ cET: tostring_utf8 (U-TR T3) 22.0058 msec/pass
+
+The difference is somewhat smaller for plain text serialisation::
+
+ lxe: tostring_text_ascii (S-TR T1) 2.7738 msec/pass
+ cET: tostring_text_ascii (S-TR T1) 4.7629 msec/pass
+
+ lxe: tostring_text_ascii (S-TR T3) 0.8273 msec/pass
+ cET: tostring_text_ascii (S-TR T3) 1.5273 msec/pass
+
+ lxe: tostring_text_utf16 (S-TR T1) 2.7659 msec/pass
+ cET: tostring_text_utf16 (S-TR T1) 10.5038 msec/pass
+
+ lxe: tostring_text_utf16 (U-TR T1) 2.8017 msec/pass
+ cET: tostring_text_utf16 (U-TR T1) 10.5207 msec/pass
+
+The ``tostring()`` function also supports serialisation to a Python
+unicode string object, which is currently faster in ElementTree
+under CPython 3.3::
+
+ lxe: tostring_text_unicode (S-TR T1) 2.6896 msec/pass
+ cET: tostring_text_unicode (S-TR T1) 1.0056 msec/pass
+
+ lxe: tostring_text_unicode (U-TR T1) 2.7366 msec/pass
+ cET: tostring_text_unicode (U-TR T1) 1.0154 msec/pass
+
+ lxe: tostring_text_unicode (S-TR T3) 0.7997 msec/pass
+ cET: tostring_text_unicode (S-TR T3) 0.3154 msec/pass
+
+ lxe: tostring_text_unicode (U-TR T4) 0.0048 msec/pass
+ cET: tostring_text_unicode (U-TR T4) 0.0160 msec/pass
+
+For parsing, lxml.etree and cElementTree compete for the medal.
+Depending on the input, either of the two can be faster. The (c)ET
+libraries use a very thin layer on top of the expat parser, which is
+known to be very fast. Here are some timings from the benchmarking
+suite::
+
+ lxe: parse_bytesIO (SAXR T1) 13.0246 msec/pass
+ cET: parse_bytesIO (SAXR T1) 8.2929 msec/pass
+
+ lxe: parse_bytesIO (S-XR T3) 1.3542 msec/pass
+ cET: parse_bytesIO (S-XR T3) 2.4023 msec/pass
+
+ lxe: parse_bytesIO (UAXR T3) 7.5610 msec/pass
+ cET: parse_bytesIO (UAXR T3) 11.2455 msec/pass
+
+And another couple of timings `from a benchmark`_ that Fredrik Lundh
+`used to promote cElementTree`_, comparing a number of different
+parsers. First, parsing a 274KB XML file containing Shakespeare's
+Hamlet::
+
+ xml.etree.ElementTree.parse done in 0.017 seconds
+ xml.etree.cElementTree.parse done in 0.007 seconds
+ xml.etree.cElementTree.XMLParser.feed(): 6636 nodes read in 0.007 seconds
+ lxml.etree.parse done in 0.003 seconds
+ drop_whitespace.parse done in 0.003 seconds
+ lxml.etree.XMLParser.feed(): 6636 nodes read in 0.004 seconds
+ minidom tree read in 0.080 seconds
+
+And a 3.4MB XML file containing the Old Testament::
+
+ xml.etree.ElementTree.parse done in 0.038 seconds
+ xml.etree.cElementTree.parse done in 0.030 seconds
+ xml.etree.cElementTree.XMLParser.feed(): 25317 nodes read in 0.030 seconds
+ lxml.etree.parse done in 0.016 seconds
+ drop_whitespace.parse done in 0.015 seconds
+ lxml.etree.XMLParser.feed(): 25317 nodes read in 0.022 seconds
+ minidom tree read in 0.288 seconds
+
+.. _`from a benchmark`: http://svn.effbot.org/public/elementtree-1.3/benchmark.py
+.. _`used to promote cElementTree`: http://effbot.org/zone/celementtree.htm#benchmarks
+
+Here are the same benchmarks again, but including the memory usage
+of the process in KB before and after parsing (using os.fork() to
+make sure we start from a clean state each time). For the 274KB
+hamlet.xml file::
+
+ Memory usage: 7284
+ xml.etree.ElementTree.parse done in 0.017 seconds
+ Memory usage: 9432 (+2148)
+ xml.etree.cElementTree.parse done in 0.007 seconds
+ Memory usage: 9432 (+2152)
+ xml.etree.cElementTree.XMLParser.feed(): 6636 nodes read in 0.007 seconds
+ Memory usage: 9448 (+2164)
+ lxml.etree.parse done in 0.003 seconds
+ Memory usage: 11032 (+3748)
+ drop_whitespace.parse done in 0.003 seconds
+ Memory usage: 10224 (+2940)
+ lxml.etree.XMLParser.feed(): 6636 nodes read in 0.004 seconds
+ Memory usage: 11804 (+4520)
+ minidom tree read in 0.080 seconds
+ Memory usage: 12324 (+5040)
+
+And for the 3.4MB Old Testament XML file::
+
+ Memory usage: 10420
+ xml.etree.ElementTree.parse done in 0.038 seconds
+ Memory usage: 20660 (+10240)
+ xml.etree.cElementTree.parse done in 0.030 seconds
+ Memory usage: 20660 (+10240)
+ xml.etree.cElementTree.XMLParser.feed(): 25317 nodes read in 0.030 seconds
+ Memory usage: 20844 (+10424)
+ lxml.etree.parse done in 0.016 seconds
+ Memory usage: 27624 (+17204)
+ drop_whitespace.parse done in 0.015 seconds
+ Memory usage: 24468 (+14052)
+ lxml.etree.XMLParser.feed(): 25317 nodes read in 0.022 seconds
+ Memory usage: 29844 (+19424)
+ minidom tree read in 0.288 seconds
+ Memory usage: 28788 (+18368)
+
+As can be seen from the sizes, both lxml.etree and cElementTree are
+rather memory friendly compared to the pure Python libraries
+ElementTree and (especially) minidom. Compared to older CPython
+versions, the memory footprint of the minidom library was considerably
+reduced in CPython 3.3, by about a factor of 4 in this case.
+
+For plain parser performance, lxml.etree and cElementTree tend to stay
+rather close to each other, usually within a factor of two, with
+winners well distributed over both sides. Similar timings can be
+observed for the ``iterparse()`` function::
+
+ lxe: iterparse_bytesIO (SAXR T1) 17.9198 msec/pass
+ cET: iterparse_bytesIO (SAXR T1) 14.4982 msec/pass
+
+ lxe: iterparse_bytesIO (UAXR T3) 8.8522 msec/pass
+ cET: iterparse_bytesIO (UAXR T3) 12.9857 msec/pass
+
+However, if you benchmark the complete round-trip of a serialise-parse
+cycle, the numbers will look similar to these::
+
+ lxe: write_utf8_parse_bytesIO (S-TR T1) 19.8867 msec/pass
+ cET: write_utf8_parse_bytesIO (S-TR T1) 80.7259 msec/pass
+
+ lxe: write_utf8_parse_bytesIO (UATR T2) 23.7896 msec/pass
+ cET: write_utf8_parse_bytesIO (UATR T2) 98.0766 msec/pass
+
+ lxe: write_utf8_parse_bytesIO (S-TR T3) 3.0684 msec/pass
+ cET: write_utf8_parse_bytesIO (S-TR T3) 24.6122 msec/pass
+
+ lxe: write_utf8_parse_bytesIO (SATR T4) 0.3495 msec/pass
+ cET: write_utf8_parse_bytesIO (SATR T4) 1.9610 msec/pass
+
+For applications that require a high parser throughput for large files,
+and that do little to no serialisation, both cET and lxml.etree are a
+good choice. The cET library is particularly fast for iterparse
+applications that extract small amounts of data or aggregate
+information from large XML data sets that do not fit into memory. When
+it comes to round-trip performance, however, lxml is multiple times
+faster in total. So, whenever the input documents are not
+considerably larger than the output, lxml is the clear winner.
+
+Regarding HTML parsing, Ian Bicking has done some `benchmarking on
+lxml's HTML parser`_, comparing it to a number of other famous HTML
+parser tools for Python. lxml wins this contest by quite a length.
+To give an idea, the numbers suggest that lxml.html can run a couple
+of parse-serialise cycles in the time that other tools need for
+parsing alone. The comparison even shows some very favourable results
+regarding memory consumption.
+
+.. _`benchmarking on lxml's HTML parser`: http://blog.ianbicking.org/2008/03/30/python-html-parser-performance/
+
+Liza Daly has written an article that presents a couple of tweaks to
+get the most out of lxml's parser for very large XML documents. She
+quite favourably positions ``lxml.etree`` as a tool for
+`high-performance XML parsing`_.
+
+.. _`high-performance XML parsing`: http://www.ibm.com/developerworks/xml/library/x-hiperfparse/
+
+Finally, `xml.com`_ has a couple of publications about XML parser
+performance. Farwick and Hafner have written two interesting articles
+that compare the parser of libxml2 to some major Java-based XML
+parsers. One deals with `event-driven parser performance`_, the other
+one presents `benchmark results comparing DOM parsers`_. Both
+comparisons suggest that libxml2's parser performance is clearly
+superior to all commonly used Java parsers in almost all cases. Note
+that the C parser benchmark results are based on xmlbench_, which uses
+a simpler setup for libxml2 than lxml does.
+
+.. _`xml.com`: http://www.xml.com/
+.. _`event-driven parser performance`: http://www.xml.com/lpt/a/1702
+.. _`benchmark results comparing DOM parsers`: http://www.xml.com/lpt/a/1703
+.. _xmlbench: http://xmlbench.sourceforge.net/
+
+
+The ElementTree API
+===================
+
+Since all three libraries implement the same API, their performance is
+easy to compare in this area. A major disadvantage for lxml's
+performance is the different tree model that underlies libxml2. It
+allows lxml to provide parent pointers for elements and full XPath
+support, but also increases the overhead of tree building and
+restructuring. This can be seen from the tree setup times of the
+benchmark (given in seconds)::
+
+ lxe: -- S- U- -A SA UA
+ T1: 0.0299 0.0343 0.0344 0.0293 0.0345 0.0342
+ T2: 0.0368 0.0423 0.0418 0.0427 0.0474 0.0459
+ T3: 0.0088 0.0084 0.0086 0.0251 0.0258 0.0261
+ T4: 0.0002 0.0002 0.0002 0.0005 0.0006 0.0006
+ cET: -- S- U- -A SA UA
+ T1: 0.0050 0.0045 0.0093 0.0044 0.0043 0.0043
+ T2: 0.0073 0.0075 0.0074 0.0201 0.0075 0.0074
+ T3: 0.0033 0.0213 0.0032 0.0034 0.0033 0.0035
+ T4: 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000
+
+The timings are somewhat close to each other, although cET can be
+several times faster than lxml for larger trees. One of the
+reasons is that lxml must encode incoming string data and tag names
+into UTF-8, and additionally discard the created Python elements
+after their use, when they are no longer referenced. ElementTree
+represents the tree itself through these objects, which reduces
+the overhead in creating them.
+
+
+Child access
+------------
+
+The same tree overhead makes operations like collecting children as in
+``list(element)`` more costly in lxml. Where cET can quickly create
+a shallow copy of its list of children, lxml has to create a Python
+object for each child and collect them in a list::
+
+ lxe: root_list_children (--TR T1) 0.0038 msec/pass
+ cET: root_list_children (--TR T1) 0.0010 msec/pass
+
+ lxe: root_list_children (--TR T2) 0.0455 msec/pass
+ cET: root_list_children (--TR T2) 0.0050 msec/pass
+
+This handicap is also visible when accessing single children::
+
+ lxe: first_child (--TR T2) 0.0424 msec/pass
+ cET: first_child (--TR T2) 0.0384 msec/pass
+
+ lxe: last_child (--TR T1) 0.0477 msec/pass
+ cET: last_child (--TR T1) 0.0467 msec/pass
+
+... unless you also add the time it takes to find a child at a given index
+in a bigger list. ET and cET use Python lists here, which are based on arrays.
+The data structure used by libxml2 is a linked tree, and thus, a
+linked list of children::
+
+ lxe: middle_child (--TR T1) 0.0710 msec/pass
+ cET: middle_child (--TR T1) 0.0420 msec/pass
+
+ lxe: middle_child (--TR T2) 1.7393 msec/pass
+ cET: middle_child (--TR T2) 0.0396 msec/pass
+
+
+Element creation
+----------------
+
+As opposed to ET, libxml2 has a notion of documents that each element must be
+in. This results in a major performance difference for creating independent
+Elements that end up in independently created documents::
+
+ lxe: create_elements (--TC T2) 1.0045 msec/pass
+ cET: create_elements (--TC T2) 0.0753 msec/pass
+
+Therefore, it is always preferable to create Elements for the document they
+are supposed to end up in, either as SubElements of an Element or using the
+explicit ``Element.makeelement()`` call::
+
+ lxe: makeelement (--TC T2) 1.0586 msec/pass
+ cET: makeelement (--TC T2) 0.1483 msec/pass
+
+ lxe: create_subelements (--TC T2) 0.8826 msec/pass
+ cET: create_subelements (--TC T2) 0.0827 msec/pass
+
+So, if the main performance bottleneck of an application is creating large XML
+trees in memory through calls to Element and SubElement, cET is the best
+choice. Note, however, that the serialisation performance may even out this
+advantage, especially for smaller trees and trees with many attributes.
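+
+To make the recommendation concrete, here is a minimal sketch (not part
+of the benchmark) of the two document-friendly ways to create content:
+
+.. sourcecode:: python
+
+    from lxml import etree
+
+    root = etree.Element("root")
+
+    # create the child directly inside its target document
+    child = etree.SubElement(root, "child")
+    child.text = "content"
+
+    # or create it detached, but within the same document context
+    other = root.makeelement("other", {"id": "1"})
+    root.append(other)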
+
+
+Merging different sources
+-------------------------
+
+A critical action for lxml is moving elements between document contexts. It
+requires lxml to do recursive adaptations throughout the moved tree structure.
+
+The following benchmark appends all root children of the second tree to the
+root of the first tree::
+
+ lxe: append_from_document (--TR T1,T2) 1.0812 msec/pass
+ cET: append_from_document (--TR T1,T2) 0.1104 msec/pass
+
+ lxe: append_from_document (--TR T3,T4) 0.0155 msec/pass
+ cET: append_from_document (--TR T3,T4) 0.0060 msec/pass
+
+Although these are fairly small numbers compared to parsing, this easily shows
+the different performance classes for lxml and (c)ET. Where the latter do not
+have to care about parent pointers and tree structures, lxml has to do a deep
+traversal of the appended tree. The performance difference therefore increases
+with the size of the tree that is moved.
+
+This difference is not always as visible, but applies to most parts of the
+API, like inserting newly created elements::
+
+ lxe: insert_from_document (--TR T1,T2) 3.9763 msec/pass
+ cET: insert_from_document (--TR T1,T2) 0.1459 msec/pass
+
+or replacing the child slice by a newly created element::
+
+ lxe: replace_children_element (--TC T1) 0.0749 msec/pass
+ cET: replace_children_element (--TC T1) 0.0081 msec/pass
+
+as opposed to replacing the slice with an existing element from the
+same document::
+
+ lxe: replace_children (--TC T1) 0.0052 msec/pass
+ cET: replace_children (--TC T1) 0.0036 msec/pass
+
+While these numbers are too small to have a major impact on performance
+in practice, you should keep this difference in mind when you
+merge very large trees. Note that Elements have a ``makeelement()``
+method that allows creating an Element within the same document,
+thus avoiding the merge overhead when inserting it into that tree.
+
+
+deepcopy
+--------
+
+Deep copying a tree is fast in lxml::
+
+ lxe: deepcopy_all (--TR T1) 3.1650 msec/pass
+ cET: deepcopy_all (--TR T1) 53.9973 msec/pass
+
+ lxe: deepcopy_all (-ATR T2) 3.7365 msec/pass
+ cET: deepcopy_all (-ATR T2) 61.6267 msec/pass
+
+ lxe: deepcopy_all (S-TR T3) 0.7913 msec/pass
+ cET: deepcopy_all (S-TR T3) 13.6220 msec/pass
+
+So, for example, if you have a database-like scenario where you parse in a
+large tree and then search and copy independent subtrees from it for further
+processing, lxml is by far the best choice here.
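+
+A self-contained sketch of such a scenario (the document is made up for
+illustration):
+
+.. sourcecode:: python
+
+    import copy
+    from lxml import etree
+
+    root = etree.XML(
+        "<db><rec id='1'><v>a</v></rec><rec id='2'><v>b</v></rec></db>")
+
+    # copy each record out of the large tree for independent processing
+    records = [copy.deepcopy(rec) for rec in root.iter("rec")]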
+
+
+Tree traversal
+--------------
+
+Another important area in XML processing is iteration for tree
+traversal. If your algorithms can benefit from step-by-step
+traversal of the XML tree and especially if few elements are of
+interest or the target element tag name is known, the ``.iter()``
+method is a good choice::
+
+ lxe: iter_all (--TR T1) 1.0529 msec/pass
+ cET: iter_all (--TR T1) 0.2635 msec/pass
+
+ lxe: iter_islice (--TR T2) 0.0110 msec/pass
+ cET: iter_islice (--TR T2) 0.0050 msec/pass
+
+ lxe: iter_tag (--TR T2) 0.0079 msec/pass
+ cET: iter_tag (--TR T2) 0.0112 msec/pass
+
+ lxe: iter_tag_all (--TR T2) 0.1822 msec/pass
+ cET: iter_tag_all (--TR T2) 0.5343 msec/pass
+
+This translates directly into similar timings for ``Element.findall()``::
+
+ lxe: findall (--TR T2) 1.7176 msec/pass
+ cET: findall (--TR T2) 0.9973 msec/pass
+
+ lxe: findall (--TR T3) 0.3967 msec/pass
+ cET: findall (--TR T3) 0.2525 msec/pass
+
+ lxe: findall_tag (--TR T2) 0.2258 msec/pass
+ cET: findall_tag (--TR T2) 0.5770 msec/pass
+
+ lxe: findall_tag (--TR T3) 0.1085 msec/pass
+ cET: findall_tag (--TR T3) 0.1919 msec/pass
+
+Note that all three libraries currently use the same Python
+implementation for ``.findall()``; they differ only in the native tree
+iterator (``element.iter()``) that it builds on. In general, lxml is
+very fast for iteration, but loses ground against cET when many Elements
+are found and need to be instantiated. So, the more selective
+your search is, the faster lxml will run.
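+
+As a small, self-contained illustration (not taken from the benchmark
+suite) of such selective iteration:
+
+.. sourcecode:: python
+
+    from lxml import etree
+
+    root = etree.XML("<root><item>1</item><x/><item>2</item></root>")
+
+    # only the matching elements are instantiated as Python objects
+    values = [el.text for el in root.iter("item")]
+
+    # findall() is implemented on top of the same tree iterator
+    same = [el.text for el in root.findall(".//item")]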
+
+
+XPath
+=====
+
+The following timings are based on the benchmark script `bench_xpath.py`_.
+
+This part of lxml does not have an equivalent in ElementTree. However, lxml
+provides more than one way of accessing it and you should take care which part
+of the lxml API you use. The most straightforward way is to call the
+``xpath()`` method on an Element or ElementTree::
+
+ lxe: xpath_method (--TC T1) 0.3982 msec/pass
+ lxe: xpath_method (--TC T2) 7.8895 msec/pass
+ lxe: xpath_method (--TC T3) 0.0477 msec/pass
+ lxe: xpath_method (--TC T4) 0.3982 msec/pass
+
+This is well suited for testing and when the XPath expressions are as diverse
+as the trees they are called on. However, if you have a single XPath
+expression that you want to apply to a larger number of different elements,
+the ``XPath`` class is the most efficient way to do it::
+
+ lxe: xpath_class (--TC T1) 0.0713 msec/pass
+ lxe: xpath_class (--TC T2) 1.1325 msec/pass
+ lxe: xpath_class (--TC T3) 0.0215 msec/pass
+ lxe: xpath_class (--TC T4) 0.0722 msec/pass
+
+Note that this still allows you to use variables in the expression, so you can
+parse it once and then adapt it through variables at call time. In other
+cases, where you have a fixed Element or ElementTree and want to run different
+expressions on it, you should consider the ``XPathEvaluator``::
+
+ lxe: xpath_element (--TR T1) 0.1101 msec/pass
+ lxe: xpath_element (--TR T2) 2.0473 msec/pass
+ lxe: xpath_element (--TR T3) 0.0267 msec/pass
+ lxe: xpath_element (--TR T4) 0.1087 msec/pass
+
+While the evaluator looks slightly slower, creating an XPath object for each of the
+expressions generates a much higher overhead here::
+
+ lxe: xpath_class_repeat (--TC T1 ) 0.3884 msec/pass
+ lxe: xpath_class_repeat (--TC T2 ) 7.6182 msec/pass
+ lxe: xpath_class_repeat (--TC T3 ) 0.0465 msec/pass
+ lxe: xpath_class_repeat (--TC T4 ) 0.3877 msec/pass
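+
+For illustration only (not part of the benchmark), a minimal sketch of
+the three access styles discussed above, including a variable that is
+bound at call time:
+
+.. sourcecode:: python
+
+    from lxml import etree
+
+    root = etree.XML("<root><a n='1'/><a n='2'/></root>")
+
+    # ad-hoc evaluation: parse and run the expression in one call
+    root.xpath("//a[@n = '2']")
+
+    # compile once, evaluate many times; $n is bound per call
+    find_a = etree.XPath("//a[@n = $n]")
+    find_a(root, n="2")
+
+    # bind the evaluator to one tree and vary the expression
+    evaluator = etree.XPathEvaluator(root)
+    evaluator("count(//a)")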
+
+Note that tree iteration can be substantially faster than XPath if
+your code short-circuits after the first couple of elements have
+been found. The XPath engine will always return the complete result set,
+regardless of how much of it will actually be used.
+
+Here is an example where only the first matching element is being
+searched, a case for which XPath has syntax support as well::
+
+ lxe: find_single (--TR T2) 0.0184 msec/pass
+ cET: find_single (--TR T2) 0.0052 msec/pass
+
+ lxe: iter_single (--TR T2) 0.0024 msec/pass
+ cET: iter_single (--TR T2) 0.0007 msec/pass
+
+ lxe: xpath_single (--TR T2) 0.0033 msec/pass
+
+When looking for the first two elements out of many, the numbers
+explode for XPath, as restricting the result subset requires a
+more complex expression::
+
+ lxe: iterfind_two (--TR T2) 0.0184 msec/pass
+ cET: iterfind_two (--TR T2) 0.0062 msec/pass
+
+ lxe: iter_two (--TR T2) 0.0029 msec/pass
+ cET: iter_two (--TR T2) 0.0017 msec/pass
+
+ lxe: xpath_two (--TR T2) 0.2768 msec/pass
+
+
+A longer example
+================
+
+... based on lxml 1.3.
+
+A while ago, Uche Ogbuji posted a `benchmark proposal`_ that would
+read in a 3MB XML version of the `Old Testament`_ of the Bible and
+look for the word *begat* in all verses. Apparently, it is contained
+in 120 out of almost 24000 verses. This is easy to implement in
+ElementTree using ``findall()``. However, the fastest and most memory
+friendly way to do this is obviously ``iterparse()``, as most of the
+data is not of any interest.
+
+.. _`benchmark proposal`: http://www.onlamp.com/pub/wlg/6291
+.. _`Old Testament`: http://www.ibiblio.org/bosak/xml/eg/religion.2.00.xml.zip
+
+Now, Uche's original proposal was more or less the following:
+
+.. sourcecode:: python
+
+ def bench_ET():
+ tree = ElementTree.parse("ot.xml")
+ result = []
+ for v in tree.findall("//v"):
+ text = v.text
+ if 'begat' in text:
+ result.append(text)
+ return len(result)
+
+which takes about one second on my machine today. The faster ``iterparse()``
+variant looks like this:
+
+.. sourcecode:: python
+
+ def bench_ET_iterparse():
+ result = []
+ for event, v in ElementTree.iterparse("ot.xml"):
+ if v.tag == 'v':
+ text = v.text
+ if 'begat' in text:
+ result.append(text)
+ v.clear()
+ return len(result)
+
+The improvement is about 10%. At the time I first tried (early 2006), lxml
+didn't have ``iterparse()`` support, but the ``findall()`` variant was already
+faster than ElementTree. This changes immediately when you switch to
+cElementTree. The latter only needs 0.17 seconds to do the trick today, and
+an impressive 0.10 seconds when running the iterparse version. And
+even back then, it was quite a bit faster than what lxml could achieve.
+
+Since then, lxml has matured a lot and has gotten much faster. The iterparse
+variant now runs in 0.14 seconds, and if you remove the ``v.clear()``, it is
+even a little faster (which isn't the case for cElementTree).
+
+One of the many great tools in lxml is XPath, a Swiss army knife for finding
+things in XML documents. It is possible to move the whole thing to a pure
+XPath implementation, which looks like this:
+
+.. sourcecode:: python
+
+ def bench_lxml_xpath_all():
+ tree = etree.parse("ot.xml")
+ result = tree.xpath("//v[contains(., 'begat')]/text()")
+ return len(result)
+
+This runs in about 0.13 seconds and is about the shortest possible
+implementation (in lines of Python code) that I could come up with. Now, this
+is already a rather complex XPath expression compared to the simple "//v"
+ElementPath expression we started with. Since this is also valid XPath, let's
+try this instead:
+
+.. sourcecode:: python
+
+ def bench_lxml_xpath():
+ tree = etree.parse("ot.xml")
+ result = []
+ for v in tree.xpath("//v"):
+ text = v.text
+ if 'begat' in text:
+ result.append(text)
+ return len(result)
+
+This gets us down to 0.12 seconds, thus showing that a generic XPath
+evaluation engine cannot always compete with a simpler, tailored solution.
+However, since this is not much different from the original findall variant,
+we can remove the complexity of the XPath call completely and just go with
+what we had in the beginning. Under lxml, this runs in the same 0.12 seconds.
+
+But there is one thing left to try. We can replace the simple ElementPath
+expression with a native tree iterator:
+
+.. sourcecode:: python
+
+ def bench_lxml_getiterator():
+ tree = etree.parse("ot.xml")
+ result = []
+ for v in tree.getiterator("v"):
+ text = v.text
+ if 'begat' in text:
+ result.append(text)
+ return len(result)
+
+This implements the same thing, just without the overhead of parsing and
+evaluating a path expression. And this makes it another bit faster, down to
+0.11 seconds. For comparison, cElementTree runs this version in 0.17 seconds.
+
+So, what have we learned?
+
+* Python code is not slow. The pure XPath solution was not even as fast as
+  the first-shot Python implementation. In general, a few more lines in
+ Python make things more readable, which is much more important than the last
+ 5% of performance.
+
+* It's important to know the available options - and it's worth starting with
+ the most simple one. In this case, a programmer would then probably have
+ started with ``getiterator("v")`` or ``iterparse()``. Either of them would
+ already have been the most efficient, depending on which library is used.
+
+* It's important to know your tool. lxml and cElementTree are both very fast
+ libraries, but they do not have the same performance characteristics. The
+ fastest solution in one library can be comparatively slow in the other. If
+ you optimise, optimise for the specific target platform.
+
+* It's not always worth optimising. After all that hassle, we got from 0.12
+ seconds for the initial implementation to 0.11 seconds. Switching over to
+ cElementTree and writing an ``iterparse()`` based version would have given
+ us 0.10 seconds - not a big difference for 3MB of XML.
+
+* Pay attention to which operation really dominates in your use case. If we split
+ up the operations, we can see that lxml is slightly slower than cElementTree
+ on ``parse()`` (both about 0.06 seconds), but more visibly slower on
+ ``iterparse()``: 0.07 versus 0.10 seconds. However, tree iteration in lxml
+ is incredibly fast, so it can be better to parse the whole tree and then
+ iterate over it rather than using ``iterparse()`` to do both in one step.
+ Or, you can just wait for the lxml developers to optimise iterparse in one
+ of the next releases...
+
+
+lxml.objectify
+==============
+
+The following timings are based on the benchmark script `bench_objectify.py`_.
+
+Objectify is a data-binding API for XML based on lxml.etree that was added in
+version 1.1. It uses standard Python attribute access to traverse the XML
+tree. It also features ObjectPath, a fast path language based on the same
+idea.
+
+Just like lxml.etree, lxml.objectify creates Python representations of
+elements on the fly. To save memory, the normal Python garbage collection
+mechanisms will discard them when their last reference is gone. In cases
+where deeply nested elements are frequently accessed through the objectify
+API, the create-discard cycles can become a bottleneck, as elements have to be
+instantiated over and over again.
+
+
+ObjectPath
+----------
+
+ObjectPath can be used to speed up the access to elements that are deep in the
+tree. It avoids step-by-step Python element instantiations along the path,
+which can substantially improve the access time::
+
+ lxe: attribute (--TR T1) 4.1828 msec/pass
+ lxe: attribute (--TR T2) 17.3802 msec/pass
+ lxe: attribute (--TR T4) 3.8657 msec/pass
+
+ lxe: objectpath (--TR T1) 0.9289 msec/pass
+ lxe: objectpath (--TR T2) 13.3109 msec/pass
+ lxe: objectpath (--TR T4) 0.9289 msec/pass
+
+ lxe: attributes_deep (--TR T1) 6.2900 msec/pass
+ lxe: attributes_deep (--TR T2) 20.4713 msec/pass
+ lxe: attributes_deep (--TR T4) 6.1679 msec/pass
+
+ lxe: objectpath_deep (--TR T1) 1.3049 msec/pass
+ lxe: objectpath_deep (--TR T2) 14.0815 msec/pass
+ lxe: objectpath_deep (--TR T4) 1.3051 msec/pass
+
+Note, however, that parsing ObjectPath expressions is not for free either, so
+this is most effective for frequently accessing the same element.
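+
+A short sketch of the difference, using a tiny hand-made document for
+illustration:
+
+.. sourcecode:: python
+
+    from lxml import objectify
+
+    root = objectify.fromstring(
+        "<root><a><b><c>42</c></b></a></root>")
+
+    # plain attribute access instantiates a Python proxy at every step
+    print(root.a.b.c)
+
+    # a pre-parsed ObjectPath jumps to the target element directly
+    path = objectify.ObjectPath("root.a.b.c")
+    print(path(root))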
+
+
+Caching Elements
+----------------
+
+A way to improve the normal attribute access time is static instantiation of
+the Python objects, thus trading memory for speed. Just create a cache
+dictionary and run:
+
+.. sourcecode:: python
+
+ cache[root] = list(root.iter())
+
+after parsing and:
+
+.. sourcecode:: python
+
+ del cache[root]
+
+when you are done with the tree. This will keep the Python element
+representations of all elements alive and thus avoid the overhead of repeated
+Python object creation. You can also consider using filters or generator
+expressions to be more selective. By choosing the right trees (or even
+subtrees and elements) to cache, you can trade memory usage against access
+speed::
+
+ lxe: attribute_cached (--TR T1) 3.1357 msec/pass
+ lxe: attribute_cached (--TR T2) 15.8911 msec/pass
+ lxe: attribute_cached (--TR T4) 2.9194 msec/pass
+
+ lxe: attributes_deep_cached (--TR T1) 3.8984 msec/pass
+ lxe: attributes_deep_cached (--TR T2) 16.8300 msec/pass
+ lxe: attributes_deep_cached (--TR T4) 3.6936 msec/pass
+
+ lxe: objectpath_deep_cached (--TR T1) 0.7496 msec/pass
+ lxe: objectpath_deep_cached (--TR T2) 12.3763 msec/pass
+ lxe: objectpath_deep_cached (--TR T4) 0.7427 msec/pass
+
+Things to note: you cannot currently use ``weakref.WeakKeyDictionary`` objects
+for this as lxml's element objects do not support weak references (which are
+costly in terms of memory). Also note that new element objects that you add
+to these trees will not turn up in the cache automatically and will therefore
+still be garbage collected when all their Python references are gone, so this
+is most effective for largely immutable trees. You should consider using a
+set instead of a list in this case and add new elements by hand.
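+
+For example (a sketch, assuming ``from lxml import objectify``), the
+mutable variant of the cache could look like this:
+
+.. sourcecode:: python
+
+    cache[root] = set(root.iter())
+
+    # elements created later must be added to the cache by hand
+    new_el = objectify.SubElement(root, "extra")
+    cache[root].add(new_el)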
+
+
+Further optimisations
+---------------------
+
+Here are some more things to try if optimisation is required:
+
+* A lot of time is usually spent in tree traversal to find the addressed
+ elements in the tree. If you often work in subtrees, do what you would also
+ do with deep Python objects: assign the parent of the subtree to a variable
+ or pass it into functions instead of starting at the root. This allows
+  accessing its descendants more directly (see the short sketch after
+  this list).
+
+* Try assigning data values directly to attributes instead of passing them
+ through DataElement.
+
+* If you use custom data types that are costly to parse, try running
+ ``objectify.annotate()`` over read-only trees to speed up the attribute type
+ inference on read access.
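+
+A sketch of the first suggestion, using a made-up document and element
+names:
+
+.. sourcecode:: python
+
+    from lxml import objectify
+
+    root = objectify.fromstring(
+        "<root><data><series>"
+        "<entry>1</entry><entry>2</entry>"
+        "</series></data></root>")
+
+    # starting from the root repeats the traversal on every access
+    first = root.data.series.entry
+    second = root.data.series.entry[1]
+
+    # binding the subtree to a variable makes later accesses more direct
+    series = root.data.series
+    first = series.entry
+    second = series.entry[1]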
+
+Note that none of these measures is guaranteed to speed up your application.
+As usual, you should prefer readable code over premature optimisations and
+profile your expected use cases before bothering to apply optimisations at
+random.
diff --git a/doc/pubkey.asc b/doc/pubkey.asc
new file mode 100644
index 0000000..f72804c
--- /dev/null
+++ b/doc/pubkey.asc
@@ -0,0 +1,36 @@
+-----BEGIN PGP PUBLIC KEY BLOCK-----
+Version: GnuPG v1.4.2 (GNU/Linux)
+
+mQGiBEQf3JQRBACciSqxoX0q3VurkRENVVtG/pVqtFh/d2CohbVJlLCrO4s7nnPj
+CTfZFt6tmykZjsLJl24XpEJt0O/C0jLcaBqvXVgVvRXHz4DjEYYuQF4LPthhI4MA
+4T7ExptX4lU5g3BVJ46vPU8uRBbbxarBRas9rYewgnrYKWpZZCa7yMq+9wCgnyyR
+Si4E3viLwi77jda135nA6vcD/iqu8zIl9/dFuUcOvxJrhrm+UdY72puZ1TVczSAH
+GOqMjrKkfyHlaJh/ZzWENpTZIfOdVhy7Chvva18vH4Wz7jKj5UeIpRrBvjAD28r3
+Y3W5bfsnpPkvDOyU1vqBsw4q+/250GXEX0JqV2Rbf5yLVgEZPdGrswO460dr4UVS
+8RS0BACYTmyrz57AugHc5tRkqNw6o7ux2deOT0c3AbUcOWtOocGumCsUf+M1nOrc
+VWkeBWTv4HIIiecWYY/KwIemTthQGjxywaZDxOlBT0BOL/+vfYTq/plZULXr+g90
+rSe82+kLl9N5onkBDJKeDIcJDzRoxIRPV1i0Om/5JBI4jmUnv7QnU3RlZmFuIEJl
+aG5lbCA8c2NvZGVyQHVzZXJzLmJlcmxpb3MuZGU+iF8EExECACAFAkQiqKYCGwMG
+CwkIBwMCBBUCCAMEFgIDAQIeAQIXgAAKCRANPVNpCNOgHi+2AJ0a0JH8iP3RqrOL
+JefvHz1dSl3MxACYo7Ma6CeIgsGnyaSSdNOmNVXn+IhGBBARAgAGBQJEIqk0AAoJ
+ELO5mMzzmgZbmCcAoKZ2En1IlsxBpaPPxgWYrUOWfc6hAKCBWODMMOYptCBkSrjg
+m3gsrjHgYbQsU3RlZmFuIEJlaG5lbCA8c2NvZGVyQHVzZXJzLnNvdXJjZWZvcmdl
+Lm5ldD6IYAQTEQIAIAUCRB/clAIbAwYLCQgHAwIEFQIIAwQWAgMBAh4BAheAAAoJ
+EA09U2kI06Aen2YAn0hvuDs+Gslq9vPRFFbsFNJI40PmAJ0chjiiEy0xV5C+n6YX
+XFuldRDILYhGBBARAgAGBQJEIp4AAAoJELO5mMzzmgZbgKQAn3pWrmFdj8YaEyuR
+tEjKVZJDQ6ZVAJ0Y1igwADT40BPra+G/xiLa3YbCrrkCDQREH9ynEAgAiR4/0r0d
+doViNECfSLClllu5K0Bo1SEiMtvVNC3sJYgVzBddD8Xn8UAdjyAgmaL5FC2FsNQu
+RxxKkNlHNYCq8ZSWtZaL2MQ+SyMUyHv6VXVCGuSW0COpzbx58u+SZpjyESJ1kaZc
+73SaIw6kv/dVQHjurwmlo1lg3dLZ3PG08WGCYUMqkkv2K+J7+puzE2Cjo31gTq4s
+LYDCV26wjVQ6BqT2EcHQhVEjh0xq5ugc908cr/2FQAKkTifEbF+OVBGWiFMGgri+
+6+G54/BV/RakpvNCFYBiZHn/M9mQaWt7XoTmnEQ1ldq5KNlRhkqnQRF/NK5VpGcQ
+29As28aqpZTECwADBgf/WlRvBRI1Q1eIv2falEv7C6sOxqc3kr5z1uUBTRG5v9t6
+ff9k/J4oC6cnQx00GK3ZR8ija6bl8zwu+0m0M3rW49Krb1rsiT7r4ahOZ7p9RRro
+oG3NbUJYgMG10D1nxpaioYqa/m+PpILJM0wfYZZEuX0xkZcOB24yb+J7EIcGR09T
+mMd5sXtdTU+w/p7Xi2cP61uQ8qixyHBH8E06qgW2JtVFV9rGn7CNUOvkNaUBRnY5
+QxhdkvKJRx7voOLYWZFUBIWgto+6vmTgKmc2Ho6qddzME9UgwUNcknRgm0cf6Cxr
+6zPtxZl8a6KemjQcK7kARSmMNCDkqp/Pohe519A5vYhJBBgRAgAJBQJEH9ynAhsM
+AAoJEA09U2kI06Aesv4AnjiVQVLzqnNS/64vvMMP1UARY3HtAJ90YxNGhRNIhWYL
+UU16oJlGD/9M1Q==
+=gWy2
+-----END PGP PUBLIC KEY BLOCK-----
diff --git a/doc/resolvers.txt b/doc/resolvers.txt
new file mode 100644
index 0000000..6f0b320
--- /dev/null
+++ b/doc/resolvers.txt
@@ -0,0 +1,283 @@
+Document loading and URL resolving
+==================================
+
+.. contents::
+..
+ 1 XML Catalogs
+ 2 URI Resolvers
+ 3 Document loading in context
+ 4 I/O access control in XSLT
+
+
+The normal way to load external entities (such as DTDs) is by using
+XML catalogs. lxml also has support for user-provided document
+loaders in both the parsers and XSL transformations. These so-called
+resolvers are subclasses of the etree.Resolver class.
+
+..
+ >>> try: from StringIO import StringIO
+ ... except ImportError:
+ ... from io import BytesIO
+ ... def StringIO(s):
+ ... if isinstance(s, str): s = s.encode("UTF-8")
+ ... return BytesIO(s)
+
+
+XML Catalogs
+------------
+
+When loading an external entity for a document, e.g. a DTD, the parser
+is normally configured to prevent network access (see the
+``no_network`` parser option). Instead, it will try to load the
+entity from its local file system path or, in the most common case
+that the entity uses a network URL as reference, from a local XML
+catalog.
+
+`XML catalogs`_ are the preferred and agreed-on mechanism to load
+external entities from XML processors. Most tools will use them, so
+it is worth configuring them properly on a system. Many Linux
+installations use them by default, but on other systems they may need
+to be enabled manually. The `libxml2 site`_ has some documentation
+on `how to set up XML catalogs`_.
+
+.. _`XML catalogs`: http://www.oasis-open.org/committees/entity/spec.html
+.. _`libxml2 site`: http://xmlsoft.org/
+.. _`how to set up XML catalogs`: http://xmlsoft.org/catalog.html
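+
+As a practical note, libxml2 locates catalogs through the
+``XML_CATALOG_FILES`` environment variable. Here is a sketch of
+enabling a catalog for a Python process (the path is only an example):
+
+.. sourcecode:: python
+
+    import os
+    # must be set before libxml2 first needs a catalog
+    os.environ['XML_CATALOG_FILES'] = '/etc/xml/catalog'
+
+    from lxml import etree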
+
+
+URI Resolvers
+-------------
+
+Here is an example of a custom resolver:
+
+.. sourcecode:: pycon
+
+ >>> from lxml import etree
+
+ >>> class DTDResolver(etree.Resolver):
+ ... def resolve(self, url, id, context):
+ ... print("Resolving URL '%s'" % url)
+ ... return self.resolve_string(
+ ... '<!ENTITY myentity "[resolved text: %s]">' % url, context)
+
+This defines a resolver that always returns a dynamically generated DTD
+fragment defining an entity. The ``url`` argument passes the system URL of
+the requested document; the ``id`` argument is the public ID. Note that any
+of these may be None. The context object is not normally used by client code.
+
+Resolving is based on the methods of the Resolver object that build
+internal representations of the result document. The following
+methods exist:
+
+* ``resolve_string`` takes a parsable string as result document
+* ``resolve_filename`` takes a filename
+* ``resolve_file`` takes an open file-like object that has at least a read() method
+* ``resolve_empty`` resolves into an empty document
+
+The ``resolve()`` method may choose to return None, in which case the next
+registered resolver (or the default resolver) is consulted. Resolving always
+terminates if ``resolve()`` returns the result of any of the above
+``resolve_*()`` methods.
+
+Resolvers are registered local to a parser:
+
+.. sourcecode:: pycon
+
+ >>> parser = etree.XMLParser(load_dtd=True)
+ >>> parser.resolvers.add( DTDResolver() )
+
+Note that we instantiate a parser that loads the DTD. This is not done by the
+default parser, which does no validation. When we use this parser to parse a
+document that requires resolving a URL, it will call our custom resolver:
+
+.. sourcecode:: pycon
+
+ >>> xml = '<!DOCTYPE doc SYSTEM "MissingDTD.dtd"><doc>&myentity;</doc>'
+ >>> tree = etree.parse(StringIO(xml), parser)
+ Resolving URL 'MissingDTD.dtd'
+ >>> root = tree.getroot()
+ >>> print(root.text)
+ [resolved text: MissingDTD.dtd]
+
+The entity in the document was correctly resolved by the generated DTD
+fragment.
+
+
+Document loading in context
+---------------------------
+
+XML documents memorise their initial parser (and its resolvers) during their
+lifetime. This means that a lookup process related to a document will use
+the resolvers of the document's parser. We can demonstrate this with a
+resolver that only responds to a specific prefix:
+
+.. sourcecode:: pycon
+
+ >>> class PrefixResolver(etree.Resolver):
+ ... def __init__(self, prefix):
+ ... self.prefix = prefix
+ ... self.result_xml = '''\
+ ... <xsl:stylesheet
+ ... xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ ... <test xmlns="testNS">%s-TEST</test>
+ ... </xsl:stylesheet>
+ ... ''' % prefix
+ ... def resolve(self, url, pubid, context):
+ ... if url.startswith(self.prefix):
+ ... print("Resolved url %s as prefix %s" % (url, self.prefix))
+ ... return self.resolve_string(self.result_xml, context)
+
+We demonstrate this in XSLT and use the following stylesheet as an example:
+
+.. sourcecode:: pycon
+
+ >>> xml_text = """\
+ ... <xsl:stylesheet version="1.0"
+ ... xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ ... <xsl:include href="honk:test"/>
+ ... <xsl:template match="/">
+ ... <test>
+ ... <xsl:value-of select="document('hoi:test')/*/*/text()"/>
+ ... </test>
+ ... </xsl:template>
+ ... </xsl:stylesheet>
+ ... """
+
+Note that it needs to resolve two URIs: ``honk:test`` when compiling the XSLT
+document (i.e. when resolving ``xsl:import`` and ``xsl:include`` elements) and
+``hoi:test`` at transformation time, when calls to the ``document`` function
+are resolved. If we now register different resolvers with two different
+parsers, we can parse our document twice in different resolver contexts:
+
+.. sourcecode:: pycon
+
+ >>> hoi_parser = etree.XMLParser()
+ >>> normal_doc = etree.parse(StringIO(xml_text), hoi_parser)
+
+ >>> hoi_parser.resolvers.add( PrefixResolver("hoi") )
+ >>> hoi_doc = etree.parse(StringIO(xml_text), hoi_parser)
+
+ >>> honk_parser = etree.XMLParser()
+ >>> honk_parser.resolvers.add( PrefixResolver("honk") )
+ >>> honk_doc = etree.parse(StringIO(xml_text), honk_parser)
+
+These contexts are important for the further behaviour of the documents. They
+memorise their original parser so that the correct set of resolvers is used in
+subsequent lookups. To compile the stylesheet, XSLT must resolve the
+``honk:test`` URI in the ``xsl:include`` element. The ``hoi`` resolver cannot
+do that:
+
+.. sourcecode:: pycon
+
+ >>> transform = etree.XSLT(normal_doc)
+ Traceback (most recent call last):
+ ...
+ lxml.etree.XSLTParseError: Cannot resolve URI honk:test
+
+ >>> transform = etree.XSLT(hoi_doc)
+ Traceback (most recent call last):
+ ...
+ lxml.etree.XSLTParseError: Cannot resolve URI honk:test
+
+However, if we use the ``honk`` resolver associated with the respective
+document, everything works fine:
+
+.. sourcecode:: pycon
+
+ >>> transform = etree.XSLT(honk_doc)
+ Resolved url honk:test as prefix honk
+
+Running the transform accesses the same parser context again, but since it now
+needs to resolve the ``hoi`` URI in the call to the document function, its
+``honk`` resolver will fail to do so:
+
+.. sourcecode:: pycon
+
+ >>> result = transform(normal_doc)
+ Traceback (most recent call last):
+ ...
+ lxml.etree.XSLTApplyError: Cannot resolve URI hoi:test
+
+ >>> result = transform(hoi_doc)
+ Traceback (most recent call last):
+ ...
+ lxml.etree.XSLTApplyError: Cannot resolve URI hoi:test
+
+ >>> result = transform(honk_doc)
+ Traceback (most recent call last):
+ ...
+ lxml.etree.XSLTApplyError: Cannot resolve URI hoi:test
+
+This can only be solved by adding a ``hoi`` resolver to the original parser:
+
+.. sourcecode:: pycon
+
+ >>> honk_parser.resolvers.add( PrefixResolver("hoi") )
+ >>> result = transform(honk_doc)
+ Resolved url hoi:test as prefix hoi
+ >>> print(str(result)[:-1])
+ <?xml version="1.0"?>
+ <test>hoi-TEST</test>
+
+We can see that the ``hoi`` resolver was called to generate a document that
+was then inserted into the result document by the XSLT transformation. Note
+that this is completely independent of the XML file you transform, as the URI
+is resolved from within the stylesheet context:
+
+.. sourcecode:: pycon
+
+ >>> result = transform(normal_doc)
+ Resolved url hoi:test as prefix hoi
+ >>> print(str(result)[:-1])
+ <?xml version="1.0"?>
+ <test>hoi-TEST</test>
+
+It may be seen as a matter of taste what resolvers the generated document
+inherits. For XSLT, the output document inherits the resolvers of the input
+document and not those of the stylesheet. Therefore, the last result does not
+inherit any resolvers at all.
+
+
+I/O access control in XSLT
+--------------------------
+
+By default, XSLT supports all extension functions from libxslt and libexslt as
+well as Python regular expressions through EXSLT. Some extensions enable
+style sheets to read and write files on the local file system.
+
+XSLT has a mechanism to control the access to certain I/O operations during
+the transformation process. This is most interesting where XSL scripts come
+from potentially insecure sources and must be prevented from modifying the
+local file system. Note, however, that there is no way to keep them from
+eating up your precious CPU time, so this should not stop you from thinking
+about what XSLT you execute.
+
+Access control is configured using the ``XSLTAccessControl`` class. It can be
+called with a number of keyword arguments that allow or deny specific
+operations:
+
+.. sourcecode:: pycon
+
+ >>> transform = etree.XSLT(honk_doc)
+ Resolved url honk:test as prefix honk
+ >>> result = transform(normal_doc)
+ Resolved url hoi:test as prefix hoi
+
+ >>> ac = etree.XSLTAccessControl(read_network=False)
+ >>> transform = etree.XSLT(honk_doc, access_control=ac)
+ Resolved url honk:test as prefix honk
+ >>> result = transform(normal_doc)
+ Traceback (most recent call last):
+ ...
+ lxml.etree.XSLTApplyError: xsltLoadDocument: read rights for hoi:test denied
+
+There are a few things to keep in mind:
+
+* XSL parsing (``xsl:import``, etc.) is not affected by this mechanism
+* ``read_file=False`` does not imply ``write_file=False``; all controls are
+  independent (see the sketch after this list).
+* ``read_file`` only applies to files in the file system. Any other scheme
+ for URLs is controlled by the ``*_network`` keywords.
+* If you need more fine-grained control than switching access on and off, you
+ should consider writing a custom document loader that returns empty
+ documents or raises exceptions if access is denied.
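+
+For example, a deny-by-default configuration that only permits local
+file reads could look like this (a sketch; ``stylesheet_doc`` stands
+for a parsed stylesheet as in the examples above):
+
+.. sourcecode:: python
+
+    ac = etree.XSLTAccessControl(
+        read_file=True, write_file=False, create_dir=False,
+        read_network=False, write_network=False)
+    transform = etree.XSLT(stylesheet_doc, access_control=ac)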
diff --git a/doc/rest2html.py b/doc/rest2html.py
new file mode 100755
index 0000000..6438df3
--- /dev/null
+++ b/doc/rest2html.py
@@ -0,0 +1,63 @@
+#!/usr/bin/python
+
+"""
+A minimal front end to the Docutils Publisher, producing HTML with
+Pygments syntax highlighting.
+"""
+
+# Set to True if you want inline CSS styles instead of classes
+INLINESTYLES = False
+
+
+try:
+ import locale
+ locale.setlocale(locale.LC_ALL, '')
+except:
+ pass
+
+# set up Pygments
+
+from pygments.formatters import HtmlFormatter
+
+# The default formatter
+DEFAULT = HtmlFormatter(noclasses=INLINESTYLES, cssclass='syntax')
+
+# Add name -> formatter pairs for every variant you want to use
+VARIANTS = {
+ # 'linenos': HtmlFormatter(noclasses=INLINESTYLES, linenos=True),
+}
+
+
+from docutils import nodes
+from docutils.parsers.rst import directives
+
+from pygments import highlight
+from pygments.lexers import get_lexer_by_name, TextLexer
+
+def pygments_directive(name, arguments, options, content, lineno,
+ content_offset, block_text, state, state_machine):
+ try:
+ lexer = get_lexer_by_name(arguments[0])
+ except ValueError:
+ # no lexer found - use the text one instead of an exception
+ lexer = TextLexer()
+ # take an arbitrary option if more than one is given
+ formatter = options and VARIANTS[options.keys()[0]] or DEFAULT
+ parsed = highlight(u'\n'.join(content), lexer, formatter)
+ return [nodes.raw('', parsed, format='html')]
+
+pygments_directive.arguments = (1, 0, 1)
+pygments_directive.content = 1
+pygments_directive.options = dict([(key, directives.flag) for key in VARIANTS])
+
+directives.register_directive('sourcecode', pygments_directive)
+
+
+# run the generation
+
+from docutils.core import publish_cmdline, default_description
+
+description = ('Generates (X)HTML documents from standalone reStructuredText '
+ 'sources. ' + default_description)
+
+publish_cmdline(writer_name='html', description=description)
diff --git a/doc/rest2latex.py b/doc/rest2latex.py
new file mode 100644
index 0000000..92d3e3b
--- /dev/null
+++ b/doc/rest2latex.py
@@ -0,0 +1,66 @@
+#!/usr/bin/python
+
+# Testing:
+# python rest2latex.py objectify.txt > latex/objectify.tex
+
+"""
+A minimal front end to the Docutils Publisher, producing LaTeX with
+some syntax highlighting.
+"""
+
+# Set to True if you want inline CSS styles instead of classes
+INLINESTYLES = False
+
+
+try:
+ import locale
+ locale.setlocale(locale.LC_ALL, '')
+except:
+ pass
+
+# set up Pygments
+
+from pygments.formatters import LatexFormatter
+
+# The default formatter
+DEFAULT = LatexFormatter()
+
+# Add name -> formatter pairs for every variant you want to use
+VARIANTS = {
+ # 'linenos': HtmlFormatter(noclasses=INLINESTYLES, linenos=True),
+}
+
+
+from docutils import nodes
+from docutils.parsers.rst import directives
+
+from pygments import highlight
+from pygments.lexers import get_lexer_by_name, TextLexer
+
+def pygments_directive(name, arguments, options, content, lineno,
+ content_offset, block_text, state, state_machine):
+ try:
+ lexer = get_lexer_by_name(arguments[0])
+ except ValueError as e:
+ # no lexer found - use the text one instead of an exception
+ lexer = TextLexer()
+ # take an arbitrary option if more than one is given
+ formatter = options and VARIANTS[options.keys()[0]] or DEFAULT
+ parsed = highlight(u'\n'.join(content), lexer, formatter)
+ return [nodes.raw('', parsed, format='latex')]
+
+pygments_directive.arguments = (1, 0, 1)
+pygments_directive.content = 1
+pygments_directive.options = dict([(key, directives.flag) for key in VARIANTS])
+
+directives.register_directive('sourcecode', pygments_directive)
+
+
+# run the generation
+
+from docutils.core import publish_cmdline, default_description
+
+description = ('Generates LaTeX documents from standalone reStructuredText '
+ 'sources. ' + default_description)
+
+publish_cmdline(writer_name='latex2e', description=description)
diff --git a/doc/s5/Makefile b/doc/s5/Makefile
new file mode 100644
index 0000000..d08e710
--- /dev/null
+++ b/doc/s5/Makefile
@@ -0,0 +1,11 @@
+PYTHON?=python
+
+SLIDES=$(subst .txt,.html,$(wildcard *.txt))
+
+slides: $(SLIDES)
+
+%.html: %.txt
+ $(PYTHON) rst2s5.py --current-slide --language=en $< $@
+
+clean:
+ rm -f *~ $(SLIDES)
diff --git a/doc/s5/ep2008/atom-example.xml b/doc/s5/ep2008/atom-example.xml
new file mode 100644
index 0000000..18ab87a
--- /dev/null
+++ b/doc/s5/ep2008/atom-example.xml
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="utf-8"?>
+<feed xmlns="http://www.w3.org/2005/Atom">
+
+ <title>Example Feed</title>
+ <link href="http://example.org/"/>
+ <updated>2003-12-13T18:30:02Z</updated>
+ <author>
+ <name>John Doe</name>
+ </author>
+ <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>
+
+ <entry>
+ <title>Atom-Powered Robots Run Amok</title>
+ <link href="http://example.org/2003/12/13/atom03"/>
+ <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
+ <updated>2003-12-13T18:30:02Z</updated>
+ <summary>Some text.</summary>
+ </entry>
+
+</feed>
diff --git a/doc/s5/ep2008/atom.py b/doc/s5/ep2008/atom.py
new file mode 100644
index 0000000..d45462a
--- /dev/null
+++ b/doc/s5/ep2008/atom.py
@@ -0,0 +1,626 @@
+# ET is 80's!
+#import elementtree as etree
+# LXML is 00's!
+from lxml import etree
+from lxml.etree import tostring
+#from dateutil.parser import parse as parse_date
+from datetime import datetime
+import uuid
+import cgi
+import copy
+
+__all__ = [
+ 'ATOM', 'atom_ns', 'Element', 'tostring']
+
+ATOM_NAMESPACE = atom_ns = 'http://www.w3.org/2005/Atom'
+app_ns = 'http://www.w3.org/2007/app'
+xhtml_ns = 'http://www.w3.org/1999/xhtml'
+
+nsmap = {'': atom_ns, 'app': app_ns}
+
+_rel_alternate_xpath = etree.XPath(
+ "./atom:link[not(@rel) or @rel = 'alternate']",
+ namespaces=dict(atom=atom_ns))
+_rel_other_xpath = etree.XPath(
+ "./atom:link[@rel = $rel]",
+ namespaces=dict(atom=atom_ns))
+
+
+
+class AtomLookup(etree.CustomElementClassLookup):
+ _elements = {}
+ _app_elements = {}
+
+ def lookup(self, node_type, document, namespace, name):
+ if node_type == 'element':
+ if namespace == atom_ns:
+ return self._elements.get(name, AtomElement)
+ elif namespace == app_ns:
+ return self._app_elements.get(name, APPElement)
+ ## FIXME: is this default good?
+ return AtomElement
+ # Otherwise normal lookup
+ return None
+
+atom_parser = etree.XMLParser()
+atom_parser.setElementClassLookup(AtomLookup())
+
+def parse(input):
+ return etree.parse(input, atom_parser)
+
+def ATOM(atom):
+ """
+ Parse an Atom document
+ """
+ return etree.XML(atom, atom_parser)
+
+def Element(tag, *args, **kw):
+ """
+ Create an Atom element. Adds the Atom namespace if no namespace
+ is given.
+ """
+ if '{' not in tag:
+ # No namespace means the atom namespace
+ tag = '{%s}%s' % (atom_ns, tag)
+ return atom_parser.makeelement(tag, *args, **kw)
+
+def _strftime(d):
+ """
+ Format a date the way Atom likes it (RFC3339?)
+ """
+ return d.strftime('%Y-%m-%dT%H:%M:%SZ%z')
+
+## try:
+## from lxml import builder
+## except ImportError:
+## pass
+## else:
+## E = builder.ElementMaker(parser=atom_parser,
+## typemap={datetime: lambda e, v: _strftime(v)})
+from lxml import builder
+E = builder.ElementMaker(#parser=atom_parser,
+ typemap={datetime: lambda e, v: _strftime(v)})
+__all__.append('E')
+
+class NoDefault:
+ pass
+
+class _LiveList(list):
+ """
+ This list calls on_add or on_remove whenever the list is modified.
+ """
+ on_add = on_remove = None
+ name = None
+ def __init__(self, *args, **kw):
+ on_add = on_remove = name = None
+ if 'on_add' in kw:
+ on_add = kw.pop('on_add')
+ if 'on_remove' in kw:
+ on_remove = kw.pop('on_remove')
+ if 'name' in kw:
+ name = kw.pop('name')
+ list.__init__(self, *args, **kw)
+ self.on_add = on_add
+ self.on_remove = on_remove
+ self.name = name
+ def _make_list(self, obj):
+ if not isinstance(obj, (list, tuple)):
+ obj = list(obj)
+ return obj
+ def _do_add(self, items):
+ if self.on_add is not None:
+ for item in items:
+ self.on_add(self, item)
+ def _do_remove(self, items):
+ if self.on_remove is not None:
+ for item in items:
+ self.on_remove(self, item)
+ def __setslice__(self, i, j, other):
+ other = self._make_list(other)
+ old = self[i:j]
+ list.__setslice__(self, i, j, other)
+ self._do_remove(old)
+ self._do_add(other)
+ def __delslice__(self, i, j):
+ old = self[i:j]
+ list.__delslice__(self, i, j)
+ self._do_remove(old)
+ def __iadd__(self, other):
+ other = self._make_list(other)
+ list.__iadd__(self, other)
+ self._do_add(other)
+ def __imul__(self, n):
+ while n > 0:
+ self += self
+ n -= 1
+ def append(self, item):
+ list.append(self, item)
+ self._do_add([item])
+ def insert(self, i, item):
+ list.insert(self, i, item)
+ self._do_add([item])
+ def pop(self, i=-1):
+ item = self[i]
+ result = list.pop(self, i)
+ self._do_remove([item])
+ return result
+ def remove(self, item):
+ list.remove(self, item)
+ self._do_remove([item])
+ def extend(self, other):
+ for item in other:
+ self.append(item)
+ def __repr__(self):
+ name = self.name
+ if name is None:
+ name = '_LiveList'
+ return '%s(%s)' % (name, list.__repr__(self))
+
+class _findall_property(object):
+ """
+ Returns a LiveList of all the objects with the given tag. You can
+ append or remove items to the list to add or remove them from the
+ containing tag.
+ """
+
+ def __init__(self, tag, ns=atom_ns):
+ self.tag = tag
+ self.ns = ns
+ self.__doc__ = 'Return live list of all the <atom:%s> element' % self.tag
+ def __get__(self, obj, type=None):
+ if obj is None:
+ return self
+ def add(lst, item):
+ # FIXME: shouldn't just be an append
+ obj.append(item)
+ def remove(lst, item):
+ obj.remove(item)
+ return _LiveList(obj._atom_iter(self.tag, ns=self.ns),
+ on_add=add, on_remove=remove,
+ name='live_%s_list' % self.tag)
+ def __set__(self, obj, value):
+ cur = self.__get__(obj)
+ cur[:] = value
+
+class _text_element_property(object):
+ """
+ Creates an attribute that returns the text content of the given
+ subelement. E.g., ``title = _text_element_property('title')``
+ will make ``obj.title`` return the contents of the ``<title>``.
+ Similarly setting the attribute sets the text content of the
+ attribute.
+ """
+
+ def __init__(self, tag, strip=True):
+ self.tag = tag
+ self.strip = strip
+ self.__doc__ = 'Access the <atom:%s> element as text' % self.tag
+ def __get__(self, obj, type=None):
+ if obj is None:
+ return self
+ v = obj._atom_findtext(self.tag)
+ if self.strip:
+ if v is not None:
+ v = v.strip()
+ else:
+ return ''
+ return v
+ def __set__(self, obj, value):
+ el = obj._get_or_create(self.tag)
+ el.text = value
+ def __delete__(self, obj):
+ el = obj._atom_get(self.tag)
+ if el:
+ # FIXME: should it be an error if it doesn't exist?
+ obj.remove(el)
+
+class _element_property(object):
+ """
+ Returns a single subelement based on tag. Setting the attribute
+ removes the element and adds a new one. Deleting it removes the
+ element.
+
+ """
+ def __init__(self, tag):
+ self.tag = tag
+ self.__doc__ = 'Get the <atom:%s> element' % self.tag
+ def __get__(self, obj, type=None):
+ if obj is None:
+ return self
+ return obj._atom_get(self.tag)
+ def __set__(self, obj, value):
+ el = obj._atom_get(self.tag)
+ if el is not None:
+ parent = el.getparent()
+ index = parent.index(el)
+ parent[index] = value
+ else:
+ obj.append(value)
+    def __delete__(self, obj):
+ el = obj._atom_get(self.tag)
+ if el is not None:
+ obj.remove(el)
+
+class _attr_element_property(object):
+ """
+ Get/set the value of the attribute on this element.
+ """
+
+ def __init__(self, attr, default=NoDefault):
+ self.attr = attr
+ self.default = default
+ self.__doc__ = 'Access the %s attribute' % self.attr
+ def __get__(self, obj, type=None):
+ if obj is None:
+ return self
+ try:
+ return obj.attrib[self.attr]
+ except KeyError:
+ if self.default is not NoDefault:
+ return self.default
+ raise AttributeError(self.attr)
+ def __set__(self, obj, value):
+ if value is None:
+ self.__delete__(obj)
+ else:
+ obj.attrib[self.attr] = value
+ def __delete__(self, obj):
+ if self.attr in obj.attrib:
+ del obj.attrib[self.attr]
+
+class _date_element_property(object):
+ """
+ Get/set the parsed date value of the text content of a tag.
+ """
+
+ def __init__(self, tag, ns=atom_ns):
+ self.tag = tag
+ self.ns = ns
+ self.__doc__ = 'Access the date in %s' % self.tag
+ def __get__(self, obj, type=None):
+ if obj is None:
+ return self
+ el = obj._atom_get(self.tag, ns=self.ns)
+ if el is None:
+ return None
+ return el.date
+ def __set__(self, obj, value):
+ el = obj._get_or_create(self.tag, ns=self.ns)
+ el.date = value
+    def __delete__(self, obj):
+ el = obj._atom_get(self.tag)
+ if el is not None:
+ obj.remove(el)
+
+class _date_text_property(object):
+
+ def __get__(self, obj, type=None):
+ if obj is None:
+ return self
+ return parse_date(obj.text)
+ def __set__(self, obj, value):
+ if not value:
+ obj.text = None
+ return
+ if isinstance(value, datetime):
+ value = _strftime(value)
+ obj.text = value
+ def __del__(self, obj):
+ obj.text = None
+
+class AtomElement(etree.ElementBase):
+ def _get_or_create(self, tag, ns=atom_ns):
+ el = self.find('{%s}%s' % (ns, tag))
+ if el is None:
+ el = self.makeelement('{%s}%s' % (ns, tag))
+ self.append(el)
+ return el
+
+ def _atom_get(self, tag, ns=atom_ns):
+ for item in self._atom_iter(tag, ns=ns):
+ return item
+ return None
+
+ def _atom_iter(self, tag, ns=atom_ns):
+ return self.getiterator('{%s}%s' % (ns, tag))
+
+ def _atom_findtext(self, tag, ns=atom_ns):
+ return self.findtext('{%s}%s' % (ns, tag))
+
+ def _get_parent(self, tag, ns=atom_ns):
+ parent = self
+ while 1:
+ if parent.tag == '{%s}%s' % (ns, tag):
+ return parent
+ parent = parent.getparent()
+ if parent is None:
+ return None
+
+ @property
+ def feed(self):
+ return self._get_parent('feed')
+
+ def rel_links(self, rel='alternate'):
+ """
+ Return all the links with the given ``rel`` attribute. The
+ default relation is ``'alternate'``, and as specified for Atom
+ links with no ``rel`` attribute are assumed to mean alternate.
+ """
+ if rel is None:
+ return self._atom_iter('link')
+ return [
+ el for el in self._atom_iter('link')
+ if el.get('rel') == rel
+ or rel == 'alternate' and not el.get('rel')]
+
+ def __repr__(self):
+ tag = self.tag
+ if '}' in tag:
+ tag = tag.split('}', 1)[1]
+ return '<%s.%s atom:%s at %s>' % (
+ self.__class__.__module__,
+ self.__class__.__name__,
+ tag,
+ hex(abs(id(self)))[2:])
+
+class Feed(AtomElement):
+ """
+ For ``<feed>`` elements.
+ """
+
+ @property
+ def feed(self):
+ return self
+
+ entries = _findall_property('entry')
+ title = _text_element_property('title')
+ author = _element_property('author')
+
+class Entry(AtomElement):
+ """
+ For ``<entry>`` elements.
+ """
+
+ @property
+ def entry(self):
+ return self
+ id = _text_element_property('id')
+ title = _text_element_property('title')
+ published = _date_element_property('published')
+ updated = _date_element_property('updated')
+ edited = _date_element_property('edited', ns=app_ns)
+ def update_edited(self):
+ """
+ Set app:edited to current time
+ """
+ self.edited = datetime.utcnow()
+ def update_updated(self):
+ """
+ Set atom:updated to the current time
+ """
+ self.updated = datetime.utcnow()
+ def make_id(self):
+ """
+ Create an artificial id for this entry
+ """
+ assert not self.id, (
+ "You cannot make an id if one already exists")
+ self.id = 'uuid:%s' % uuid.uuid4()
+ def author__get(self):
+ el = self._atom_get('author')
+ if el is None:
+ if self.feed is not None:
+ return self.feed.author
+ return el
+ def author__set(self, value):
+ el = self._atom_get('author')
+ if el is not None:
+ self.remove(el)
+ self.append(value)
+ def author__del(self):
+ el = self._atom_get('author')
+ if el is not None:
+ self.remove(el)
+ author = property(author__get, author__set, author__del)
+
+ categories = _findall_property('category')
+
+class _EntryElement(AtomElement):
+ @property
+ def entry(self):
+ return self._get_parent('entry')
+
+class Category(_EntryElement):
+ """
+ For ``<category>`` elements.
+ """
+ term = _attr_element_property('term')
+ scheme = _attr_element_property('scheme', None)
+ label = _attr_element_property('label', None)
+
+ def as_string(self):
+ """
+ Returns the string representation of the category, using the
+ GData convention of ``{scheme}term``
+ """
+ if self.scheme is not None:
+ return '{%s}%s' % (self.scheme, self.term)
+ else:
+ return self.term
+
+class PersonElement(_EntryElement):
+ """
+ Represents authors and contributors
+ """
+
+ email = _text_element_property('email')
+ uri = _text_element_property('uri')
+ name = _text_element_property('name')
+
+class DateElement(_EntryElement):
+ """
+ For elements that contain a date in their text content.
+ """
+ date = _date_text_property()
+
+class TextElement(_EntryElement):
+
+ type = _attr_element_property('type', None)
+ src = _attr_element_property('src', None)
+
+ def _html__get(self):
+ """
+ Gives the parsed HTML of element's content. May return an
+ HtmlElement (from lxml.html) or an XHTML tree. If the element
+ is ``type="text"`` then it is returned as quoted HTML.
+
+ You can also set this attribute to either an lxml.html
+ element, an XHTML element, or an HTML string.
+
+ Raises AttributeError if this is not HTML content.
+ """
+ ## FIXME: should this handle text/html types?
+ if self.type == 'html':
+ content = self.text
+ elif self.type == 'text':
+ content = cgi.escape(self.text)
+ elif self.type == 'xhtml':
+ div = copy.deepcopy(self[0])
+ # Now remove the namespaces:
+ for el in div.getiterator():
+ if el.tag.startswith('{'):
+ el.tag = el.tag.split('}', 1)[1]
+ if div.tag.startswith('{'):
+                div.tag = div.tag.split('}', 1)[1]
+ from lxml.html import tostring
+ content = tostring(div)
+ else:
+ raise AttributeError(
+ "Not an HTML or text content (type=%r)" % self.type)
+ from lxml.html import fromstring
+ return fromstring(content)
+
+ def _html__set(self, value):
+ if value is None:
+ del self.html
+ return
+ if isinstance(value, basestring):
+ # Some HTML text
+ self.type = 'html'
+ self.text = value
+ return
+ if value.tag.startswith('{%s}' % xhtml_ns):
+ if value.tag != '{%s}div' % xhtml_ns:
+ # Need to wrap it in a <div>
+ el = self.makeelement('{%s}div' % xhtml_ns)
+ el.append(value)
+ value = el
+ self[:] = []
+ self.type = 'xhtml'
+ self.append(value)
+ return
+ from lxml import html
+ if isinstance(value, html.HtmlElement):
+ value = tostring(value)
+ self[:] = []
+ self.type = 'html'
+ self.text = value
+ return
+ raise TypeError(
+ "Unknown HTML type: %s" % type(value))
+
+ def _html__del(self):
+ self.text = None
+
+ html = property(_html__get, _html__set, _html__del, doc=_html__get.__doc__)
+
+ def _binary__get(self):
+ """
+ Gets/sets the binary content, which is base64 encoded in the
+ text.
+ """
+ text = self.text
+ if text is None:
+ raise AttributeError(
+ "No text (maybe in src?)")
+ text = text.decode('base64')
+ return text
+
+ def _binary__set(self, value):
+ if isinstance(value, unicode):
+ ## FIXME: is this kosher?
+ value = value.encode('utf8')
+ if not isinstance(value, str):
+ raise TypeError(
+ "Must set .binary to a str or unicode object (not %s)"
+ % type(value))
+ value = value.encode('base64')
+ self.text = value
+
+ def _binary__del(self):
+ self.text = None
+
+ binary = property(_binary__get, _binary__set, _binary__del, doc=_binary__get.__doc__)
+
+
+class LinkElement(_EntryElement):
+ """
+ For ``<link>`` elements.
+ """
+ href = _attr_element_property('href', None)
+ rel = _attr_element_property('rel', None)
+ type = _attr_element_property('type', None)
+ title = _attr_element_property('title', None)
+
+ def __repr__(self):
+ return '<%s.%s at %s rel=%r href=%r>' % (
+ self.__class__.__module__,
+ self.__class__.__name__,
+ hex(abs(id(self)))[2:],
+ self.rel, self.href)
+
+AtomLookup._elements.update(dict(
+ feed=Feed,
+ entry=Entry,
+ category=Category,
+ author=PersonElement,
+ contributor=PersonElement,
+ published=DateElement,
+ updated=DateElement,
+ content=TextElement,
+ summary=TextElement,
+ title=TextElement,
+ rights=TextElement,
+ subtitle=TextElement,
+ link=LinkElement,
+ ))
+
+class APPElement(etree.ElementBase):
+ def __repr__(self):
+ tag = self.tag
+ if '}' in tag:
+ tag = tag.split('}', 1)[1]
+ return '<%s.%s app:%s at %s>' % (
+ self.__class__.__module__,
+ self.__class__.__name__,
+ tag,
+ hex(abs(id(self)))[2:])
+
+class Service(APPElement):
+ workspaces = _findall_property('workspace', ns=app_ns)
+
+class Workspace(APPElement):
+ collections = _findall_property('collection', ns=app_ns)
+
+class Collection(APPElement):
+ pass
+
+class Edited(APPElement):
+ date = _date_text_property()
+
+AtomLookup._app_elements.update(dict(
+ service=Service,
+ workspace=Workspace,
+ collection=Collection,
+ edited=Edited,
+ ))
diff --git a/doc/s5/ep2008/atom.rng b/doc/s5/ep2008/atom.rng
new file mode 100644
index 0000000..cfa493b
--- /dev/null
+++ b/doc/s5/ep2008/atom.rng
@@ -0,0 +1,597 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ -*- rnc -*-
+ RELAX NG Compact Syntax Grammar for the
+ Atom Format Specification Version 11
+-->
+<grammar xmlns:xhtml="http://www.w3.org/1999/xhtml" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:s="http://www.ascc.net/xml/schematron" xmlns="http://relaxng.org/ns/structure/1.0" datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes">
+ <start>
+ <choice>
+ <ref name="atomFeed"/>
+ <ref name="atomEntry"/>
+ </choice>
+ </start>
+ <!-- Common attributes -->
+ <define name="atomCommonAttributes">
+ <optional>
+ <attribute name="xml:base">
+ <ref name="atomUri"/>
+ </attribute>
+ </optional>
+ <optional>
+ <attribute name="xml:lang">
+ <ref name="atomLanguageTag"/>
+ </attribute>
+ </optional>
+ <zeroOrMore>
+ <ref name="undefinedAttribute"/>
+ </zeroOrMore>
+ </define>
+ <!-- Text Constructs -->
+ <define name="atomPlainTextConstruct">
+ <ref name="atomCommonAttributes"/>
+ <optional>
+ <attribute name="type">
+ <choice>
+ <value>text</value>
+ <value>html</value>
+ </choice>
+ </attribute>
+ </optional>
+ <text/>
+ </define>
+ <define name="atomXHTMLTextConstruct">
+ <ref name="atomCommonAttributes"/>
+ <attribute name="type">
+ <value>xhtml</value>
+ </attribute>
+ <ref name="xhtmlDiv"/>
+ </define>
+ <define name="atomTextConstruct">
+ <choice>
+ <ref name="atomPlainTextConstruct"/>
+ <ref name="atomXHTMLTextConstruct"/>
+ </choice>
+ </define>
+ <!-- Person Construct -->
+ <define name="atomPersonConstruct">
+ <ref name="atomCommonAttributes"/>
+ <interleave>
+ <element name="atom:name">
+ <text/>
+ </element>
+ <optional>
+ <element name="atom:uri">
+ <ref name="atomUri"/>
+ </element>
+ </optional>
+ <optional>
+ <element name="atom:email">
+ <ref name="atomEmailAddress"/>
+ </element>
+ </optional>
+ <zeroOrMore>
+ <ref name="extensionElement"/>
+ </zeroOrMore>
+ </interleave>
+ </define>
+ <!-- Date Construct -->
+ <define name="atomDateConstruct">
+ <ref name="atomCommonAttributes"/>
+ <data type="dateTime"/>
+ </define>
+ <!-- atom:feed -->
+ <define name="atomFeed">
+ <element name="atom:feed">
+ <s:rule context="atom:feed">
+ <s:assert test="atom:author or not(atom:entry[not(atom:author)])">An atom:feed must have an atom:author unless all of its atom:entry children have an atom:author.</s:assert>
+ </s:rule>
+ <ref name="atomCommonAttributes"/>
+ <interleave>
+ <zeroOrMore>
+ <ref name="atomAuthor"/>
+ </zeroOrMore>
+ <zeroOrMore>
+ <ref name="atomCategory"/>
+ </zeroOrMore>
+ <zeroOrMore>
+ <ref name="atomContributor"/>
+ </zeroOrMore>
+ <optional>
+ <ref name="atomGenerator"/>
+ </optional>
+ <optional>
+ <ref name="atomIcon"/>
+ </optional>
+ <ref name="atomId"/>
+ <zeroOrMore>
+ <ref name="atomLink"/>
+ </zeroOrMore>
+ <optional>
+ <ref name="atomLogo"/>
+ </optional>
+ <optional>
+ <ref name="atomRights"/>
+ </optional>
+ <optional>
+ <ref name="atomSubtitle"/>
+ </optional>
+ <ref name="atomTitle"/>
+ <ref name="atomUpdated"/>
+ <zeroOrMore>
+ <ref name="extensionElement"/>
+ </zeroOrMore>
+ </interleave>
+ <zeroOrMore>
+ <ref name="atomEntry"/>
+ </zeroOrMore>
+ </element>
+ </define>
+ <!-- atom:entry -->
+ <define name="atomEntry">
+ <element name="atom:entry">
+ <s:rule context="atom:entry">
+ <s:assert test="atom:link[@rel='alternate'] or atom:link[not(@rel)] or atom:content">An atom:entry must have at least one atom:link element with a rel attribute of 'alternate' or an atom:content.</s:assert>
+ </s:rule>
+ <s:rule context="atom:entry">
+ <s:assert test="atom:author or ../atom:author or atom:source/atom:author">An atom:entry must have an atom:author if its feed does not.</s:assert>
+ </s:rule>
+ <ref name="atomCommonAttributes"/>
+ <interleave>
+ <zeroOrMore>
+ <ref name="atomAuthor"/>
+ </zeroOrMore>
+ <zeroOrMore>
+ <ref name="atomCategory"/>
+ </zeroOrMore>
+ <optional>
+ <ref name="atomContent"/>
+ </optional>
+ <zeroOrMore>
+ <ref name="atomContributor"/>
+ </zeroOrMore>
+ <ref name="atomId"/>
+ <zeroOrMore>
+ <ref name="atomLink"/>
+ </zeroOrMore>
+ <optional>
+ <ref name="atomPublished"/>
+ </optional>
+ <optional>
+ <ref name="atomRights"/>
+ </optional>
+ <optional>
+ <ref name="atomSource"/>
+ </optional>
+ <optional>
+ <ref name="atomSummary"/>
+ </optional>
+ <ref name="atomTitle"/>
+ <ref name="atomUpdated"/>
+ <zeroOrMore>
+ <ref name="extensionElement"/>
+ </zeroOrMore>
+ </interleave>
+ </element>
+ </define>
+ <!-- atom:content -->
+ <define name="atomInlineTextContent">
+ <element name="atom:content">
+ <ref name="atomCommonAttributes"/>
+ <optional>
+ <attribute name="type">
+ <choice>
+ <value>text</value>
+ <value>html</value>
+ </choice>
+ </attribute>
+ </optional>
+ <zeroOrMore>
+ <text/>
+ </zeroOrMore>
+ </element>
+ </define>
+ <define name="atomInlineXHTMLContent">
+ <element name="atom:content">
+ <ref name="atomCommonAttributes"/>
+ <attribute name="type">
+ <value>xhtml</value>
+ </attribute>
+ <ref name="xhtmlDiv"/>
+ </element>
+ </define>
+ <define name="atomInlineOtherContent">
+ <element name="atom:content">
+ <ref name="atomCommonAttributes"/>
+ <optional>
+ <attribute name="type">
+ <ref name="atomMediaType"/>
+ </attribute>
+ </optional>
+ <zeroOrMore>
+ <choice>
+ <text/>
+ <ref name="anyElement"/>
+ </choice>
+ </zeroOrMore>
+ </element>
+ </define>
+ <define name="atomOutOfLineContent">
+ <element name="atom:content">
+ <ref name="atomCommonAttributes"/>
+ <optional>
+ <attribute name="type">
+ <ref name="atomMediaType"/>
+ </attribute>
+ </optional>
+ <attribute name="src">
+ <ref name="atomUri"/>
+ </attribute>
+ <empty/>
+ </element>
+ </define>
+ <define name="atomContent">
+ <choice>
+ <ref name="atomInlineTextContent"/>
+ <ref name="atomInlineXHTMLContent"/>
+ <ref name="atomInlineOtherContent"/>
+ <ref name="atomOutOfLineContent"/>
+ </choice>
+ </define>
+ <!-- atom:author -->
+ <define name="atomAuthor">
+ <element name="atom:author">
+ <ref name="atomPersonConstruct"/>
+ </element>
+ </define>
+ <!-- atom:category -->
+ <define name="atomCategory">
+ <element name="atom:category">
+ <ref name="atomCommonAttributes"/>
+ <attribute name="term"/>
+ <optional>
+ <attribute name="scheme">
+ <ref name="atomUri"/>
+ </attribute>
+ </optional>
+ <optional>
+ <attribute name="label"/>
+ </optional>
+ <ref name="undefinedContent"/>
+ </element>
+ </define>
+ <!-- atom:contributor -->
+ <define name="atomContributor">
+ <element name="atom:contributor">
+ <ref name="atomPersonConstruct"/>
+ </element>
+ </define>
+ <!-- atom:generator -->
+ <define name="atomGenerator">
+ <element name="atom:generator">
+ <ref name="atomCommonAttributes"/>
+ <optional>
+ <attribute name="uri">
+ <ref name="atomUri"/>
+ </attribute>
+ </optional>
+ <optional>
+ <attribute name="version"/>
+ </optional>
+ <text/>
+ </element>
+ </define>
+ <!-- atom:icon -->
+ <define name="atomIcon">
+ <element name="atom:icon">
+ <ref name="atomCommonAttributes"/>
+ <ref name="atomUri"/>
+ </element>
+ </define>
+ <!-- atom:id -->
+ <define name="atomId">
+ <element name="atom:id">
+ <ref name="atomCommonAttributes"/>
+ <ref name="atomUri"/>
+ </element>
+ </define>
+ <!-- atom:logo -->
+ <define name="atomLogo">
+ <element name="atom:logo">
+ <ref name="atomCommonAttributes"/>
+ <ref name="atomUri"/>
+ </element>
+ </define>
+ <!-- atom:link -->
+ <define name="atomLink">
+ <element name="atom:link">
+ <ref name="atomCommonAttributes"/>
+ <attribute name="href">
+ <ref name="atomUri"/>
+ </attribute>
+ <optional>
+ <attribute name="rel">
+ <choice>
+ <ref name="atomNCName"/>
+ <ref name="atomUri"/>
+ </choice>
+ </attribute>
+ </optional>
+ <optional>
+ <attribute name="type">
+ <ref name="atomMediaType"/>
+ </attribute>
+ </optional>
+ <optional>
+ <attribute name="hreflang">
+ <ref name="atomLanguageTag"/>
+ </attribute>
+ </optional>
+ <optional>
+ <attribute name="title"/>
+ </optional>
+ <optional>
+ <attribute name="length"/>
+ </optional>
+ <ref name="undefinedContent"/>
+ </element>
+ </define>
+ <!-- atom:published -->
+ <define name="atomPublished">
+ <element name="atom:published">
+ <ref name="atomDateConstruct"/>
+ </element>
+ </define>
+ <!-- atom:rights -->
+ <define name="atomRights">
+ <element name="atom:rights">
+ <ref name="atomTextConstruct"/>
+ </element>
+ </define>
+ <!-- atom:source -->
+ <define name="atomSource">
+ <element name="atom:source">
+ <ref name="atomCommonAttributes"/>
+ <interleave>
+ <zeroOrMore>
+ <ref name="atomAuthor"/>
+ </zeroOrMore>
+ <zeroOrMore>
+ <ref name="atomCategory"/>
+ </zeroOrMore>
+ <zeroOrMore>
+ <ref name="atomContributor"/>
+ </zeroOrMore>
+ <optional>
+ <ref name="atomGenerator"/>
+ </optional>
+ <optional>
+ <ref name="atomIcon"/>
+ </optional>
+ <optional>
+ <ref name="atomId"/>
+ </optional>
+ <zeroOrMore>
+ <ref name="atomLink"/>
+ </zeroOrMore>
+ <optional>
+ <ref name="atomLogo"/>
+ </optional>
+ <optional>
+ <ref name="atomRights"/>
+ </optional>
+ <optional>
+ <ref name="atomSubtitle"/>
+ </optional>
+ <optional>
+ <ref name="atomTitle"/>
+ </optional>
+ <optional>
+ <ref name="atomUpdated"/>
+ </optional>
+ <zeroOrMore>
+ <ref name="extensionElement"/>
+ </zeroOrMore>
+ </interleave>
+ </element>
+ </define>
+ <!-- atom:subtitle -->
+ <define name="atomSubtitle">
+ <element name="atom:subtitle">
+ <ref name="atomTextConstruct"/>
+ </element>
+ </define>
+ <!-- atom:summary -->
+ <define name="atomSummary">
+ <element name="atom:summary">
+ <ref name="atomTextConstruct"/>
+ </element>
+ </define>
+ <!-- atom:title -->
+ <define name="atomTitle">
+ <element name="atom:title">
+ <ref name="atomTextConstruct"/>
+ </element>
+ </define>
+ <!-- atom:updated -->
+ <define name="atomUpdated">
+ <element name="atom:updated">
+ <ref name="atomDateConstruct"/>
+ </element>
+ </define>
+ <!-- Low-level simple types -->
+ <define name="atomNCName">
+ <data type="string">
+ <param name="minLength">1</param>
+ <param name="pattern">[^:]*</param>
+ </data>
+ </define>
+ <!-- Whatever a media type is, it contains at least one slash -->
+ <define name="atomMediaType">
+ <data type="string">
+ <param name="pattern">.+/.+</param>
+ </data>
+ </define>
+ <!-- As defined in RFC 3066 -->
+ <define name="atomLanguageTag">
+ <data type="string">
+ <param name="pattern">[A-Za-z]{1,8}(-[A-Za-z0-9]{1,8})*</param>
+ </data>
+ </define>
+ <!--
+ Unconstrained; it's not entirely clear how IRI fit into
+ xsd:anyURI so let's not try to constrain it here
+ -->
+ <define name="atomUri">
+ <text/>
+ </define>
+ <!-- Whatever an email address is, it contains at least one @ -->
+ <define name="atomEmailAddress">
+ <data type="string">
+ <param name="pattern">.+@.+</param>
+ </data>
+ </define>
+ <!-- Simple Extension -->
+ <define name="simpleExtensionElement">
+ <element>
+ <anyName>
+ <except>
+ <nsName ns="http://www.w3.org/2005/Atom"/>
+ </except>
+ </anyName>
+ <text/>
+ </element>
+ </define>
+ <!-- Structured Extension -->
+ <define name="structuredExtensionElement">
+ <element>
+ <anyName>
+ <except>
+ <nsName ns="http://www.w3.org/2005/Atom"/>
+ </except>
+ </anyName>
+ <choice>
+ <group>
+ <oneOrMore>
+ <attribute>
+ <anyName/>
+ </attribute>
+ </oneOrMore>
+ <zeroOrMore>
+ <choice>
+ <text/>
+ <ref name="anyElement"/>
+ </choice>
+ </zeroOrMore>
+ </group>
+ <group>
+ <zeroOrMore>
+ <attribute>
+ <anyName/>
+ </attribute>
+ </zeroOrMore>
+ <group>
+ <optional>
+ <text/>
+ </optional>
+ <oneOrMore>
+ <ref name="anyElement"/>
+ </oneOrMore>
+ <zeroOrMore>
+ <choice>
+ <text/>
+ <ref name="anyElement"/>
+ </choice>
+ </zeroOrMore>
+ </group>
+ </group>
+ </choice>
+ </element>
+ </define>
+ <!-- Other Extensibility -->
+ <define name="extensionElement">
+ <choice>
+ <ref name="simpleExtensionElement"/>
+ <ref name="structuredExtensionElement"/>
+ </choice>
+ </define>
+ <define name="undefinedAttribute">
+ <attribute>
+ <anyName>
+ <except>
+ <name>xml:base</name>
+ <name>xml:lang</name>
+ <nsName ns=""/>
+ </except>
+ </anyName>
+ </attribute>
+ </define>
+ <define name="undefinedContent">
+ <zeroOrMore>
+ <choice>
+ <text/>
+ <ref name="anyForeignElement"/>
+ </choice>
+ </zeroOrMore>
+ </define>
+ <define name="anyElement">
+ <element>
+ <anyName/>
+ <zeroOrMore>
+ <choice>
+ <attribute>
+ <anyName/>
+ </attribute>
+ <text/>
+ <ref name="anyElement"/>
+ </choice>
+ </zeroOrMore>
+ </element>
+ </define>
+ <define name="anyForeignElement">
+ <element>
+ <anyName>
+ <except>
+ <nsName ns="http://www.w3.org/2005/Atom"/>
+ </except>
+ </anyName>
+ <zeroOrMore>
+ <choice>
+ <attribute>
+ <anyName/>
+ </attribute>
+ <text/>
+ <ref name="anyElement"/>
+ </choice>
+ </zeroOrMore>
+ </element>
+ </define>
+ <!-- XHTML -->
+ <define name="anyXHTML">
+ <element>
+ <nsName ns="http://www.w3.org/1999/xhtml"/>
+ <zeroOrMore>
+ <choice>
+ <attribute>
+ <anyName/>
+ </attribute>
+ <text/>
+ <ref name="anyXHTML"/>
+ </choice>
+ </zeroOrMore>
+ </element>
+ </define>
+ <define name="xhtmlDiv">
+ <element name="xhtml:div">
+ <zeroOrMore>
+ <choice>
+ <attribute>
+ <anyName/>
+ </attribute>
+ <text/>
+ <ref name="anyXHTML"/>
+ </choice>
+ </zeroOrMore>
+ </element>
+ </define>
+</grammar>
diff --git a/doc/s5/ep2008/atomgen.py b/doc/s5/ep2008/atomgen.py
new file mode 100644
index 0000000..25d4ba1
--- /dev/null
+++ b/doc/s5/ep2008/atomgen.py
@@ -0,0 +1,27 @@
+# atomgen.py
+
+import os.path
+
+from lxml import etree
+from lxml.builder import ElementMaker
+
+ATOM_NAMESPACE = "http://www.w3.org/2005/Atom"
+
+A = ElementMaker(namespace=ATOM_NAMESPACE,
+ nsmap={None : ATOM_NAMESPACE})
+
+feed = A.feed
+entry = A.entry
+title = A.title
+author = A.author
+name = A.name
+link = A.link
+summary = A.summary
+id = A.id
+updated = A.updated
+# ... and so on and so forth ...
+
+
+# plus a little validation function: isvalid()
+isvalid = etree.RelaxNG(
+ file=os.path.join(os.path.abspath(os.path.dirname(__file__)), "atom.rng"))
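+
+# Usage sketch: the RelaxNG object is callable, so isvalid(some_feed_element)
+# returns True or False for a given element tree.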
diff --git a/doc/s5/ep2008/proxies.png b/doc/s5/ep2008/proxies.png
new file mode 100644
index 0000000..32c35f3
--- /dev/null
+++ b/doc/s5/ep2008/proxies.png
Binary files differ
diff --git a/doc/s5/lxml-ep2008.txt b/doc/s5/lxml-ep2008.txt
new file mode 100644
index 0000000..4f4593b
--- /dev/null
+++ b/doc/s5/lxml-ep2008.txt
@@ -0,0 +1,1130 @@
+====================================
+Implementing XML languages with lxml
+====================================
+
+Dr. Stefan Behnel
+-----------------
+
+.. class:: center
+
+ http://codespeak.net/lxml/
+
+ lxml-dev@codespeak.net
+
+ .. image:: tagpython.png
+
+.. footer:: Dr. Stefan Behnel, EuroPython 2008, Vilnius/Lietuva
+
+.. include:: <s5defs.txt>
+
+
+What is an »XML language«?
+==========================
+
+* a language in XML notation
+
+* aka »XML dialect«
+
+ * except that it's not a dialect
+
+* Examples:
+
+ * XML Schema
+
+ * Atom/RSS
+
+ * (X)HTML
+
+ * Open Document Format
+
+ * SOAP
+
+ * ... add your own one here
+
+
+Popular mistakes to avoid (1)
+=============================
+
+"That's easy, I can use regular expressions!"
+
+.. class:: incremental center
+
+ No, you can't.
+
+
+Popular mistakes to avoid (2)
+=============================
+
+"This is tree data, I'll take the DOM!"
+
+
+Popular mistakes to avoid (2)
+=============================
+
+"This is tree data, I'll take the DOM!"
+
+* DOM is ubiquitous, but it's as complicated as Java
+
+* uglify your application with tons of DOM code to
+
+ * walk over non-element nodes to find the data you need
+
+ * convert text content to other data types
+
+ * modify the XML tree in memory
+
+=> write verbose, redundant, hard-to-maintain code
+
+
+Popular mistakes to avoid (3)
+=============================
+
+"SAX is *so* fast and consumes *no* memory!"
+
+
+Popular mistakes to avoid (3)
+=============================
+
+"SAX is *so* fast and consumes *no* memory!"
+
+* but *writing* SAX code is *not* fast!
+
+* write error-prone, state-keeping SAX code to
+
+ * figure out where you are
+
+ * find the sections you need
+
+ * convert text content to other data types
+
+ * copy the XML data into custom data classes
+
+ * ... and don't forget the way back into XML!
+
+=> write confusing state-machine code
+
+=> debug your code into existence
+
+
+Working with XML
+================
+
+ **Getting XML work done**
+
+ (instead of getting time wasted)
+
+
+How can you work with XML?
+==========================
+
+* Preparation:
+
+ * Implement usable data classes as an abstraction layer
+
+ * Implement a mapping from XML to the data classes
+
+ * Implement a mapping from the data classes to XML
+
+* Workflow:
+
+ * parse XML data
+
+ * map XML data to data classes
+
+ * work with data classes
+
+ * map data classes to XML
+
+ * serialise XML
+
+.. class:: incremental
+
+ * Approach:
+
+ * get rid of XML and do everything in your own code
+
+
+What if you could simplify this?
+================================
+
+* Preparation:
+
+ * Extend usable XML API classes into an abstraction layer
+
+* Workflow:
+
+ * parse XML data into XML API classes
+
+ * work with XML API classes
+
+ * serialise XML
+
+.. class:: incremental
+
+ * Approach:
+
+ * cover only the quirks of XML and make it work *for* you
+
+
+What if you could simplify this ...
+===================================
+
+* ... without sacrificing usability or flexibility?
+
+* ... using a high-speed, full-featured, pythonic XML toolkit?
+
+* ... with the power of XPath, XSLT and XML validation?
+
+.. class:: incremental center
+
+ \... then »lxml« is your friend!
+
+
+Overview
+========
+
+* What is lxml?
+
+ * what & who
+
+* How do you use it?
+
+ * Lesson 0: quick API overview
+
+ * ElementTree concepts and lxml features
+
+ * Lesson 1: parse XML
+
+ * how to get XML data into memory
+
+ * Lesson 2: generate XML
+
+ * how to write an XML generator for a language
+
+ * Lesson 3: working with XML trees made easy
+
+ * how to write an XML API for a language
+
+
+What is lxml?
+=============
+
+* a fast, full-featured toolkit for XML and HTML handling
+
+ * http://codespeak.net/lxml/
+
+ * lxml-dev@codespeak.net
+
+* based on and inspired by
+
+ * the C libraries libxml2 and libxslt (by Daniel Veillard)
+
+ * the ElementTree API (by Fredrik Lundh)
+
+ * the Cython compiler (by Robert Bradshaw, Greg Ewing & me)
+
+ * the Python language (by Guido & [*paste Misc/ACKS here*])
+
+ * user feedback, ideas and patches (by you!)
+
+ * keep doing that, we love you all!
+
+* maintained (and major parts written) by myself
+
+ * initial design and implementation by Martijn Faassen
+
+ * extensive HTML API and tools by Ian Bicking
+
+
+What do you get for your money?
+===============================
+
+* many tools in one:
+
+ * Generic, ElementTree compatible XML API: **lxml.etree**
+
+ * but faster for many tasks and much more feature-rich
+
+ * Special tool set for HTML handling: **lxml.html**
+
+ * Special API for pythonic data binding: **lxml.objectify**
+
+ * General purpose path languages: XPath and CSS selectors
+
+ * Validation: DTD, XML Schema, RelaxNG, Schematron
+
+ * XSLT, XInclude, C14N, ...
+
+ * Fast tree iteration, event-driven parsing, ...
+
+* it's free, but it's worth every €-Cent!
+
+ * what users say:
+
+ * »no qualification, I would recommend lxml for just about any
+ HTML task«
+
+ * »THE tool [...] for newbies and experienced developers«
+
+ * »you can do pretty much anything with an intuitive API«
+
+ * »lxml takes all the pain out of XML«
+
+
+Lesson 0: a quick overview
+==========================
+
+ why **»lxml takes all the pain out of XML«**
+
+ (a quick overview of lxml features and ElementTree concepts)
+
+..
+ >>> from lxml import etree, cssselect, html
+ >>> some_xml_data = "<root><speech class='dialog'><p>So be it!</p></speech><p>stuff</p></root>"
+ >>> some_html_data = "<p>Just a quick note<br>next line</p>"
+ >>> xml_tree = etree.XML(some_xml_data)
+ >>> html_tree = html.fragment_fromstring(some_html_data)
+
+
+Namespaces in ElementTree
+=========================
+
+* uses Clark notation:
+
+ * wrap namespace URI in ``{...}``
+
+ * append the tag name
+
+ .. sourcecode:: pycon
+
+ >>> tag = "{http://www.w3.org/the/namespace}tagname"
+ >>> element = etree.Element(tag)
+
+* no prefixes!
+
+* a single, self-contained tag identifier
+
+
+Text content in ElementTree
+===========================
+
+* uses ``.text`` and ``.tail`` attributes:
+
+ .. sourcecode:: pycon
+
+ >>> div = html.fragment_fromstring(
+ ... "<div><p>a paragraph<br>split in two</p> parts</div>")
+ >>> p = div[0]
+ >>> br = p[0]
+
+ >>> p.text
+ 'a paragraph'
+ >>> br.text
+ >>> br.tail
+ 'split in two'
+ >>> p.tail
+ ' parts'
+
+* no text nodes!
+
+ * simplifies tree traversal a lot
+
+ * simplifies many XML algorithms
+
+
+Attributes in ElementTree
+=========================
+
+* uses ``.get()`` and ``.set()`` methods:
+
+ .. sourcecode:: pycon
+
+ >>> root = etree.fromstring(
+ ... '<root a="the value" b="of an" c="attribute"/>')
+
+ >>> root.get('a')
+ 'the value'
+
+ >>> root.set('a', "THE value")
+ >>> root.get('a')
+ 'THE value'
+
+* or the ``.attrib`` dictionary property:
+
+ .. sourcecode:: pycon
+
+ >>> d = root.attrib
+
+ >>> list(sorted(d.keys()))
+ ['a', 'b', 'c']
+ >>> list(sorted(d.values()))
+ ['THE value', 'attribute', 'of an']
+
+
+Tree iteration in lxml.etree (1)
+================================
+
+..
+ >>> import collections
+
+.. sourcecode:: pycon
+
+ >>> root = etree.fromstring(
+ ... "<root> <a><b/><b/></a> <c><d/><e><f/></e><g/></c> </root>")
+
+ >>> print([child.tag for child in root]) # children
+ ['a', 'c']
+
+ >>> print([el.tag for el in root.iter()]) # self and descendants
+ ['root', 'a', 'b', 'b', 'c', 'd', 'e', 'f', 'g']
+
+ >>> print([el.tag for el in root.iterdescendants()])
+ ['a', 'b', 'b', 'c', 'd', 'e', 'f', 'g']
+
+
+ >>> def iter_breadth_first(root):
+ ... bfs_queue = collections.deque([root])
+ ... while bfs_queue:
+ ... el = bfs_queue.popleft() # pop next element
+ ... bfs_queue.extend(el) # append its children
+ ... yield el
+
+ >>> print([el.tag for el in iter_breadth_first(root)])
+ ['root', 'a', 'c', 'b', 'b', 'd', 'e', 'g', 'f']
+
+
+Tree iteration in lxml.etree (2)
+================================
+
+.. sourcecode:: pycon
+
+ >>> root = etree.fromstring(
+ ... "<root> <a><b/><b/></a> <c><d/><e><f/></e><g/></c> </root>")
+
+ >>> tree_walker = etree.iterwalk(root, events=('start', 'end'))
+
+ >>> for (event, element) in tree_walker:
+ ... print("%s (%s)" % (element.tag, event))
+ root (start)
+ a (start)
+ b (start)
+ b (end)
+ b (start)
+ b (end)
+ a (end)
+ c (start)
+ d (start)
+ d (end)
+ e (start)
+ f (start)
+ f (end)
+ e (end)
+ g (start)
+ g (end)
+ c (end)
+ root (end)
+
+
+Path languages in lxml
+======================
+
+.. sourcecode:: xml
+
+ <root>
+ <speech class='dialog'><p>So be it!</p></speech>
+ <p>stuff</p>
+ </root>
+
+* search it with XPath
+
+ .. sourcecode:: pycon
+
+ >>> find_paragraphs = etree.XPath("//p")
+ >>> paragraphs = find_paragraphs(xml_tree)
+
+ >>> print([ p.text for p in paragraphs ])
+ ['So be it!', 'stuff']
+
+* search it with CSS selectors
+
+ .. sourcecode:: pycon
+
+ >>> find_dialogs = cssselect.CSSSelector("speech.dialog p")
+ >>> paragraphs = find_dialogs(xml_tree)
+
+ >>> print([ p.text for p in paragraphs ])
+ ['So be it!']
+
+
+Summary of lesson 0
+===================
+
+* lxml comes with various tools
+
+ * that aim to hide the quirks of XML
+
+ * that simplify finding and handling data
+
+ * that make XML a pythonic tool by itself
+
+
+Lesson 1: parsing XML/HTML
+==========================
+
+ **The input side**
+
+ (a quick overview)
+
+
+Parsing XML and HTML from ...
+=============================
+
+* strings: ``fromstring(xml_data)``
+
+ * byte strings, but also unicode strings
+
+* filenames: ``parse(filename)``
+
+* HTTP/FTP URLs: ``parse(url)``
+
+* file objects: ``parse(f)``
+
+ * ``f = open(filename, 'rb')`` !
+
+* file-like objects: ``parse(f)``
+
+ * only need an ``f.read(size)`` method
+
+* data chunks: ``parser.feed(xml_chunk)``
+
+ * ``result = parser.close()``
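+
+ * for example (a minimal sketch; the literal chunks stand in for real input):
+
+ .. sourcecode:: pycon
+
+ >>> parser = etree.XMLParser()
+ >>> parser.feed("<root><a/>")
+ >>> parser.feed("<b/></root>")
+ >>> root = parser.close()
+ >>> root.tag
+ 'root'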
+
+.. class:: small right
+
+ (parsing from strings and filenames/URLs frees the GIL)
+
+
+Example: parsing from a string
+==============================
+
+* using the ``fromstring()`` function:
+
+ .. sourcecode:: pycon
+
+ >>> root_element = etree.fromstring(some_xml_data)
+
+* using the ``fromstring()`` function with a specific parser:
+
+ .. sourcecode:: pycon
+
+ >>> parser = etree.HTMLParser(remove_comments=True)
+ >>> root_element = etree.fromstring(some_html_data, parser)
+
+* or the ``XML()`` and ``HTML()`` aliases for literals in code:
+
+ .. sourcecode:: pycon
+
+ >>> root_element = etree.XML("<root><child/></root>")
+ >>> root_element = etree.HTML("<p>some<br>paragraph</p>")
+
+
+Parsing XML into ...
+====================
+
+* a tree in memory
+
+ * ``parse()`` and ``fromstring()`` functions
+
+* a tree in memory, but step-by-step with a generator
+
+ * ``iterparse()`` generates ``(start/end, element)`` events
+
+ * tree can be cleaned up to save space
+
+* SAX-like callbacks without building a tree
+
+ * ``parse()`` and ``fromstring()`` functions
+
+ * pass a ``target`` object into the parser
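+
+* e.g. a minimal ``iterparse()`` sketch (an in-memory file stands in for real input):
+
+ .. sourcecode:: pycon
+
+ >>> from io import BytesIO
+ >>> xml_file = BytesIO(b"<root><a/><b/></root>")
+ >>> [el.tag for event, el in etree.iterparse(xml_file)]
+ ['a', 'b', 'root']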
+
+
+Summary of lesson 1
+===================
+
+* parsing XML/HTML in lxml is mostly straightforward
+
+ * simple functions that do the job
+
+* advanced use cases are pretty simple
+
+ * event-driven parsing using ``iterparse()``
+
+ * special parser configuration with keyword arguments
+
+ * configuration is generally local to a parser
+
+* BTW: parsing is *very* fast, as is serialising
+
+ * don't hesitate to do parse-serialise-parse cycles
+
+
+Lesson 2: generating XML
+========================
+
+ **The output side**
+
+ (and how to make it safe and simple)
+
+
+The example language: Atom
+==========================
+
+The Atom XML format
+
+* Namespace: http://www.w3.org/2005/Atom
+
+* IETF standard (RFC 4287), derived from RSS and friends
+
+* Atom feeds describe news entries and annotated links
+
+ * a ``feed`` contains one or more ``entry`` elements
+
+ * an ``entry`` contains ``author``, ``link``, ``summary`` and/or ``content``
+
+
+Example: generate XML (1)
+=========================
+
+The ElementMaker (or *E-factory*)
+
+.. sourcecode:: pycon
+
+ >>> from lxml.builder import ElementMaker
+ >>> A = ElementMaker(namespace="http://www.w3.org/2005/Atom",
+ ... nsmap={None : "http://www.w3.org/2005/Atom"})
+
+.. class:: incremental
+
+ .. sourcecode:: pycon
+
+ >>> atom = A.feed(
+ ... A.author( A.name("Stefan Behnel") ),
+ ... A.entry(
+ ... A.title("News from lxml"),
+ ... A.link(href="http://codespeak.net/lxml/"),
+ ... A.summary("See what's <b>fun</b> about lxml...",
+ ... type="html"),
+ ... )
+ ... )
+
+ .. sourcecode:: pycon
+
+ >>> from lxml.etree import tostring
+ >>> print( tostring(atom, pretty_print=True) )
+
+
+Example: generate XML (2)
+=========================
+
+.. sourcecode:: pycon
+
+ >>> atom = A.feed(
+ ... A.author( A.name("Stefan Behnel") ),
+ ... A.entry(
+ ... A.title("News from lxml"),
+ ... A.link(href="http://codespeak.net/lxml/"),
+ ... A.summary("See what's <b>fun</b> about lxml...",
+ ... type="html"),
+ ... )
+ ... )
+
+.. sourcecode:: xml
+
+ <feed xmlns="http://www.w3.org/2005/Atom">
+ <author>
+ <name>Stefan Behnel</name>
+ </author>
+ <entry>
+ <title>News from lxml</title>
+ <link href="http://codespeak.net/lxml/"/>
+ <summary type="html">See what's &lt;b&gt;fun&lt;/b&gt;
+ about lxml...</summary>
+ </entry>
+ </feed>
+
+
+Be careful what you type!
+=========================
+
+.. sourcecode:: pycon
+
+ >>> atom = A.feed(
+ ... A.author( A.name("Stefan Behnel") ),
+ ... A.entry(
+ ... A.titel("News from lxml"),
+ ... A.link(href="http://codespeak.net/lxml/"),
+ ... A.summary("See what's <b>fun</b> about lxml...",
+ ... type="html"),
+ ... )
+ ... )
+
+.. sourcecode:: xml
+
+ <feed xmlns="http://www.w3.org/2005/Atom">
+ <author>
+ <name>Stefan Behnel</name>
+ </author>
+ <entry>
+ <titel>News from lxml</titel>
+ <link href="http://codespeak.net/lxml/"/>
+ <summary type="html">See what's &lt;b&gt;fun&lt;/b&gt;
+ about lxml...</summary>
+ </entry>
+ </feed>
+
+
+Want more 'type safety'?
+========================
+
+Write an XML generator *module* instead:
+
+.. sourcecode:: python
+
+ # atomgen.py
+
+ from lxml import etree
+ from lxml.builder import ElementMaker
+
+ ATOM_NAMESPACE = "http://www.w3.org/2005/Atom"
+
+ A = ElementMaker(namespace=ATOM_NAMESPACE,
+ nsmap={None : ATOM_NAMESPACE})
+
+ feed = A.feed
+ entry = A.entry
+ title = A.title
+ # ... and so on and so forth ...
+
+
+ # plus a little validation function: isvalid()
+ isvalid = etree.RelaxNG(file="atom.rng")
+
+
+The Atom generator module
+=========================
+
+..
+ >>> import sys
+ >>> sys.path.insert(0, "ep2008")
+
+.. sourcecode:: pycon
+
+ >>> import atomgen as A
+
+ >>> atom = A.feed(
+ ... A.author( A.name("Stefan Behnel") ),
+ ... A.entry(
+ ... A.link(href="http://codespeak.net/lxml/"),
+ ... A.title("News from lxml"),
+ ... A.summary("See what's <b>fun</b> about lxml...",
+ ... type="html"),
+ ... )
+ ... )
+
+ >>> A.isvalid(atom) # ok, forgot the IDs => invalid XML ...
+ False
+
+ >>> title = A.titel("News from lxml")
+ Traceback (most recent call last):
+ ...
+ AttributeError: 'module' object has no attribute 'titel'
+
+
+Mixing languages (1)
+====================
+
+Atom can embed *serialised* HTML
+
+.. sourcecode:: pycon
+
+ >>> import lxml.html.builder as h
+
+ >>> html_fragment = h.DIV(
+ ... "this is some\n",
+ ... h.A("HTML", href="http://w3.org/MarkUp/"),
+ ... "\ncontent")
+
+.. class:: incremental
+
+ .. sourcecode:: pycon
+
+ >>> serialised_html = etree.tostring(html_fragment, method="html")
+
+ >>> summary = A.summary(serialised_html, type="html")
+
+ .. sourcecode:: pycon
+
+ >>> print(etree.tostring(summary))
+ <summary xmlns="http://www.w3.org/2005/Atom" type="html">
+ &lt;div&gt;this is some
+ &lt;a href="http://w3.org/MarkUp/"&gt;HTML&lt;/a&gt;
+ content&lt;/div&gt;
+ </summary>
+
+
+Mixing languages (2)
+====================
+
+Atom can also embed non-escaped XHTML
+
+.. sourcecode:: pycon
+
+ >>> from copy import deepcopy
+ >>> xhtml_fragment = deepcopy(html_fragment)
+
+ >>> from lxml.html import html_to_xhtml
+ >>> html_to_xhtml(xhtml_fragment)
+
+ >>> summary = A.summary(xhtml_fragment, type="xhtml")
+
+.. class:: incremental
+
+ .. sourcecode:: pycon
+
+ >>> print(etree.tostring(summary, pretty_print=True))
+ <summary xmlns="http://www.w3.org/2005/Atom" type="xhtml">
+ <html:div xmlns:html="http://www.w3.org/1999/xhtml">this is some
+ <html:a href="http://w3.org/MarkUp/">HTML</html:a>
+ content</html:div>
+ </summary>
+
+
+Summary of lesson 2
+===================
+
+* generating XML is easy
+
+ * use the ElementMaker
+
+* wrap it in a module that provides
+
+ * the target namespace
+
+ * an ElementMaker name for each language element
+
+ * a validator
+
+ * maybe additional helper functions
+
+* mixing languages is easy
+
+ * define a generator module for each
+
+\... this is all you need for the *output* side of XML languages
+
+
+Lesson 3: Designing XML APIs
+============================
+
+ **The Element API**
+
+ (and how to make it the way *you* want)
+
+
+Trees in C and in Python
+========================
+
+* Trees have two representations:
+
+ * a plain, complete, low-level C tree provided by libxml2
+
+ * a set of Python Element proxies, each representing one element
+
+* Proxies are created on-the-fly:
+
+ * lxml creates an Element object for a C node on request
+
+ * proxies are garbage collected when going out of scope
+
+ * XML trees are garbage collected when deleting the last proxy
+
+.. class:: center
+
+ .. image:: ep2008/proxies.png
+
+
+Mapping Python classes to nodes
+===============================
+
+* Proxies can be assigned to XML nodes *by user code*
+
+ * lxml tells you about a node, you return a class
+
+
+Example: a simple Element class (1)
+===================================
+
+* define a subclass of ElementBase
+
+ .. sourcecode:: pycon
+
+ >>> class HonkElement(etree.ElementBase):
+ ... @property
+ ... def honking(self):
+ ... return self.get('honking') == 'true'
+
+* let it replace the default Element class
+
+ .. sourcecode:: pycon
+
+ >>> lookup = etree.ElementDefaultClassLookup(
+ ... element=HonkElement)
+
+ >>> parser = etree.XMLParser()
+ >>> parser.set_element_class_lookup(lookup)
+
+
+Example: a simple Element class (2)
+===================================
+
+* use the new Element class
+
+ .. sourcecode:: pycon
+
+ >>> root = etree.XML('<root><honk honking="true"/></root>',
+ ... parser)
+
+ >>> root.honking
+ False
+ >>> root[0].honking
+ True
+
+
+Mapping Python classes to nodes
+===============================
+
+* The Element class lookup
+
+ * lxml tells you about a node, you return a class
+
+ * no restrictions on lookup algorithm
+
+ * each parser can use a different class lookup scheme
+
+ * lookup schemes can be chained through fallbacks
+
+* Classes can be selected based on
+
+ * the node type (element, comment or processing instruction)
+
+ * ``ElementDefaultClassLookup()``
+
+ * the namespaced node name
+
+ * ``CustomElementClassLookup()`` + a fallback
+
+ * ``ElementNamespaceClassLookup()`` + a fallback
+
+ * the value of an attribute (e.g. ``id`` or ``class``)
+
+ * ``AttributeBasedElementClassLookup()`` + a fallback
+
+ * read-only inspection of the tree
+
+ * ``PythonElementClassLookup()`` + a fallback
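+
+* e.g. an attribute-based lookup sketch (a made-up mapping: elements with
+ ``class="honker"`` get the ``HonkElement`` class from above, everything
+ else falls back to the default Element class):
+
+ .. sourcecode:: pycon
+
+ >>> lookup = etree.AttributeBasedElementClassLookup(
+ ... 'class', {'honker': HonkElement},
+ ... etree.ElementDefaultClassLookup())
+ >>> parser = etree.XMLParser()
+ >>> parser.set_element_class_lookup(lookup)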
+
+
+Designing an Atom API
+=====================
+
+* a feed is a container for entries
+
+ .. sourcecode:: python
+
+ # atom.py
+
+ ATOM_NAMESPACE = "http://www.w3.org/2005/Atom"
+ _ATOM_NS = "{%s}" % ATOM_NAMESPACE
+
+ class FeedElement(etree.ElementBase):
+ @property
+ def entries(self):
+ return self.findall(_ATOM_NS + "entry")
+
+* it also has a couple of meta-data children, e.g. ``title``
+
+ .. sourcecode:: python
+
+ class FeedElement(etree.ElementBase):
+ # ...
+ @property
+ def title(self):
+ "return the title or None"
+ return self.find(_ATOM_NS + "title")
+
+
+Consider lxml.objectify
+=======================
+
+* ready-to-use, generic Python object API for XML
+
+.. sourcecode:: pycon
+
+ >>> from lxml import objectify
+
+ >>> feed = objectify.parse("atom-example.xml").getroot()
+ >>> print(feed.title)
+ Example Feed
+
+ >>> print([entry.title for entry in feed.entry])
+ ['Atom-Powered Robots Run Amok']
+
+ >>> print(feed.entry[0].title)
+ Atom-Powered Robots Run Amok
+
+
+Still room for more convenience
+===============================
+
+.. sourcecode:: python
+
+ from itertools import chain
+
+ class FeedElement(objectify.ObjectifiedElement):
+
+ def addIDs(self):
+ "initialise the IDs of feed and entries"
+
+ for element in chain([self], self.entry):
+ if element.find(_ATOM_NS + "id") is None:
+ id = etree.SubElement(element, _ATOM_NS + "id")
+ id.text = make_guid()
+
+
+Incremental API design
+======================
+
+* choose an XML API to start with
+
+ * lxml.etree is general purpose
+
+ * lxml.objectify is nice for document-style XML
+
+* fix Elements that really need some API sugar
+
+ * dict-mappings to children with specific content/attributes
+
+ * properties for specially typed attributes or child values
+
+ * simplified access to varying content types of an element
+
+ * shortcuts for unnecessarily deep subtrees
+
+* ignore what works well enough with the Element API
+
+ * lists of homogeneous children -> Element iteration
+
+ * string attributes -> .get()/.set()
+
+* let the API grow at your fingertips
+
+ * play with it and test use cases
+
+ * avoid "I want because I can" feature explosion!
+
+
+Setting up the Element mapping
+==============================
+
+Atom has a namespace => leave the mapping to lxml
+
+.. sourcecode:: python
+
+ # ...
+ _atom_lookup = etree.ElementNamespaceClassLookup(
+ objectify.ObjectifyElementClassLookup())
+
+ # map the classes to tag names
+ ns = _atom_lookup.get_namespace(ATOM_NAMESPACE)
+ ns["feed"] = FeedElement
+ ns["entry"] = EntryElement
+ # ... and so on
+ # or use ns.update(vars()) with appropriate class names
+
+ # create a parser that does some whitespace cleanup
+ atom_parser = etree.XMLParser(remove_blank_text=True)
+
+ # make it use our Atom classes
+ atom_parser.set_element_class_lookup(_atom_lookup)
+
+ # and help users in using our parser setup
+ def parse(input):
+ return etree.parse(input, atom_parser)
+
+
+Using your new Atom API
+=======================
+
+.. sourcecode:: pycon
+
+ >>> import atom
+ >>> feed = atom.parse("ep2008/atom-example.xml").getroot()
+
+ >>> print(len(feed.entry))
+ 1
+ >>> print([entry.title for entry in feed.entry])
+ ['Atom-Powered Robots Run Amok']
+
+ >>> link_tag = "{%s}link" % atom.ATOM_NAMESPACE
+ >>> print([link.get("href") for link in feed.iter(link_tag)])
+ ['http://example.org/', 'http://example.org/2003/12/13/atom03']
+
+
+Summary of lesson 3
+===================
+
+To implement an XML API ...
+
+1) start off with lxml's Element API
+
+ * or take a look at the object API of lxml.objectify
+
+2) specialise it into a set of custom Element classes
+
+3) map them to XML tags using one of the lookup schemes
+
+4) improve the API incrementally while using it
+
+ * discover inconveniences and beautify them
+
+ * avoid putting work into things that work
+
+
+Conclusion
+==========
+
+lxml ...
+
+* provides a convenient set of tools for XML and HTML
+
+ * parsing
+
+ * generating
+
+ * working with in-memory trees
+
+* follows Python idioms wherever possible
+
+ * highly extensible through wrapping and subclassing
+
+ * callable objects for XPath, CSS selectors, XSLT, schemas
+
+ * iteration for tree traversal (even while parsing)
+
+ * list-/dict-like APIs, properties, keyword arguments, ...
+
+* makes extension and specialisation easy
+
+ * write a special XML generator module in trivial code
+
+ * write your own XML API incrementally on-the-fly
diff --git a/doc/s5/rst2s5.py b/doc/s5/rst2s5.py
new file mode 100644
index 0000000..953bded
--- /dev/null
+++ b/doc/s5/rst2s5.py
@@ -0,0 +1,92 @@
+# -*- coding: utf-8 -*-
+"""
+ The Pygments reStructuredText directive
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ This fragment is a Docutils_ 0.5 directive that renders source code
+ (to HTML only, currently) via Pygments.
+
+ To use it, adjust the options below and copy the code into a module
+ that you import on initialization. The code then automatically
+ registers a ``sourcecode`` directive that you can use instead of
+ normal code blocks like this::
+
+ .. sourcecode:: python
+
+ My code goes here.
+
+ If you want to have different code styles, e.g. one with line numbers
+ and one without, add formatters with their names in the VARIANTS dict
+ below. You can invoke them instead of the DEFAULT one by using a
+ directive option::
+
+ .. sourcecode:: python
+ :linenos:
+
+ My code goes here.
+
+ Look at the `directive documentation`_ to get all the gory details.
+
+ .. _Docutils: http://docutils.sf.net/
+ .. _directive documentation:
+ http://docutils.sourceforge.net/docs/howto/rst-directives.html
+
+ :copyright: Copyright 2006-2009 by the Pygments team, see AUTHORS.
+ :license: BSD, see LICENSE for details.
+"""
+
+# Options
+# ~~~~~~~
+
+# Set to True if you want inline CSS styles instead of classes
+INLINESTYLES = False
+STYLE = "fruity"
+
+from pygments.formatters import HtmlFormatter
+
+# The default formatter
+DEFAULT = HtmlFormatter(noclasses=INLINESTYLES, style=STYLE)
+
+# Add name -> formatter pairs for every variant you want to use
+VARIANTS = {
+ # 'linenos': HtmlFormatter(noclasses=INLINESTYLES, linenos=True),
+}
+
+
+from docutils import nodes
+from docutils.parsers.rst import directives, Directive
+
+from pygments import highlight
+from pygments.lexers import get_lexer_by_name, TextLexer
+
+class Pygments(Directive):
+ """ Source code syntax highlighting.
+ """
+ required_arguments = 1
+ optional_arguments = 0
+ final_argument_whitespace = True
+ option_spec = dict([(key, directives.flag) for key in VARIANTS])
+ has_content = True
+
+ def run(self):
+ self.assert_has_content()
+ try:
+ lexer = get_lexer_by_name(self.arguments[0])
+ except ValueError:
+ # no lexer found - use the text one instead of an exception
+ lexer = TextLexer()
+ # take an arbitrary option if more than one is given
+ formatter = self.options and VARIANTS[self.options.keys()[0]] or DEFAULT
+
+# print >>open('ui/default/pygments.css', 'w'), formatter.get_style_defs('.highlight')
+ parsed = highlight(u'\n'.join(self.content), lexer, formatter)
+ return [nodes.raw('', parsed, format='html')]
+
+directives.register_directive('sourcecode', Pygments)
+
+from docutils.core import publish_cmdline, default_description
+
+description = ('Generates S5 (X)HTML slideshow documents from standalone '
+ 'reStructuredText sources. ' + default_description)
+
+publish_cmdline(writer_name='s5', description=description)
diff --git a/doc/s5/tagpython.png b/doc/s5/tagpython.png
new file mode 100644
index 0000000..1bedfc8
--- /dev/null
+++ b/doc/s5/tagpython.png
Binary files differ
diff --git a/doc/s5/ui/default/blank.gif b/doc/s5/ui/default/blank.gif
new file mode 100644
index 0000000..75b945d
--- /dev/null
+++ b/doc/s5/ui/default/blank.gif
Binary files differ
diff --git a/doc/s5/ui/default/bodybg.gif b/doc/s5/ui/default/bodybg.gif
new file mode 100644
index 0000000..5f448a1
--- /dev/null
+++ b/doc/s5/ui/default/bodybg.gif
Binary files differ
diff --git a/doc/s5/ui/default/framing.css b/doc/s5/ui/default/framing.css
new file mode 100644
index 0000000..14d8509
--- /dev/null
+++ b/doc/s5/ui/default/framing.css
@@ -0,0 +1,23 @@
+/* The following styles size, place, and layer the slide components.
+ Edit these if you want to change the overall slide layout.
+ The commented lines can be uncommented (and modified, if necessary)
+ to help you with the rearrangement process. */
+
+/* target = 1024x768 */
+
+div#header, div#footer, .slide {width: 100%; top: 0; left: 0;}
+div#header {top: 0; height: 3em; z-index: 1;}
+div#footer {top: auto; bottom: 0; height: 2.5em; z-index: 5;}
+.slide {top: 0; width: 92%; padding: 3.5em 4% 4%; z-index: 2; list-style: none;}
+div#controls {left: 50%; bottom: 0; width: 50%; z-index: 100;}
+div#controls form {position: absolute; bottom: 0; right: 0; width: 100%;
+ margin: 0;}
+#currentSlide {position: absolute; width: 10%; left: 45%; bottom: 1em; z-index: 10;}
+html>body #currentSlide {position: fixed;}
+
+/*
+div#header {background: #FCC;}
+div#footer {background: #CCF;}
+div#controls {background: #BBD;}
+div#currentSlide {background: #FFC;}
+*/
diff --git a/doc/s5/ui/default/iepngfix.htc b/doc/s5/ui/default/iepngfix.htc
new file mode 100644
index 0000000..4d90c87
--- /dev/null
+++ b/doc/s5/ui/default/iepngfix.htc
@@ -0,0 +1,42 @@
+<public:component>
+<public:attach event="onpropertychange" onevent="doFix()" />
+
+<script>
+
+// IE5.5+ PNG Alpha Fix v1.0 by Angus Turnbull http://www.twinhelix.com
+// Free usage permitted as long as this notice remains intact.
+
+// This must be a path to a blank image. That's all the configuration you need here.
+var blankImg = 'v11rc1/default/blank.gif';
+
+var f = 'DXImageTransform.Microsoft.AlphaImageLoader';
+
+function filt(s, m) {
+ if (filters[f]) {
+ filters[f].enabled = s ? true : false;
+ if (s) with (filters[f]) { src = s; sizingMethod = m }
+ } else if (s) style.filter = 'progid:'+f+'(src="'+s+'",sizingMethod="'+m+'")';
+}
+
+function doFix() {
+ if ((parseFloat(navigator.userAgent.match(/MSIE (\S+)/)[1]) < 5.5) ||
+ (event && !/(background|src)/.test(event.propertyName))) return;
+
+ if (tagName == 'IMG') {
+ if ((/\.png$/i).test(src)) {
+ filt(src, 'image'); // was 'scale'
+ src = blankImg;
+ } else if (src.indexOf(blankImg) < 0) filt();
+ } else if (style.backgroundImage) {
+ if (style.backgroundImage.match(/^url[("']+(.*\.png)[)"']+$/i)) {
+ var s = RegExp.$1;
+ style.backgroundImage = '';
+ filt(s, 'crop');
+ } else filt();
+ }
+}
+
+doFix();
+
+</script>
+</public:component> \ No newline at end of file
diff --git a/doc/s5/ui/default/lxml-logo64.png b/doc/s5/ui/default/lxml-logo64.png
new file mode 100644
index 0000000..9c920a9
--- /dev/null
+++ b/doc/s5/ui/default/lxml-logo64.png
Binary files differ
diff --git a/doc/s5/ui/default/opera.css b/doc/s5/ui/default/opera.css
new file mode 100644
index 0000000..9e9d2a3
--- /dev/null
+++ b/doc/s5/ui/default/opera.css
@@ -0,0 +1,7 @@
+/* DO NOT CHANGE THESE unless you really want to break Opera Show */
+.slide {
+ visibility: visible !important;
+ position: static !important;
+ page-break-before: always;
+}
+#slide0 {page-break-before: avoid;}
diff --git a/doc/s5/ui/default/outline.css b/doc/s5/ui/default/outline.css
new file mode 100644
index 0000000..62db519
--- /dev/null
+++ b/doc/s5/ui/default/outline.css
@@ -0,0 +1,15 @@
+/* don't change this unless you want the layout stuff to show up in the outline view! */
+
+.layout div, #footer *, #controlForm * {display: none;}
+#footer, #controls, #controlForm, #navLinks, #toggle {
+ display: block; visibility: visible; margin: 0; padding: 0;}
+#toggle {float: right; padding: 0.5em;}
+html>body #toggle {position: fixed; top: 0; right: 0;}
+
+/* making the outline look pretty-ish */
+
+#slide0 h1, #slide0 h2, #slide0 h3, #slide0 h4 {border: none; margin: 0;}
+#slide0 h1 {padding-top: 1.5em;}
+.slide h1 {margin: 1.5em 0 0; padding-top: 0.25em;
+ border-top: 1px solid #888; border-bottom: 1px solid #AAA;}
+#toggle {border: 1px solid; border-width: 0 0 1px 1px; background: #FFF;}
diff --git a/doc/s5/ui/default/pretty.css b/doc/s5/ui/default/pretty.css
new file mode 100644
index 0000000..b3ea640
--- /dev/null
+++ b/doc/s5/ui/default/pretty.css
@@ -0,0 +1,221 @@
+/* Following are the presentation styles -- edit away! */
+
+/* body {background: #FFF url(tagpython.png) 1em 1em no-repeat; color: #000; font-size: 2em;} */
+:link, :visited {text-decoration: none; color: #438e00;}
+#controls :active {color: #8A8 !important;}
+#controls :focus {outline: 1px dotted #227;}
+h1, h2, h3, h4 {font-size: 100%; margin: 0; padding: 0; font-weight: inherit;}
+ul, pre {margin: 0; line-height: 1em;}
+html, body {margin: 0; padding: 0;}
+
+blockquote, q {font-style: italic;}
+blockquote {padding: 0 2em 0.5em; margin: 0 1.5em 0.5em; text-align: center; font-size: 1em;}
+blockquote p {margin: 2em;}
+blockquote p strong {font-size: 1.5em;}
+blockquote i {font-style: normal;}
+blockquote b {display: block; margin-top: 0.5em; font-weight: normal; font-size: smaller; font-style: normal;}
+blockquote b i {font-style: italic;}
+
+kbd {font-weight: bold; font-size: 1em;}
+sup {font-size: smaller; line-height: 1px;}
+
+.slide code {padding: 2px 0.25em; font-weight: bold; color: #533;}
+.slide code.bad, code del {color: red;}
+.slide code.old {color: silver;}
+.slide .pre {padding: 0; margin: 0 0 0 0; color: #533; font-size: 80%;}
+.slide pre {padding: 0; margin: 0.25em 0 0.5em 0.5em; color: #533; font-size: 90%;}
+/* .slide pre {padding: 0; margin: 0 0 0 0; color: #533; font-size: 90%;} */
+.slide pre code {display: block;}
+.slide div > ul {padding-left: 0; margin-left: 0; list-style: disc; }
+.slide li {margin-top: 0.75em; margin-right: 0;}
+.slide ul ul {line-height: 1; padding-left: 1em; margin-left: 2%; margin-right: 5%; list-style: disc; }
+.slide ul ul li {margin: .4em; font-size: 85%; list-style: square;}
+.slide img.leader {display: block; margin: 0 auto;}
+
+div#header, div#footer {background: #438e00; color: #CDC;
+ font-family: Verdana, Helvetica, sans-serif; padding: 0;}
+div#header {background: #438e00 url(lxml-logo64.png) 1ex 0.6ex no-repeat;
+ line-height: 1px;}
+div#footer {font-size: 0.5em; font-weight: bold; padding: 0.6em 0;}
+#footer h1, #footer h2 {display: block; padding: 0 1em;}
+#footer h2 {font-style: italic;}
+#footer a {color: #CDC;}
+
+div.long {font-size: 0.75em;}
+.slide h1 {position: absolute; top: 0.4em; left: 87px; z-index: 1;
+ margin: 0; padding: 0.2em 0 0 25px; white-space: nowrap;
+ font: bold 150%/1em Helvetica, sans-serif; /* text-transform: capitalize; */
+ color: #DED; background: #438e00;}
+.slide h3 {font-size: 130%;}
+h1 abbr {font-variant: small-caps;}
+
+div#controls {position: absolute; left: 50%; bottom: 0;
+ width: 50%;
+ text-align: right; font: bold 0.7em Verdana, Helvetica, sans-serif;}
+html>body div#controls {position: fixed; padding: 0 0 1em 0;
+ top: auto;}
+div#controls form {position: absolute; bottom: 0; right: 0; width: 100%;
+ margin: 0; padding: 0;}
+#controls #navLinks a {padding: 0; margin: 0 0.5em;
+ background: #438e00; border: none; color: #DED;
+ cursor: pointer;}
+#controls #navList {height: 1em;}
+#controls #navList #jumplist {position: absolute; bottom: 0; right: 0; background: #EEE; color: #272;}
+
+#currentSlide {text-align: center; font-size: 0.5em; color: #CDC; left: 90%; bottom: 2px;}
+
+#slide0 {padding-top: 3em; font-size: 90%;}
+#slide0 h1 {position: static; margin: 1em 0 0; padding: 0;
+ font: bold 2em Helvetica, sans-serif; white-space: normal;
+ color: #000; background: transparent;}
+#slide0 h2 {font: bold italic 1em Helvetica, sans-serif; margin: 1.25em;}
+#slide0 h3 {margin-top: 1.5em; font-size: 1.5em;}
+#slide0 h4 {margin-top: 0; font-size: 1em;}
+
+ul.urls {list-style: none; display: inline; margin: 0;}
+.urls li {display: inline; margin: 0;}
+.note {display: none;}
+.external {border-bottom: 1px dotted gray;}
+html>body .external {border-bottom: none;}
+/* .external:after {content: " \274F"; font-size: smaller; color: #7B7;} */
+
+/* .incremental, .incremental *, .incremental *:after {color: #DED; visibility: visible;} */
+.incremental, .incremental *, .incremental *:after {visibility: hidden;}
+img.incremental {visibility: hidden;}
+.slide .current {color: #B02;}
+
+.center {text-align: center; }
+.right {text-align: right; }
+.small {font-size: 60%; }
+img.center {display: block; margin-left: auto; margin-right: auto; }
+
+.slide .syntax {padding: 2px 0.25em; font-weight: bold; color: #533; font-size:85%; }
+
+/* diagnostics
+
+li:after {content: " [" attr(class) "]"; color: #F88;}
+ */
+
+/* Syntax highlighting */
+
+/* .syntax { background: #f0f0f0; } */
+.syntax .c { color: #60a0b0; font-style: italic } /* Comment */
+.syntax .err { border: 1px solid #FF0000 } /* Error */
+.syntax .k { color: #007020; font-weight: bold } /* Keyword */
+.syntax .o { color: #666666 } /* Operator */
+.syntax .cm { color: #60a0b0; font-style: italic } /* Comment.Multiline */
+.syntax .cp { color: #007020 } /* Comment.Preproc */
+.syntax .c1 { color: #60a0b0; font-style: italic } /* Comment.Single */
+.syntax .cs { color: #60a0b0; background-color: #fff0f0 } /* Comment.Special */
+.syntax .gd { color: #A00000 } /* Generic.Deleted */
+.syntax .ge { font-style: italic } /* Generic.Emph */
+.syntax .gr { color: #FF0000 } /* Generic.Error */
+.syntax .gh { color: #000080; font-weight: bold } /* Generic.Heading */
+.syntax .gi { color: #00A000 } /* Generic.Inserted */
+.syntax .go { color: #404040 } /* Generic.Output */
+.syntax .gp { color: #c65d09; font-weight: bold } /* Generic.Prompt */
+.syntax .gs { font-weight: bold } /* Generic.Strong */
+.syntax .gu { color: #800080; font-weight: bold } /* Generic.Subheading */
+.syntax .gt { color: #0040D0 } /* Generic.Traceback */
+.syntax .kc { color: #007020; font-weight: bold } /* Keyword.Constant */
+.syntax .kd { color: #007020; font-weight: bold } /* Keyword.Declaration */
+.syntax .kp { color: #007020 } /* Keyword.Pseudo */
+.syntax .kr { color: #007020; font-weight: bold } /* Keyword.Reserved */
+.syntax .kt { color: #902000 } /* Keyword.Type */
+.syntax .m { color: #40a070 } /* Literal.Number */
+.syntax .s { color: #4070a0 } /* Literal.String */
+.syntax .na { color: #4070a0 } /* Name.Attribute */
+.syntax .nb { color: #007020 } /* Name.Builtin */
+.syntax .nc { color: #0e84b5; font-weight: bold } /* Name.Class */
+.syntax .no { color: #60add5 } /* Name.Constant */
+.syntax .nd { color: #555555; font-weight: bold } /* Name.Decorator */
+.syntax .ni { color: #d55537; font-weight: bold } /* Name.Entity */
+.syntax .ne { color: #007020 } /* Name.Exception */
+.syntax .nf { color: #06287e } /* Name.Function */
+.syntax .nl { color: #002070; font-weight: bold } /* Name.Label */
+.syntax .nn { color: #0e84b5; font-weight: bold } /* Name.Namespace */
+.syntax .nt { color: #062873; font-weight: bold } /* Name.Tag */
+.syntax .nv { color: #bb60d5 } /* Name.Variable */
+.syntax .ow { color: #007020; font-weight: bold } /* Operator.Word */
+.syntax .w { color: #bbbbbb } /* Text.Whitespace */
+.syntax .mf { color: #40a070 } /* Literal.Number.Float */
+.syntax .mh { color: #40a070 } /* Literal.Number.Hex */
+.syntax .mi { color: #40a070 } /* Literal.Number.Integer */
+.syntax .mo { color: #40a070 } /* Literal.Number.Oct */
+.syntax .sb { color: #4070a0 } /* Literal.String.Backtick */
+.syntax .sc { color: #4070a0 } /* Literal.String.Char */
+.syntax .sd { color: #4070a0; font-style: italic } /* Literal.String.Doc */
+.syntax .s2 { color: #4070a0 } /* Literal.String.Double */
+.syntax .se { color: #4070a0; font-weight: bold } /* Literal.String.Escape */
+.syntax .sh { color: #4070a0 } /* Literal.String.Heredoc */
+.syntax .si { color: #70a0d0; font-style: italic } /* Literal.String.Interpol */
+.syntax .sx { color: #c65d09 } /* Literal.String.Other */
+.syntax .sr { color: #235388 } /* Literal.String.Regex */
+.syntax .s1 { color: #4070a0 } /* Literal.String.Single */
+.syntax .ss { color: #517918 } /* Literal.String.Symbol */
+.syntax .bp { color: #007020 } /* Name.Builtin.Pseudo */
+.syntax .vc { color: #bb60d5 } /* Name.Variable.Class */
+.syntax .vg { color: #bb60d5 } /* Name.Variable.Global */
+.syntax .vi { color: #bb60d5 } /* Name.Variable.Instance */
+.syntax .il { color: #40a070 } /* Literal.Number.Integer.Long */
+
+/* .highlight { background: #f0f0f0; } */
+.highlight .c { color: #60a0b0; font-style: italic } /* Comment */
+.highlight .err { border: 1px solid #FF0000 } /* Error */
+.highlight .k { color: #007020; font-weight: bold } /* Keyword */
+.highlight .o { color: #666666 } /* Operator */
+.highlight .cm { color: #60a0b0; font-style: italic } /* Comment.Multiline */
+.highlight .cp { color: #007020 } /* Comment.Preproc */
+.highlight .c1 { color: #60a0b0; font-style: italic } /* Comment.Single */
+.highlight .cs { color: #60a0b0; background-color: #fff0f0 } /* Comment.Special */
+.highlight .gd { color: #A00000 } /* Generic.Deleted */
+.highlight .ge { font-style: italic } /* Generic.Emph */
+.highlight .gr { color: #FF0000 } /* Generic.Error */
+.highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */
+.highlight .gi { color: #00A000 } /* Generic.Inserted */
+.highlight .go { color: #404040 } /* Generic.Output */
+.highlight .gp { color: #c65d09; font-weight: bold } /* Generic.Prompt */
+.highlight .gs { font-weight: bold } /* Generic.Strong */
+.highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */
+.highlight .gt { color: #0040D0 } /* Generic.Traceback */
+.highlight .kc { color: #007020; font-weight: bold } /* Keyword.Constant */
+.highlight .kd { color: #007020; font-weight: bold } /* Keyword.Declaration */
+.highlight .kp { color: #007020 } /* Keyword.Pseudo */
+.highlight .kr { color: #007020; font-weight: bold } /* Keyword.Reserved */
+.highlight .kt { color: #902000 } /* Keyword.Type */
+.highlight .m { color: #40a070 } /* Literal.Number */
+.highlight .s { color: #4070a0 } /* Literal.String */
+.highlight .na { color: #4070a0 } /* Name.Attribute */
+.highlight .nb { color: #007020 } /* Name.Builtin */
+.highlight .nc { color: #0e84b5; font-weight: bold } /* Name.Class */
+.highlight .no { color: #60add5 } /* Name.Constant */
+.highlight .nd { color: #555555; font-weight: bold } /* Name.Decorator */
+.highlight .ni { color: #d55537; font-weight: bold } /* Name.Entity */
+.highlight .ne { color: #007020 } /* Name.Exception */
+.highlight .nf { color: #06287e } /* Name.Function */
+.highlight .nl { color: #002070; font-weight: bold } /* Name.Label */
+.highlight .nn { color: #0e84b5; font-weight: bold } /* Name.Namespace */
+.highlight .nt { color: #062873; font-weight: bold } /* Name.Tag */
+.highlight .nv { color: #bb60d5 } /* Name.Variable */
+.highlight .ow { color: #007020; font-weight: bold } /* Operator.Word */
+.highlight .w { color: #bbbbbb } /* Text.Whitespace */
+.highlight .mf { color: #40a070 } /* Literal.Number.Float */
+.highlight .mh { color: #40a070 } /* Literal.Number.Hex */
+.highlight .mi { color: #40a070 } /* Literal.Number.Integer */
+.highlight .mo { color: #40a070 } /* Literal.Number.Oct */
+.highlight .sb { color: #4070a0 } /* Literal.String.Backtick */
+.highlight .sc { color: #4070a0 } /* Literal.String.Char */
+.highlight .sd { color: #4070a0; font-style: italic } /* Literal.String.Doc */
+.highlight .s2 { color: #4070a0 } /* Literal.String.Double */
+.highlight .se { color: #4070a0; font-weight: bold } /* Literal.String.Escape */
+.highlight .sh { color: #4070a0 } /* Literal.String.Heredoc */
+.highlight .si { color: #70a0d0; font-style: italic } /* Literal.String.Interpol */
+.highlight .sx { color: #c65d09 } /* Literal.String.Other */
+.highlight .sr { color: #235388 } /* Literal.String.Regex */
+.highlight .s1 { color: #4070a0 } /* Literal.String.Single */
+.highlight .ss { color: #517918 } /* Literal.String.Symbol */
+.highlight .bp { color: #007020 } /* Name.Builtin.Pseudo */
+.highlight .vc { color: #bb60d5 } /* Name.Variable.Class */
+.highlight .vg { color: #bb60d5 } /* Name.Variable.Global */
+.highlight .vi { color: #bb60d5 } /* Name.Variable.Instance */
+.highlight .il { color: #40a070 } /* Literal.Number.Integer.Long */
diff --git a/doc/s5/ui/default/print.css b/doc/s5/ui/default/print.css
new file mode 100644
index 0000000..e52c441
--- /dev/null
+++ b/doc/s5/ui/default/print.css
@@ -0,0 +1,24 @@
+/* The following rule is necessary to have all slides appear in print! DO NOT REMOVE IT! */
+.slide, ul {page-break-inside: avoid; visibility: visible !important;}
+h1 {page-break-after: avoid;}
+
+body {font-size: 12pt; background: white;}
+* {color: black;}
+
+#slide0 h1 {font-size: 200%; border: none; margin: 0.5em 0 0.25em;}
+#slide0 h3 {margin: 0; padding: 0;}
+#slide0 h4 {margin: 0 0 0.5em; padding: 0;}
+#slide0 {margin-bottom: 3em;}
+
+h1 {border-top: 2pt solid gray; border-bottom: 1px dotted silver;}
+.extra {background: transparent !important;}
+div.extra, pre.extra, .example {font-size: 10pt; color: #333;}
+ul.extra a {font-weight: bold;}
+p.example {display: none;}
+
+#header {display: none;}
+#footer h1 {margin: 0; border-bottom: 1px solid; color: gray; font-style: italic;}
+#footer h2, #controls {display: none;}
+
+/* The following rule keeps the layout stuff out of print. Remove at your own risk! */
+.layout, .layout * {display: none !important;}
diff --git a/doc/s5/ui/default/s5-core.css b/doc/s5/ui/default/s5-core.css
new file mode 100644
index 0000000..86444e0
--- /dev/null
+++ b/doc/s5/ui/default/s5-core.css
@@ -0,0 +1,9 @@
+/* Do not edit or override these styles! The system will likely break if you do. */
+
+div#header, div#footer, div#controls, .slide {position: absolute;}
+html>body div#header, html>body div#footer,
+ html>body div#controls, html>body .slide {position: fixed;}
+.handout {display: none;}
+.layout {display: block;}
+.slide, .hideme, .incremental {visibility: hidden;}
+#slide0 {visibility: visible;}
diff --git a/doc/s5/ui/default/slides.css b/doc/s5/ui/default/slides.css
new file mode 100644
index 0000000..0786d7d
--- /dev/null
+++ b/doc/s5/ui/default/slides.css
@@ -0,0 +1,3 @@
+@import url(s5-core.css); /* required to make the slide show run at all */
+@import url(framing.css); /* sets basic placement and size of slide components */
+@import url(pretty.css); /* stuff that makes the slides look better than blah */ \ No newline at end of file
diff --git a/doc/s5/ui/default/slides.js b/doc/s5/ui/default/slides.js
new file mode 100644
index 0000000..07f40ff
--- /dev/null
+++ b/doc/s5/ui/default/slides.js
@@ -0,0 +1,552 @@
+// S5 v1.1 slides.js -- released into the Public Domain
+//
+// Please see http://www.meyerweb.com/eric/tools/s5/credits.html for information
+// about all the wonderful and talented contributors to this code!
+
+var undef;
+var slideCSS = '';
+var snum = 0;
+var smax = 1;
+var incpos = 0;
+var number = undef;
+var s5mode = true;
+var defaultView = 'slideshow';
+var controlVis = 'visible';
+
+var isIE = navigator.appName == 'Microsoft Internet Explorer' ? 1 : 0;
+var isOp = navigator.userAgent.indexOf('Opera') > -1 ? 1 : 0;
+var isGe = navigator.userAgent.indexOf('Gecko') > -1 && navigator.userAgent.indexOf('Safari') < 1 ? 1 : 0;
+
+function hasClass(object, className) {
+ if (!object.className) return false;
+ return (object.className.search('(^|\\s)' + className + '(\\s|$)') != -1);
+}
+
+function hasValue(object, value) {
+ if (!object) return false;
+ return (object.search('(^|\\s)' + value + '(\\s|$)') != -1);
+}
+
+function removeClass(object,className) {
+ if (!object) return;
+ object.className = object.className.replace(new RegExp('(^|\\s)'+className+'(\\s|$)'), RegExp.$1+RegExp.$2);
+}
+
+function addClass(object,className) {
+ if (!object || hasClass(object, className)) return;
+ if (object.className) {
+ object.className += ' '+className;
+ } else {
+ object.className = className;
+ }
+}
+
+function GetElementsWithClassName(elementName,className) {
+ var allElements = document.getElementsByTagName(elementName);
+ var elemColl = new Array();
+ for (var i = 0; i< allElements.length; i++) {
+ if (hasClass(allElements[i], className)) {
+ elemColl[elemColl.length] = allElements[i];
+ }
+ }
+ return elemColl;
+}
+
+function isParentOrSelf(element, id) {
+ if (element == null || element.nodeName=='BODY') return false;
+ else if (element.id == id) return true;
+ else return isParentOrSelf(element.parentNode, id);
+}
+
+function nodeValue(node) {
+ var result = "";
+ if (node.nodeType == 1) {
+ var children = node.childNodes;
+ for (var i = 0; i < children.length; ++i) {
+ result += nodeValue(children[i]);
+ }
+ }
+ else if (node.nodeType == 3) {
+ result = node.nodeValue;
+ }
+ return(result);
+}
+
+function slideLabel() {
+ var slideColl = GetElementsWithClassName('*','slide');
+ var list = document.getElementById('jumplist');
+ smax = slideColl.length;
+ for (var n = 0; n < smax; n++) {
+ var obj = slideColl[n];
+
+ var did = 'slide' + n.toString();
+ obj.setAttribute('id',did);
+ if (isOp) continue;
+
+ var otext = '';
+ var menu = obj.firstChild;
+ if (!menu) continue; // to cope with empty slides
+ while (menu && menu.nodeType == 3) {
+ menu = menu.nextSibling;
+ }
+ if (!menu) continue; // to cope with slides with only text nodes
+
+ var menunodes = menu.childNodes;
+ for (var o = 0; o < menunodes.length; o++) {
+ otext += nodeValue(menunodes[o]);
+ }
+ list.options[list.length] = new Option(n + ' : ' + otext, n);
+ }
+}
+
+function currentSlide() {
+ var cs;
+ if (document.getElementById) {
+ cs = document.getElementById('currentSlide');
+ } else {
+ cs = document.currentSlide;
+ }
+ cs.innerHTML = '<span id="csHere">' + snum + '<\/span> ' +
+ '<span id="csSep">\/<\/span> ' +
+ '<span id="csTotal">' + (smax-1) + '<\/span>';
+ if (snum == 0) {
+ cs.style.visibility = 'hidden';
+ } else {
+ cs.style.visibility = 'visible';
+ }
+}
+
+function go(step) {
+ if (document.getElementById('slideProj').disabled || step == 0) return;
+ var jl = document.getElementById('jumplist');
+ var cid = 'slide' + snum;
+ var ce = document.getElementById(cid);
+ if (incrementals[snum].length > 0) {
+ for (var i = 0; i < incrementals[snum].length; i++) {
+ removeClass(incrementals[snum][i], 'current');
+ removeClass(incrementals[snum][i], 'incremental');
+ }
+ }
+ if (step != 'j') {
+ snum += step;
+ lmax = smax - 1;
+ if (snum > lmax) snum = lmax;
+ if (snum < 0) snum = 0;
+ } else
+ snum = parseInt(jl.value);
+ var nid = 'slide' + snum;
+ var ne = document.getElementById(nid);
+ if (!ne) {
+ ne = document.getElementById('slide0');
+ snum = 0;
+ }
+ if (step < 0) {incpos = incrementals[snum].length} else {incpos = 0;}
+ if (incrementals[snum].length > 0 && incpos == 0) {
+ for (var i = 0; i < incrementals[snum].length; i++) {
+ if (hasClass(incrementals[snum][i], 'current'))
+ incpos = i + 1;
+ else
+ addClass(incrementals[snum][i], 'incremental');
+ }
+ }
+ if (incrementals[snum].length > 0 && incpos > 0)
+ addClass(incrementals[snum][incpos - 1], 'current');
+ ce.style.visibility = 'hidden';
+ ne.style.visibility = 'visible';
+ jl.selectedIndex = snum;
+ currentSlide();
+ number = 0;
+}
+
+function goTo(target) {
+ if (target >= smax || target == snum) return;
+ go(target - snum);
+}
+
+function subgo(step) {
+ if (step > 0) {
+ removeClass(incrementals[snum][incpos - 1],'current');
+ removeClass(incrementals[snum][incpos], 'incremental');
+ addClass(incrementals[snum][incpos],'current');
+ incpos++;
+ } else {
+ incpos--;
+ removeClass(incrementals[snum][incpos],'current');
+ addClass(incrementals[snum][incpos], 'incremental');
+ addClass(incrementals[snum][incpos - 1],'current');
+ }
+}
+
+function toggle() {
+ var slideColl = GetElementsWithClassName('*','slide');
+ var slides = document.getElementById('slideProj');
+ var outline = document.getElementById('outlineStyle');
+ if (!slides.disabled) {
+ slides.disabled = true;
+ outline.disabled = false;
+ s5mode = false;
+ fontSize('1em');
+ for (var n = 0; n < smax; n++) {
+ var slide = slideColl[n];
+ slide.style.visibility = 'visible';
+ }
+ } else {
+ slides.disabled = false;
+ outline.disabled = true;
+ s5mode = true;
+ fontScale();
+ for (var n = 0; n < smax; n++) {
+ var slide = slideColl[n];
+ slide.style.visibility = 'hidden';
+ }
+ slideColl[snum].style.visibility = 'visible';
+ }
+}
+
+function showHide(action) {
+ var obj = GetElementsWithClassName('*','hideme')[0];
+ switch (action) {
+ case 's': obj.style.visibility = 'visible'; break;
+ case 'h': obj.style.visibility = 'hidden'; break;
+ case 'k':
+ if (obj.style.visibility != 'visible') {
+ obj.style.visibility = 'visible';
+ } else {
+ obj.style.visibility = 'hidden';
+ }
+ break;
+ }
+}
+
+// 'keys' code adapted from MozPoint (http://mozpoint.mozdev.org/)
+function keys(key) {
+ if (!key) {
+ key = event;
+ key.which = key.keyCode;
+ }
+ if (key.which == 84) {
+ toggle();
+ return;
+ }
+ if (s5mode) {
+ switch (key.which) {
+ case 10: // return
+ case 13: // enter
+ if (window.event && isParentOrSelf(window.event.srcElement, 'controls')) return;
+ if (key.target && isParentOrSelf(key.target, 'controls')) return;
+ if(number != undef) {
+ goTo(number);
+ break;
+ }
+ case 32: // spacebar
+ case 34: // page down
+ case 39: // rightkey
+ case 40: // downkey
+ if(number != undef) {
+ go(number);
+ } else if (!incrementals[snum] || incpos >= incrementals[snum].length) {
+ go(1);
+ } else {
+ subgo(1);
+ }
+ break;
+ case 33: // page up
+ case 37: // leftkey
+ case 38: // upkey
+ if(number != undef) {
+ go(-1 * number);
+ } else if (!incrementals[snum] || incpos <= 0) {
+ go(-1);
+ } else {
+ subgo(-1);
+ }
+ break;
+ case 36: // home
+ goTo(0);
+ break;
+ case 35: // end
+ goTo(smax-1);
+ break;
+ case 67: // c
+ showHide('k');
+ break;
+ }
+ if (key.which < 48 || key.which > 57) {
+ number = undef;
+ } else {
+ if (window.event && isParentOrSelf(window.event.srcElement, 'controls')) return;
+ if (key.target && isParentOrSelf(key.target, 'controls')) return;
+ number = (((number != undef) ? number : 0) * 10) + (key.which - 48);
+ }
+ }
+ return false;
+}
+
+function clicker(e) {
+ number = undef;
+ var target;
+ if (window.event) {
+ target = window.event.srcElement;
+ e = window.event;
+ } else target = e.target;
+ if (target.getAttribute('href') != null || hasValue(target.rel, 'external') || isParentOrSelf(target, 'controls') || isParentOrSelf(target,'embed') || isParentOrSelf(target,'object')) return true;
+ if (!e.which || e.which == 1) {
+ if (!incrementals[snum] || incpos >= incrementals[snum].length) {
+ go(1);
+ } else {
+ subgo(1);
+ }
+ }
+}
+
+function findSlide(hash) {
+ var target = null;
+ var slides = GetElementsWithClassName('*','slide');
+ for (var i = 0; i < slides.length; i++) {
+ var targetSlide = slides[i];
+ if ( (targetSlide.name && targetSlide.name == hash)
+ || (targetSlide.id && targetSlide.id == hash) ) {
+ target = targetSlide;
+ break;
+ }
+ }
+ while(target != null && target.nodeName != 'BODY') {
+ if (hasClass(target, 'slide')) {
+ return parseInt(target.id.slice(5));
+ }
+ target = target.parentNode;
+ }
+ return null;
+}
+
+function slideJump() {
+ if (window.location.hash == null) return;
+ var sregex = /^#slide(\d+)$/;
+ var matches = sregex.exec(window.location.hash);
+ var dest = null;
+ if (matches != null) {
+ dest = parseInt(matches[1]);
+ } else {
+ dest = findSlide(window.location.hash.slice(1));
+ }
+ if (dest != null)
+ go(dest - snum);
+}
+
+function fixLinks() {
+ var thisUri = window.location.href;
+ thisUri = thisUri.slice(0, thisUri.length - window.location.hash.length);
+ var aelements = document.getElementsByTagName('A');
+ for (var i = 0; i < aelements.length; i++) {
+ var a = aelements[i].href;
+ var slideID = a.match('\#slide[0-9]{1,2}');
+ if ((slideID) && (slideID[0].slice(0,1) == '#')) {
+ var dest = findSlide(slideID[0].slice(1));
+ if (dest != null) {
+ if (aelements[i].addEventListener) {
+ aelements[i].addEventListener("click", new Function("e",
+ "if (document.getElementById('slideProj').disabled) return;" +
+ "go("+dest+" - snum); " +
+ "if (e.preventDefault) e.preventDefault();"), true);
+ } else if (aelements[i].attachEvent) {
+ aelements[i].attachEvent("onclick", new Function("",
+ "if (document.getElementById('slideProj').disabled) return;" +
+ "go("+dest+" - snum); " +
+ "event.returnValue = false;"));
+ }
+ }
+ }
+ }
+}
+
+function externalLinks() {
+ if (!document.getElementsByTagName) return;
+ var anchors = document.getElementsByTagName('a');
+ for (var i=0; i<anchors.length; i++) {
+ var anchor = anchors[i];
+ if (anchor.getAttribute('href') && hasValue(anchor.rel, 'external')) {
+ anchor.target = '_blank';
+ addClass(anchor,'external');
+ }
+ }
+}
+
+function createControls() {
+ var controlsDiv = document.getElementById("controls");
+ if (!controlsDiv) return;
+ var hider = ' onmouseover="showHide(\'s\');" onmouseout="showHide(\'h\');"';
+ var hideDiv, hideList = '';
+ if (controlVis == 'hidden') {
+ hideDiv = hider;
+ } else {
+ hideList = hider;
+ }
+ controlsDiv.innerHTML = '<form action="#" id="controlForm"' + hideDiv + '>' +
+ '<div id="navLinks">' +
+ '<a accesskey="t" id="toggle" href="javascript:toggle();">&#216;<\/a>' +
+ '<a accesskey="z" id="prev" href="javascript:go(-1);">&laquo;<\/a>' +
+ '<a accesskey="x" id="next" href="javascript:go(1);">&raquo;<\/a>' +
+ '<div id="navList"' + hideList + '><select id="jumplist" onchange="go(\'j\');"><\/select><\/div>' +
+ '<\/div><\/form>';
+ if (controlVis == 'hidden') {
+ var hidden = document.getElementById('navLinks');
+ } else {
+ var hidden = document.getElementById('jumplist');
+ }
+ addClass(hidden,'hideme');
+}
+
+function fontScale() { // causes layout problems in FireFox that get fixed if browser's Reload is used; same may be true of other Gecko-based browsers
+ if (!s5mode) return false;
+ var vScale = 22; // both yield 32 (after rounding) at 1024x768
+ var hScale = 32; // perhaps should auto-calculate based on theme's declared value?
+ if (window.innerHeight) {
+ var vSize = window.innerHeight;
+ var hSize = window.innerWidth;
+ } else if (document.documentElement.clientHeight) {
+ var vSize = document.documentElement.clientHeight;
+ var hSize = document.documentElement.clientWidth;
+ } else if (document.body.clientHeight) {
+ var vSize = document.body.clientHeight;
+ var hSize = document.body.clientWidth;
+ } else {
+ var vSize = 700; // assuming 1024x768, minus chrome and such
+ var hSize = 1024; // these do not account for kiosk mode or Opera Show
+ }
+ var newSize = Math.min(Math.round(vSize/vScale),Math.round(hSize/hScale));
+ fontSize(newSize + 'px');
+ if (isGe) { // hack to counter incremental reflow bugs
+ var obj = document.getElementsByTagName('body')[0];
+ obj.style.display = 'none';
+ obj.style.display = 'block';
+ }
+}
+
+function fontSize(value) {
+ if (!(s5ss = document.getElementById('s5ss'))) {
+ if (!isIE) {
+ document.getElementsByTagName('head')[0].appendChild(s5ss = document.createElement('style'));
+ s5ss.setAttribute('media','screen, projection');
+ s5ss.setAttribute('id','s5ss');
+ } else {
+ document.createStyleSheet();
+ document.s5ss = document.styleSheets[document.styleSheets.length - 1];
+ }
+ }
+ if (!isIE) {
+ while (s5ss.lastChild) s5ss.removeChild(s5ss.lastChild);
+ s5ss.appendChild(document.createTextNode('body {font-size: ' + value + ' !important;}'));
+ } else {
+ document.s5ss.addRule('body','font-size: ' + value + ' !important;');
+ }
+}
+
+function notOperaFix() {
+ slideCSS = document.getElementById('slideProj').href;
+ var slides = document.getElementById('slideProj');
+ var outline = document.getElementById('outlineStyle');
+ slides.setAttribute('media','screen');
+ outline.disabled = true;
+ if (isGe) {
+ slides.setAttribute('href','null'); // Gecko fix
+ slides.setAttribute('href',slideCSS); // Gecko fix
+ }
+ if (isIE && document.styleSheets && document.styleSheets[0]) {
+ document.styleSheets[0].addRule('img', 'behavior: url(ui/default/iepngfix.htc)');
+ document.styleSheets[0].addRule('div', 'behavior: url(ui/default/iepngfix.htc)');
+ document.styleSheets[0].addRule('.slide', 'behavior: url(ui/default/iepngfix.htc)');
+ }
+}
+
+function getIncrementals(obj) {
+ var incrementals = new Array();
+ if (!obj)
+ return incrementals;
+ var children = obj.childNodes;
+ for (var i = 0; i < children.length; i++) {
+ var child = children[i];
+ if (hasClass(child, 'incremental')) {
+ if (child.nodeName == 'OL' || child.nodeName == 'UL') {
+ removeClass(child, 'incremental');
+ for (var j = 0; j < child.childNodes.length; j++) {
+ if (child.childNodes[j].nodeType == 1) {
+ addClass(child.childNodes[j], 'incremental');
+ }
+ }
+ } else {
+ incrementals[incrementals.length] = child;
+ removeClass(child,'incremental');
+ }
+ }
+ if (hasClass(child, 'show-first')) {
+ if (child.nodeName == 'OL' || child.nodeName == 'UL') {
+ removeClass(child, 'show-first');
+ if (child.childNodes[isGe].nodeType == 1) {
+ removeClass(child.childNodes[isGe], 'incremental');
+ }
+ } else {
+ incrementals[incrementals.length] = child;
+ }
+ }
+ incrementals = incrementals.concat(getIncrementals(child));
+ }
+ return incrementals;
+}
+
+function createIncrementals() {
+ var incrementals = new Array();
+ for (var i = 0; i < smax; i++) {
+ incrementals[i] = getIncrementals(document.getElementById('slide'+i));
+ }
+ return incrementals;
+}
+
+function defaultCheck() {
+ var allMetas = document.getElementsByTagName('meta');
+ for (var i = 0; i< allMetas.length; i++) {
+ if (allMetas[i].name == 'defaultView') {
+ defaultView = allMetas[i].content;
+ }
+ if (allMetas[i].name == 'controlVis') {
+ controlVis = allMetas[i].content;
+ }
+ }
+}
+
+// Key trap fix, new function body for trap()
+function trap(e) {
+ if (!e) {
+ e = event;
+ e.which = e.keyCode;
+ }
+ try {
+ modifierKey = e.ctrlKey || e.altKey || e.metaKey;
+ }
+ catch(e) {
+ modifierKey = false;
+ }
+ return modifierKey || e.which == 0;
+}
+
+function startup() {
+ defaultCheck();
+ if (!isOp) createControls();
+ slideLabel();
+ fixLinks();
+ externalLinks();
+ fontScale();
+ if (!isOp) {
+ notOperaFix();
+ incrementals = createIncrementals();
+ slideJump();
+ if (defaultView == 'outline') {
+ toggle();
+ }
+ document.onkeyup = keys;
+ document.onkeypress = trap;
+ document.onclick = clicker;
+ }
+}
+
+window.onload = startup;
+window.onresize = function(){setTimeout('fontScale()', 50);} \ No newline at end of file
diff --git a/doc/s5/ui/default/tagpython.png b/doc/s5/ui/default/tagpython.png
new file mode 100644
index 0000000..1bedfc8
--- /dev/null
+++ b/doc/s5/ui/default/tagpython.png
Binary files differ
diff --git a/doc/sax.txt b/doc/sax.txt
new file mode 100644
index 0000000..be41385
--- /dev/null
+++ b/doc/sax.txt
@@ -0,0 +1,137 @@
+SAX support
+===========
+
+In this document we'll describe lxml's SAX support. lxml has support for
+producing SAX events for an ElementTree or Element. lxml can also turn SAX
+events into an ElementTree. The SAX API used by lxml is compatible with that
+in the Python core (xml.sax), so it is useful for interfacing lxml with code
+that uses the Python core SAX facilities.
+
+.. contents::
+..
+ 1 Building a tree from SAX events
+ 2 Producing SAX events from an ElementTree or Element
+ 3 Interfacing with pulldom/minidom
+
+..
+ >>> try: from StringIO import StringIO
+ ... except ImportError:
+ ... from io import BytesIO
+ ... def StringIO(s):
+ ... if isinstance(s, str): s = s.encode("UTF-8")
+ ... return BytesIO(s)
+
+
+Building a tree from SAX events
+-------------------------------
+
+First of all, lxml has support for building a new tree given SAX events. To
+do this, we use the special SAX content handler defined by lxml named
+``lxml.sax.ElementTreeContentHandler``:
+
+.. sourcecode:: pycon
+
+ >>> import lxml.sax
+ >>> handler = lxml.sax.ElementTreeContentHandler()
+
+Now let's fire some SAX events at it:
+
+.. sourcecode:: pycon
+
+ >>> handler.startElementNS((None, 'a'), 'a', {})
+ >>> handler.startElementNS((None, 'b'), 'b', {(None, 'foo'): 'bar'})
+ >>> handler.characters('Hello world')
+ >>> handler.endElementNS((None, 'b'), 'b')
+ >>> handler.endElementNS((None, 'a'), 'a')
+
+This builds the corresponding tree. You can access it through the ``etree``
+property of the handler:
+
+.. sourcecode:: pycon
+
+ >>> tree = handler.etree
+ >>> lxml.etree.tostring(tree.getroot())
+ b'<a><b foo="bar">Hello world</b></a>'
+
+By passing a ``makeelement`` function to the constructor of
+``ElementTreeContentHandler``, e.g. the one of a parser you have configured,
+you can determine which element class lookup scheme should be used.
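+
+For example, here is a minimal sketch that simply reuses the ``makeelement``
+method of a (possibly specially configured) parser:
+
+.. sourcecode:: pycon
+
+    >>> parser = lxml.etree.XMLParser()
+    >>> handler = lxml.sax.ElementTreeContentHandler(parser.makeelement)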
+
+
+Producing SAX events from an ElementTree or Element
+---------------------------------------------------
+
+Let's make a tree we can generate SAX events for:
+
+.. sourcecode:: pycon
+
+ >>> f = StringIO('<a><b>Text</b></a>')
+ >>> tree = lxml.etree.parse(f)
+
+To see whether the correct SAX events are produced, we'll write a custom
+content handler:
+
+.. sourcecode:: pycon
+
+ >>> from xml.sax.handler import ContentHandler
+ >>> class MyContentHandler(ContentHandler):
+ ... def __init__(self):
+ ... self.a_amount = 0
+ ... self.b_amount = 0
+ ... self.text = None
+ ...
+ ... def startElementNS(self, name, qname, attributes):
+ ... uri, localname = name
+ ... if localname == 'a':
+ ... self.a_amount += 1
+ ... if localname == 'b':
+ ... self.b_amount += 1
+ ...
+ ... def characters(self, data):
+ ... self.text = data
+
+Note that it only defines the ``startElementNS()`` method and not
+``startElement()``. The SAX event generator in lxml.sax currently only
+supports namespace-aware processing.
+
+To test the content handler, we can produce SAX events from the tree:
+
+.. sourcecode:: pycon
+
+ >>> handler = MyContentHandler()
+ >>> lxml.sax.saxify(tree, handler)
+
+This is what we expect:
+
+.. sourcecode:: pycon
+
+ >>> handler.a_amount
+ 1
+ >>> handler.b_amount
+ 1
+ >>> handler.text
+ 'Text'
+
+
+Interfacing with pulldom/minidom
+--------------------------------
+
+lxml.sax is a simple way to interface with the XML support in the Python
+standard library. Note, however, that this is a one-way solution, as Python's
+DOM implementation cannot generate SAX events from a DOM tree.
+
+You can use ``xml.dom.pulldom`` to build a minidom document from lxml:
+
+.. sourcecode:: pycon
+
+ >>> from xml.dom.pulldom import SAX2DOM
+ >>> handler = SAX2DOM()
+ >>> lxml.sax.saxify(tree, handler)
+
+PullDOM makes the result available through the ``document`` attribute:
+
+.. sourcecode:: pycon
+
+ >>> dom = handler.document
+ >>> print(dom.firstChild.localName)
+ a
diff --git a/doc/test.xml b/doc/test.xml
new file mode 100644
index 0000000..d80a5e2
--- /dev/null
+++ b/doc/test.xml
@@ -0,0 +1 @@
+<a/>
diff --git a/doc/tutorial.txt b/doc/tutorial.txt
new file mode 100644
index 0000000..489a145
--- /dev/null
+++ b/doc/tutorial.txt
@@ -0,0 +1,1508 @@
+=======================
+The lxml.etree Tutorial
+=======================
+
+.. meta::
+ :description: The lxml tutorial on XML processing with Python
+ :keywords: XML processing with Python, lxml, lxml.etree, tutorial, ElementTree, Python, XML, HTML
+
+:Author:
+ Stefan Behnel
+
+This is a tutorial on XML processing with ``lxml.etree``. It briefly
+overviews the main concepts of the `ElementTree API`_, and some simple
+enhancements that make your life as a programmer easier.
+
+For a complete reference of the API, see the `generated API
+documentation`_.
+
+.. _`ElementTree API`: http://effbot.org/zone/element-index.htm#documentation
+.. _`generated API documentation`: api/index.html
+
+.. contents::
+..
+ 1 The Element class
+ 1.1 Elements are lists
+ 1.2 Elements carry attributes
+ 1.3 Elements contain text
+ 1.4 Using XPath to find text
+ 1.5 Tree iteration
+ 1.6 Serialisation
+ 2 The ElementTree class
+ 3 Parsing from strings and files
+ 3.1 The fromstring() function
+ 3.2 The XML() function
+ 3.3 The parse() function
+ 3.4 Parser objects
+ 3.5 Incremental parsing
+ 3.6 Event-driven parsing
+ 4 Namespaces
+ 5 The E-factory
+ 6 ElementPath
+
+
+..
+ >>> try: unicode = unicode
+ ... except (NameError, KeyError): unicode = str
+
+ >>> try: basestring = basestring
+ ... except (NameError, KeyError): basestring = str
+
+ >>> try: next = next
+ ... except NameError:
+ ... def next(it): return it.next()
+
+A common way to import ``lxml.etree`` is as follows:
+
+.. sourcecode:: pycon
+
+ >>> from lxml import etree
+
+If your code only uses the ElementTree API and does not rely on any
+functionality that is specific to ``lxml.etree``, you can also use (any part
+of) the following import chain as a fall-back to the original ElementTree:
+
+.. sourcecode:: python
+
+ try:
+ from lxml import etree
+ print("running with lxml.etree")
+ except ImportError:
+ try:
+ # Python 2.5
+ import xml.etree.cElementTree as etree
+ print("running with cElementTree on Python 2.5+")
+ except ImportError:
+ try:
+ # Python 2.5
+ import xml.etree.ElementTree as etree
+ print("running with ElementTree on Python 2.5+")
+ except ImportError:
+ try:
+ # normal cElementTree install
+ import cElementTree as etree
+ print("running with cElementTree")
+ except ImportError:
+ try:
+ # normal ElementTree install
+ import elementtree.ElementTree as etree
+ print("running with ElementTree")
+ except ImportError:
+ print("Failed to import ElementTree from any known place")
+
+To aid in writing portable code, this tutorial makes it clear in the examples
+which part of the presented API is an extension of ``lxml.etree`` over the
+original `ElementTree API`_, as defined by Fredrik Lundh's `ElementTree
+library`_.
+
+.. _`ElementTree library`: http://effbot.org/zone/element-index.htm
+
+..
+ >>> import sys
+ >>> from lxml import etree as _etree
+ >>> if sys.version_info[0] >= 3:
+ ... class etree_mock(object):
+ ... def __getattr__(self, name): return getattr(_etree, name)
+ ... def tostring(self, *args, **kwargs):
+ ... s = _etree.tostring(*args, **kwargs)
+ ... if isinstance(s, bytes) and bytes([10]) in s: s = s.decode("utf-8") # CR
+ ... if s[-1] == '\n': s = s[:-1]
+ ... return s
+ ... else:
+ ... class etree_mock(object):
+ ... def __getattr__(self, name): return getattr(_etree, name)
+ ... def tostring(self, *args, **kwargs):
+ ... s = _etree.tostring(*args, **kwargs)
+ ... if s[-1] == '\n': s = s[:-1]
+ ... return s
+ >>> etree = etree_mock()
+
+
+The Element class
+=================
+
+An ``Element`` is the main container object for the ElementTree API. Most of
+the XML tree functionality is accessed through this class. Elements are
+easily created through the ``Element`` factory:
+
+.. sourcecode:: pycon
+
+ >>> root = etree.Element("root")
+
+The XML tag name of elements is accessed through the ``tag`` property:
+
+.. sourcecode:: pycon
+
+ >>> print(root.tag)
+ root
+
+Elements are organised in an XML tree structure. To create child elements and
+add them to a parent element, you can use the ``append()`` method:
+
+.. sourcecode:: pycon
+
+ >>> root.append( etree.Element("child1") )
+
+However, this is so common that there is a shorter and much more efficient way
+to do this: the ``SubElement`` factory. It accepts the same arguments as the
+``Element`` factory, but additionally requires the parent as first argument:
+
+.. sourcecode:: pycon
+
+ >>> child2 = etree.SubElement(root, "child2")
+ >>> child3 = etree.SubElement(root, "child3")
+
+To see that this is really XML, you can serialise the tree you have created:
+
+.. sourcecode:: pycon
+
+ >>> print(etree.tostring(root, pretty_print=True))
+ <root>
+ <child1/>
+ <child2/>
+ <child3/>
+ </root>
+
+
+Elements are lists
+------------------
+
+To make access to these subelements easy and straightforward,
+elements mimic the behaviour of normal Python lists as closely as
+possible:
+
+.. sourcecode:: pycon
+
+ >>> child = root[0]
+ >>> print(child.tag)
+ child1
+
+ >>> print(len(root))
+ 3
+
+ >>> root.index(root[1]) # lxml.etree only!
+ 1
+
+ >>> children = list(root)
+
+ >>> for child in root:
+ ... print(child.tag)
+ child1
+ child2
+ child3
+
+ >>> root.insert(0, etree.Element("child0"))
+ >>> start = root[:1]
+ >>> end = root[-1:]
+
+ >>> print(start[0].tag)
+ child0
+ >>> print(end[0].tag)
+ child3
+
+Prior to ElementTree 1.3 and lxml 2.0, you could also check the truth value of
+an Element to see if it has children, i.e. if the list of children is empty:
+
+.. sourcecode:: python
+
+ if root: # this no longer works!
+ print("The root element has children")
+
+This is no longer supported, as people tend to expect that a "something"
+evaluates to True and expect Elements to be "something", whether they have
+children or not. So, many users find it surprising that any Element
+would evaluate to False in an if-statement like the above. Instead,
+use ``len(element)``, which is both more explicit and less error prone.
+
+.. sourcecode:: pycon
+
+ >>> print(etree.iselement(root)) # test if it's some kind of Element
+ True
+ >>> if len(root): # test if it has children
+ ... print("The root element has children")
+ The root element has children
+
+There is another important case where the behaviour of Elements in lxml
+(in 2.0 and later) deviates from that of lists and from that of the
+original ElementTree (prior to version 1.3 or Python 2.7/3.2):
+
+.. sourcecode:: pycon
+
+ >>> for child in root:
+ ... print(child.tag)
+ child0
+ child1
+ child2
+ child3
+ >>> root[0] = root[-1] # this moves the element in lxml.etree!
+ >>> for child in root:
+ ... print(child.tag)
+ child3
+ child1
+ child2
+
+In this example, the last element is *moved* to a different position,
+instead of being copied, i.e. it is automatically removed from its
+previous position when it is put in a different place. In lists,
+objects can appear in multiple positions at the same time, and the
+above assignment would just copy the item reference into the first
+position, so that both contain the exact same item:
+
+.. sourcecode:: pycon
+
+ >>> l = [0, 1, 2, 3]
+ >>> l[0] = l[-1]
+ >>> l
+ [3, 1, 2, 3]
+
+Note that in the original ElementTree, a single Element object can sit
+in any number of places in any number of trees, which allows for the same
+copy operation as with lists. The obvious drawback is that modifications
+to such an Element will apply to all places where it appears in a tree,
+which may or may not be intended.
+
+The upside of this difference is that an Element in ``lxml.etree`` always
+has exactly one parent, which can be queried through the ``getparent()``
+method. This is not supported in the original ElementTree.
+
+.. sourcecode:: pycon
+
+ >>> root is root[0].getparent() # lxml.etree only!
+ True
+
+If you want to *copy* an element to a different position in ``lxml.etree``,
+consider creating an independent *deep copy* using the ``copy`` module
+from Python's standard library:
+
+.. sourcecode:: pycon
+
+ >>> from copy import deepcopy
+
+ >>> element = etree.Element("neu")
+ >>> element.append( deepcopy(root[1]) )
+
+ >>> print(element[0].tag)
+ child1
+ >>> print([ c.tag for c in root ])
+ ['child3', 'child1', 'child2']
+
+The siblings (or neighbours) of an element are accessed as next and previous
+elements:
+
+.. sourcecode:: pycon
+
+ >>> root[0] is root[1].getprevious() # lxml.etree only!
+ True
+ >>> root[1] is root[0].getnext() # lxml.etree only!
+ True
+
+
+Elements carry attributes as a dict
+-----------------------------------
+
+XML elements support attributes. You can create them directly in the Element
+factory:
+
+.. sourcecode:: pycon
+
+ >>> root = etree.Element("root", interesting="totally")
+ >>> etree.tostring(root)
+ b'<root interesting="totally"/>'
+
+Attributes are just unordered name-value pairs, so a very convenient way
+of dealing with them is through the dictionary-like interface of Elements:
+
+.. sourcecode:: pycon
+
+ >>> print(root.get("interesting"))
+ totally
+
+ >>> print(root.get("hello"))
+ None
+ >>> root.set("hello", "Huhu")
+ >>> print(root.get("hello"))
+ Huhu
+
+ >>> etree.tostring(root)
+ b'<root interesting="totally" hello="Huhu"/>'
+
+ >>> sorted(root.keys())
+ ['hello', 'interesting']
+
+ >>> for name, value in sorted(root.items()):
+ ... print('%s = %r' % (name, value))
+ hello = 'Huhu'
+ interesting = 'totally'
+
+For the cases where you want to do item lookup or have other reasons for
+getting a 'real' dictionary-like object, e.g. for passing it around,
+you can use the ``attrib`` property:
+
+.. sourcecode:: pycon
+
+ >>> attributes = root.attrib
+
+ >>> print(attributes["interesting"])
+ totally
+ >>> print(attributes.get("no-such-attribute"))
+ None
+
+ >>> attributes["hello"] = "Guten Tag"
+ >>> print(attributes["hello"])
+ Guten Tag
+ >>> print(root.get("hello"))
+ Guten Tag
+
+Note that ``attrib`` is a dict-like object backed by the Element itself.
+This means that any changes to the Element are reflected in ``attrib``
+and vice versa. It also means that the XML tree stays alive in memory
+as long as the ``attrib`` of one of its Elements is in use. To get an
+independent snapshot of the attributes that does not depend on the XML
+tree, copy it into a dict:
+
+.. sourcecode:: pycon
+
+ >>> d = dict(root.attrib)
+ >>> sorted(d.items())
+ [('hello', 'Guten Tag'), ('interesting', 'totally')]
+
+
+Elements contain text
+---------------------
+
+Elements can contain text:
+
+.. sourcecode:: pycon
+
+ >>> root = etree.Element("root")
+ >>> root.text = "TEXT"
+
+ >>> print(root.text)
+ TEXT
+
+ >>> etree.tostring(root)
+ b'<root>TEXT</root>'
+
+In many XML documents (*data-centric* documents), this is the only place where
+text can be found. It is encapsulated by a leaf tag at the very bottom of the
+tree hierarchy.
+
+However, if XML is used for tagged text documents such as (X)HTML, text can
+also appear between different elements, right in the middle of the tree:
+
+.. sourcecode:: html
+
+ <html><body>Hello<br/>World</body></html>
+
+Here, the ``<br/>`` tag is surrounded by text. This is often referred to as
+*document-style* or *mixed-content* XML. Elements support this through their
+``tail`` property. It contains the text that directly follows the element, up
+to the next element in the XML tree:
+
+.. sourcecode:: pycon
+
+ >>> html = etree.Element("html")
+ >>> body = etree.SubElement(html, "body")
+ >>> body.text = "TEXT"
+
+ >>> etree.tostring(html)
+ b'<html><body>TEXT</body></html>'
+
+ >>> br = etree.SubElement(body, "br")
+ >>> etree.tostring(html)
+ b'<html><body>TEXT<br/></body></html>'
+
+ >>> br.tail = "TAIL"
+ >>> etree.tostring(html)
+ b'<html><body>TEXT<br/>TAIL</body></html>'
+
+The two properties ``.text`` and ``.tail`` are enough to represent any
+text content in an XML document. This way, the ElementTree API does
+not require any `special text nodes`_ in addition to the Element
+class, that tend to get in the way fairly often (as you might know
+from classic DOM_ APIs).
+
+However, there are cases where the tail text also gets in the way.
+For example, when you serialise an Element from within the tree, you
+do not always want its tail text in the result (although you would
+still want the tail text of its children). For this purpose, the
+``tostring()`` function accepts the keyword argument ``with_tail``:
+
+.. sourcecode:: pycon
+
+ >>> etree.tostring(br)
+ b'<br/>TAIL'
+ >>> etree.tostring(br, with_tail=False) # lxml.etree only!
+ b'<br/>'
+
+.. _`special text nodes`: http://www.w3.org/TR/DOM-Level-3-Core/core.html#ID-1312295772
+.. _DOM: http://www.w3.org/TR/DOM-Level-3-Core/core.html
+
+If you want to read *only* the text, i.e. without any intermediate
+tags, you have to recursively concatenate all ``text`` and ``tail``
+attributes in the correct order. Again, the ``tostring()`` function
+comes to the rescue, this time using the ``method`` keyword:
+
+.. sourcecode:: pycon
+
+ >>> etree.tostring(html, method="text")
+ b'TEXTTAIL'
+
+
+Using XPath to find text
+------------------------
+
+.. _XPath: xpathxslt.html#xpath
+
+Another way to extract the text content of a tree is XPath_, which
+also allows you to extract the separate text chunks into a list:
+
+.. sourcecode:: pycon
+
+ >>> print(html.xpath("string()")) # lxml.etree only!
+ TEXTTAIL
+ >>> print(html.xpath("//text()")) # lxml.etree only!
+ ['TEXT', 'TAIL']
+
+If you want to use this more often, you can wrap it in a function:
+
+.. sourcecode:: pycon
+
+ >>> build_text_list = etree.XPath("//text()") # lxml.etree only!
+ >>> print(build_text_list(html))
+ ['TEXT', 'TAIL']
+
+Note that a string result returned by XPath is a special 'smart'
+object that knows about its origins. You can ask it where it came
+from through its ``getparent()`` method, just as you would with
+Elements:
+
+.. sourcecode:: pycon
+
+ >>> texts = build_text_list(html)
+ >>> print(texts[0])
+ TEXT
+ >>> parent = texts[0].getparent()
+ >>> print(parent.tag)
+ body
+
+ >>> print(texts[1])
+ TAIL
+ >>> print(texts[1].getparent().tag)
+ br
+
+You can also find out if it's normal text content or tail text:
+
+.. sourcecode:: pycon
+
+ >>> print(texts[0].is_text)
+ True
+ >>> print(texts[1].is_text)
+ False
+ >>> print(texts[1].is_tail)
+ True
+
+While this works for the results of the ``text()`` function, lxml will
+not tell you the origin of a string value that was constructed by the
+XPath functions ``string()`` or ``concat()``:
+
+.. sourcecode:: pycon
+
+ >>> stringify = etree.XPath("string()")
+ >>> print(stringify(html))
+ TEXTTAIL
+ >>> print(stringify(html).getparent())
+ None
+
+
+Tree iteration
+--------------
+
+For problems like the above, where you want to recursively traverse the tree
+and do something with its elements, tree iteration is a very convenient
+solution. Elements provide a tree iterator for this purpose. It yields
+elements in *document order*, i.e. in the order their tags would appear if you
+serialised the tree to XML:
+
+.. sourcecode:: pycon
+
+ >>> root = etree.Element("root")
+ >>> etree.SubElement(root, "child").text = "Child 1"
+ >>> etree.SubElement(root, "child").text = "Child 2"
+ >>> etree.SubElement(root, "another").text = "Child 3"
+
+ >>> print(etree.tostring(root, pretty_print=True))
+ <root>
+ <child>Child 1</child>
+ <child>Child 2</child>
+ <another>Child 3</another>
+ </root>
+
+ >>> for element in root.iter():
+ ... print("%s - %s" % (element.tag, element.text))
+ root - None
+ child - Child 1
+ child - Child 2
+ another - Child 3
+
+If you know you are only interested in a single tag, you can pass its name to
+``iter()`` to have it filter for you. Starting with lxml 3.0, you can also
+pass more than one tag to intercept on multiple tags during iteration.
+
+.. sourcecode:: pycon
+
+ >>> for element in root.iter("child"):
+ ... print("%s - %s" % (element.tag, element.text))
+ child - Child 1
+ child - Child 2
+
+ >>> for element in root.iter("another", "child"):
+ ... print("%s - %s" % (element.tag, element.text))
+ child - Child 1
+ child - Child 2
+ another - Child 3
+
+By default, iteration yields all nodes in the tree, including
+ProcessingInstructions, Comments and Entity instances. If you want to
+make sure only Element objects are returned, you can pass the
+``Element`` factory as tag parameter:
+
+.. sourcecode:: pycon
+
+ >>> root.append(etree.Entity("#234"))
+ >>> root.append(etree.Comment("some comment"))
+
+ >>> for element in root.iter():
+ ... if isinstance(element.tag, basestring): # or 'str' in Python 3
+ ... print("%s - %s" % (element.tag, element.text))
+ ... else:
+ ... print("SPECIAL: %s - %s" % (element, element.text))
+ root - None
+ child - Child 1
+ child - Child 2
+ another - Child 3
+ SPECIAL: &#234; - &#234;
+ SPECIAL: <!--some comment--> - some comment
+
+ >>> for element in root.iter(tag=etree.Element):
+ ... print("%s - %s" % (element.tag, element.text))
+ root - None
+ child - Child 1
+ child - Child 2
+ another - Child 3
+
+ >>> for element in root.iter(tag=etree.Entity):
+ ... print(element.text)
+ &#234;
+
+Note that passing a wildcard ``"*"`` tag name will also yield all
+``Element`` nodes (and only elements).
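+
+For example, as a quick sketch on the tree from above:
+
+.. sourcecode:: pycon
+
+    >>> [el.tag for el in root.iter("*")]
+    ['root', 'child', 'child', 'another']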
+
+In ``lxml.etree``, elements provide `further iterators`_ for all directions in the
+tree: children, parents (or rather ancestors) and siblings.
+
+.. _`further iterators`: api.html#iteration
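+
+As a small sketch of these direction-specific iterators (they are
+``lxml.etree`` extensions over the ElementTree API), consider a fresh
+example tree:
+
+.. sourcecode:: pycon
+
+    >>> doc = etree.XML('<top><mid><leaf/></mid><sibling/></top>')
+
+    >>> [el.tag for el in doc.iterchildren()]         # lxml.etree only!
+    ['mid', 'sibling']
+    >>> [el.tag for el in doc[0][0].iterancestors()]  # lxml.etree only!
+    ['mid', 'top']
+    >>> [el.tag for el in doc[0].itersiblings()]      # lxml.etree only!
+    ['sibling']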
+
+
+Serialisation
+-------------
+
+Serialisation commonly uses the ``tostring()`` function that returns a
+string, or the ``ElementTree.write()`` method that writes to a file, a
+file-like object, or a URL (via FTP PUT or HTTP POST). Both calls accept
+the same keyword arguments like ``pretty_print`` for formatted output
+or ``encoding`` to select a specific output encoding other than plain
+ASCII:
+
+.. sourcecode:: pycon
+
+ >>> root = etree.XML('<root><a><b/></a></root>')
+
+ >>> etree.tostring(root)
+ b'<root><a><b/></a></root>'
+
+ >>> print(etree.tostring(root, xml_declaration=True))
+ <?xml version='1.0' encoding='ASCII'?>
+ <root><a><b/></a></root>
+
+ >>> print(etree.tostring(root, encoding='iso-8859-1'))
+ <?xml version='1.0' encoding='iso-8859-1'?>
+ <root><a><b/></a></root>
+
+ >>> print(etree.tostring(root, pretty_print=True))
+ <root>
+ <a>
+ <b/>
+ </a>
+ </root>
+
+Note that pretty printing appends a newline at the end.
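+
+The ``write()`` method is used in the same way; here is a minimal sketch
+(the file name is only an example):
+
+.. sourcecode:: python
+
+    # serialise the whole document into a file
+    tree = etree.ElementTree(root)
+    tree.write("output.xml", xml_declaration=True, encoding="UTF-8",
+               pretty_print=True)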
+
+For more fine-grained control over the pretty-printing, you can add
+whitespace indentation to the tree before serialising it, using the
+``indent()`` function (added in lxml 4.5):
+
+.. sourcecode:: pycon
+
+ >>> root = etree.XML('<root><a><b/>\n</a></root>')
+ >>> print(etree.tostring(root))
+ <root><a><b/>
+ </a></root>
+
+ >>> etree.indent(root)
+ >>> print(etree.tostring(root))
+ <root>
+ <a>
+ <b/>
+ </a>
+ </root>
+
+ >>> root.text
+ '\n '
+ >>> root[0].text
+ '\n '
+
+ >>> etree.indent(root, space=" ")
+ >>> print(etree.tostring(root))
+ <root>
+ <a>
+ <b/>
+ </a>
+ </root>
+
+ >>> etree.indent(root, space="\t")
+ >>> etree.tostring(root)
+ '<root>\n\t<a>\n\t\t<b/>\n\t</a>\n</root>'
+
+In lxml 2.0 and later (as well as ElementTree 1.3), the serialisation
+functions can do more than XML serialisation. You can serialise to
+HTML or extract the text content by passing the ``method`` keyword:
+
+.. sourcecode:: pycon
+
+ >>> root = etree.XML(
+ ... '<html><head/><body><p>Hello<br/>World</p></body></html>')
+
+ >>> etree.tostring(root) # default: method = 'xml'
+ b'<html><head/><body><p>Hello<br/>World</p></body></html>'
+
+ >>> etree.tostring(root, method='xml') # same as above
+ b'<html><head/><body><p>Hello<br/>World</p></body></html>'
+
+ >>> etree.tostring(root, method='html')
+ b'<html><head></head><body><p>Hello<br>World</p></body></html>'
+
+ >>> print(etree.tostring(root, method='html', pretty_print=True))
+ <html>
+ <head></head>
+ <body><p>Hello<br>World</p></body>
+ </html>
+
+ >>> etree.tostring(root, method='text')
+ b'HelloWorld'
+
+As for XML serialisation, the default encoding for plain text
+serialisation is ASCII:
+
+.. sourcecode:: pycon
+
+ >>> br = next(root.iter('br')) # get first result of iteration
+ >>> br.tail = u'W\xf6rld'
+
+ >>> etree.tostring(root, method='text') # doctest: +ELLIPSIS
+ Traceback (most recent call last):
+ ...
+ UnicodeEncodeError: 'ascii' codec can't encode character u'\xf6' ...
+
+ >>> etree.tostring(root, method='text', encoding="UTF-8")
+ b'HelloW\xc3\xb6rld'
+
+Here, serialising to a Python unicode string instead of a byte string
+might come in handy. Just pass the name ``'unicode'`` as encoding:
+
+.. sourcecode:: pycon
+
+ >>> etree.tostring(root, encoding='unicode', method='text')
+ u'HelloW\xf6rld'
+
+The W3C has a good `article about the Unicode character set and
+character encodings
+<http://www.w3.org/International/tutorials/tutorial-char-enc/>`_.
+
+
+The ElementTree class
+=====================
+
+An ``ElementTree`` is mainly a document wrapper around a tree with a
+root node. It provides a couple of methods for serialisation and
+general document handling.
+
+.. sourcecode:: pycon
+
+ >>> root = etree.XML('''\
+ ... <?xml version="1.0"?>
+ ... <!DOCTYPE root SYSTEM "test" [ <!ENTITY tasty "parsnips"> ]>
+ ... <root>
+ ... <a>&tasty;</a>
+ ... </root>
+ ... ''')
+
+ >>> tree = etree.ElementTree(root)
+ >>> print(tree.docinfo.xml_version)
+ 1.0
+ >>> print(tree.docinfo.doctype)
+ <!DOCTYPE root SYSTEM "test">
+
+ >>> tree.docinfo.public_id = '-//W3C//DTD XHTML 1.0 Transitional//EN'
+ >>> tree.docinfo.system_url = 'file://local.dtd'
+ >>> print(tree.docinfo.doctype)
+ <!DOCTYPE root PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "file://local.dtd">
+
+An ``ElementTree`` is also what you get back when you call the
+``parse()`` function to parse files or file-like objects (see the
+parsing section below).
+
+One of the important differences is that the ``ElementTree`` class
+serialises as a complete document, as opposed to a single ``Element``.
+This includes top-level processing instructions and comments, as well
+as a DOCTYPE and other DTD content in the document:
+
+.. sourcecode:: pycon
+
+ >>> print(etree.tostring(tree)) # lxml 1.3.4 and later
+ <!DOCTYPE root PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "file://local.dtd" [
+ <!ENTITY tasty "parsnips">
+ ]>
+ <root>
+ <a>parsnips</a>
+ </root>
+
+In the original xml.etree.ElementTree implementation and in lxml
+up to 1.3.3, the output looks the same as when serialising only
+the root Element:
+
+.. sourcecode:: pycon
+
+ >>> print(etree.tostring(tree.getroot()))
+ <root>
+ <a>parsnips</a>
+ </root>
+
+This serialisation behaviour has changed in lxml 1.3.4. Before,
+the tree was serialised without DTD content, which made lxml
+lose DTD information in an input-output cycle.
+
+
+Parsing from strings and files
+==============================
+
+``lxml.etree`` supports parsing XML in a number of ways and from all
+important sources, namely strings, files, URLs (http/ftp) and
+file-like objects. The main parse functions are ``fromstring()`` and
+``parse()``, both called with the source as first argument. By
+default, they use the standard parser, but you can always pass a
+different parser as second argument.
+
+
+The fromstring() function
+-------------------------
+
+The ``fromstring()`` function is the easiest way to parse a string:
+
+.. sourcecode:: pycon
+
+ >>> some_xml_data = "<root>data</root>"
+
+ >>> root = etree.fromstring(some_xml_data)
+ >>> print(root.tag)
+ root
+ >>> etree.tostring(root)
+ b'<root>data</root>'
+
+
+The XML() function
+------------------
+
+The ``XML()`` function behaves like the ``fromstring()`` function, but is
+commonly used to write XML literals right into the source:
+
+.. sourcecode:: pycon
+
+ >>> root = etree.XML("<root>data</root>")
+ >>> print(root.tag)
+ root
+ >>> etree.tostring(root)
+ b'<root>data</root>'
+
+There is also a corresponding function ``HTML()`` for HTML literals.
+
+.. sourcecode:: pycon
+
+ >>> root = etree.HTML("<p>data</p>")
+ >>> etree.tostring(root)
+ b'<html><body><p>data</p></body></html>'
+
+
+The parse() function
+--------------------
+
+The ``parse()`` function is used to parse from files and file-like objects.
+
+As an example of such a file-like object, the following code uses the
+``BytesIO`` class for reading from a string instead of an external file.
+That class comes from the ``io`` module in Python 2.6 and later. In older
+Python versions, you will have to use the ``StringIO`` class from the
+``StringIO`` module. However, in real life, you would obviously avoid
+doing this altogether and use the string parsing functions above.
+
+.. sourcecode:: pycon
+
+ >>> from io import BytesIO
+ >>> some_file_or_file_like_object = BytesIO(b"<root>data</root>")
+
+ >>> tree = etree.parse(some_file_or_file_like_object)
+
+ >>> etree.tostring(tree)
+ b'<root>data</root>'
+
+Note that ``parse()`` returns an ElementTree object, not an Element object as
+the string parser functions do:
+
+.. sourcecode:: pycon
+
+ >>> root = tree.getroot()
+ >>> print(root.tag)
+ root
+ >>> etree.tostring(root)
+ b'<root>data</root>'
+
+The reasoning behind this difference is that ``parse()`` returns a
+complete document from a file, while the string parsing functions are
+commonly used to parse XML fragments.
+
+The ``parse()`` function supports any of the following sources:
+
+* an open file object (make sure to open it in binary mode)
+
+* a file-like object that has a ``.read(byte_count)`` method returning
+ a byte string on each call
+
+* a filename string
+
+* an HTTP or FTP URL string
+
+Note that passing a filename or URL is usually faster than passing an
+open file or file-like object. However, the HTTP/FTP client in libxml2
+is rather simple, so things like HTTP authentication require a dedicated
+URL request library, e.g. ``urllib2`` or ``requests``. These libraries
+usually provide a file-like object for the result that you can parse
+from while the response is streaming in.
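+
+As a minimal sketch (the URL here is only an example), such a response
+object can be passed to ``parse()`` directly:
+
+.. sourcecode:: python
+
+    from urllib2 import urlopen   # use urllib.request.urlopen in Python 3
+
+    response = urlopen("http://www.example.com/data.xml")  # example URL
+    tree = etree.parse(response)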
+
+
+Parser objects
+--------------
+
+By default, ``lxml.etree`` uses a standard parser with a default setup. If
+you want to configure the parser, you can create a new instance:
+
+.. sourcecode:: pycon
+
+ >>> parser = etree.XMLParser(remove_blank_text=True) # lxml.etree only!
+
+This creates a parser that removes empty text between tags while parsing,
+which can reduce the size of the tree and avoid dangling tail text if you know
+that whitespace-only content is not meaningful for your data. An example:
+
+.. sourcecode:: pycon
+
+ >>> root = etree.XML("<root> <a/> <b> </b> </root>", parser)
+
+ >>> etree.tostring(root)
+ b'<root><a/><b> </b></root>'
+
+Note that the whitespace content inside the ``<b>`` tag was not removed, as
+content at leaf elements tends to be data content (even if blank). You can
+easily remove it in an additional step by traversing the tree:
+
+.. sourcecode:: pycon
+
+ >>> for element in root.iter("*"):
+ ... if element.text is not None and not element.text.strip():
+ ... element.text = None
+
+ >>> etree.tostring(root)
+ b'<root><a/><b/></root>'
+
+See ``help(etree.XMLParser)`` to find out about the available parser options.
+
+
+Incremental parsing
+-------------------
+
+``lxml.etree`` provides two ways for incremental step-by-step parsing. One is
+through file-like objects, where it calls the ``read()`` method repeatedly.
+This is best used where the data arrives from a source like ``urllib`` or any
+other file-like object that can provide data on request. Note that the parser
+will block and wait until data becomes available in this case:
+
+.. sourcecode:: pycon
+
+ >>> class DataSource:
+ ... data = [ b"<roo", b"t><", b"a/", b"><", b"/root>" ]
+ ... def read(self, requested_size):
+ ... try:
+ ... return self.data.pop(0)
+ ... except IndexError:
+ ... return b''
+
+ >>> tree = etree.parse(DataSource())
+
+ >>> etree.tostring(tree)
+ b'<root><a/></root>'
+
+The second way is through a feed parser interface, given by the ``feed(data)``
+and ``close()`` methods:
+
+.. sourcecode:: pycon
+
+ >>> parser = etree.XMLParser()
+
+ >>> parser.feed("<roo")
+ >>> parser.feed("t><")
+ >>> parser.feed("a/")
+ >>> parser.feed("><")
+ >>> parser.feed("/root>")
+
+ >>> root = parser.close()
+
+ >>> etree.tostring(root)
+ b'<root><a/></root>'
+
+Here, you can interrupt the parsing process at any time and continue it later
+on with another call to the ``feed()`` method. This comes in handy if you
+want to avoid blocking calls to the parser, e.g. in frameworks like Twisted,
+or whenever data comes in slowly or in chunks and you want to do other things
+while waiting for the next chunk.
+
+After calling the ``close()`` method (or when an exception was raised
+by the parser), you can reuse the parser by calling its ``feed()``
+method again:
+
+.. sourcecode:: pycon
+
+ >>> parser.feed("<root/>")
+ >>> root = parser.close()
+ >>> etree.tostring(root)
+ b'<root/>'
+
+
+Event-driven parsing
+--------------------
+
+Sometimes, all you need from a document is a small fraction somewhere deep
+inside the tree, so parsing the whole tree into memory, traversing it and
+dropping it can be too much overhead. ``lxml.etree`` supports this use case
+with two event-driven parser interfaces, one that generates parser events
+while building the tree (``iterparse``), and one that does not build the tree
+at all, and instead calls feedback methods on a target object in a SAX-like
+fashion.
+
+Here is a simple ``iterparse()`` example:
+
+.. sourcecode:: pycon
+
+ >>> some_file_like = BytesIO(b"<root><a>data</a></root>")
+
+ >>> for event, element in etree.iterparse(some_file_like):
+ ... print("%s, %4s, %s" % (event, element.tag, element.text))
+ end, a, data
+ end, root, None
+
+By default, ``iterparse()`` only generates events when it is done parsing an
+element, but you can control this through the ``events`` keyword argument:
+
+.. sourcecode:: pycon
+
+ >>> some_file_like = BytesIO(b"<root><a>data</a></root>")
+
+ >>> for event, element in etree.iterparse(some_file_like,
+ ... events=("start", "end")):
+ ... print("%5s, %4s, %s" % (event, element.tag, element.text))
+ start, root, None
+ start, a, data
+ end, a, data
+ end, root, None
+
+Note that the text, tail, and children of an Element are not necessarily present
+yet when receiving the ``start`` event. Only the ``end`` event guarantees
+that the Element has been parsed completely.
+
+It also allows you to ``.clear()`` or modify the content of an Element to
+save memory. So if you parse a large tree and you want to keep memory
+usage small, you should clean up parts of the tree that you no longer
+need. The ``keep_tail=True`` argument to ``.clear()`` makes sure that
+(tail) text content that follows the current element will not be touched.
+It is highly discouraged to modify any content that the parser may not
+have completely read through yet.
+
+.. sourcecode:: pycon
+
+ >>> some_file_like = BytesIO(
+ ... b"<root><a><b>data</b></a><a><b/></a></root>")
+
+ >>> for event, element in etree.iterparse(some_file_like):
+ ... if element.tag == 'b':
+ ... print(element.text)
+ ... elif element.tag == 'a':
+ ... print("** cleaning up the subtree")
+ ... element.clear(keep_tail=True)
+ data
+ ** cleaning up the subtree
+ None
+ ** cleaning up the subtree
+
+A very important use case for ``iterparse()`` is parsing large
+generated XML files, e.g. database dumps. Most often, these XML
+formats only have one main data item element that hangs directly below
+the root node and that is repeated thousands of times. In this case,
+it is best practice to let ``lxml.etree`` do the tree building and only to
+intercept on exactly this one Element, using the normal tree API
+for data extraction.
+
+.. sourcecode:: pycon
+
+ >>> xml_file = BytesIO(b'''\
+ ... <root>
+ ... <a><b>ABC</b><c>abc</c></a>
+ ... <a><b>MORE DATA</b><c>more data</c></a>
+ ... <a><b>XYZ</b><c>xyz</c></a>
+ ... </root>''')
+
+ >>> for _, element in etree.iterparse(xml_file, tag='a'):
+ ... print('%s -- %s' % (element.findtext('b'), element[1].text))
+ ... element.clear(keep_tail=True)
+ ABC -- abc
+ MORE DATA -- more data
+ XYZ -- xyz
+
+If, for some reason, building the tree is not desired at all, the
+target parser interface of ``lxml.etree`` can be used. It creates
+SAX-like events by calling the methods of a target object. By
+implementing some or all of these methods, you can control which
+events are generated:
+
+.. sourcecode:: pycon
+
+ >>> class ParserTarget:
+ ... events = []
+ ... close_count = 0
+ ... def start(self, tag, attrib):
+ ... self.events.append(("start", tag, attrib))
+ ... def close(self):
+ ... events, self.events = self.events, []
+ ... self.close_count += 1
+ ... return events
+
+ >>> parser_target = ParserTarget()
+
+ >>> parser = etree.XMLParser(target=parser_target)
+ >>> events = etree.fromstring('<root test="true"/>', parser)
+
+ >>> print(parser_target.close_count)
+ 1
+
+ >>> for event in events:
+ ... print('event: %s - tag: %s' % (event[0], event[1]))
+ ... for attr, value in event[2].items():
+ ... print(' * %s = %s' % (attr, value))
+ event: start - tag: root
+ * test = true
+
+You can reuse the parser and its target as often as you like, so you
+should take care that the ``.close()`` method really resets the
+target to a usable state (also in the case of an error!).
+
+.. sourcecode:: pycon
+
+ >>> events = etree.fromstring('<root test="true"/>', parser)
+ >>> print(parser_target.close_count)
+ 2
+ >>> events = etree.fromstring('<root test="true"/>', parser)
+ >>> print(parser_target.close_count)
+ 3
+ >>> events = etree.fromstring('<root test="true"/>', parser)
+ >>> print(parser_target.close_count)
+ 4
+
+ >>> for event in events:
+ ... print('event: %s - tag: %s' % (event[0], event[1]))
+ ... for attr, value in event[2].items():
+ ... print(' * %s = %s' % (attr, value))
+ event: start - tag: root
+ * test = true
+
+
+Namespaces
+==========
+
+The ElementTree API avoids
+`namespace prefixes <http://www.w3.org/TR/xml-names/#ns-qualnames>`_
+wherever possible and deploys the real namespace (the URI) instead:
+
+.. sourcecode:: pycon
+
+ >>> xhtml = etree.Element("{http://www.w3.org/1999/xhtml}html")
+ >>> body = etree.SubElement(xhtml, "{http://www.w3.org/1999/xhtml}body")
+ >>> body.text = "Hello World"
+
+ >>> print(etree.tostring(xhtml, pretty_print=True))
+ <html:html xmlns:html="http://www.w3.org/1999/xhtml">
+ <html:body>Hello World</html:body>
+ </html:html>
+
+The notation that ElementTree uses was originally brought up by
+`James Clark <http://www.jclark.com/xml/xmlns.htm>`_. It has the major
+advantage of providing a universally qualified name for a tag, regardless
+of any prefixes that may or may not have been used or defined in a document.
+By moving the indirection of prefixes out of the way, it makes
+namespace-aware code much clearer and easier to get right.
+
+As you can see from the example, prefixes only become important when
+you serialise the result. However, the above code looks somewhat
+verbose due to the lengthy namespace names. Retyping or copying a
+string over and over again is error-prone. It is therefore common
+practice to store a namespace URI in a global variable. To adapt the
+namespace prefixes for serialisation, you can also pass a mapping to
+the Element factory function, e.g. to define the default namespace:
+
+.. sourcecode:: pycon
+
+ >>> XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
+ >>> XHTML = "{%s}" % XHTML_NAMESPACE
+
+ >>> NSMAP = {None : XHTML_NAMESPACE} # the default namespace (no prefix)
+
+ >>> xhtml = etree.Element(XHTML + "html", nsmap=NSMAP) # lxml only!
+ >>> body = etree.SubElement(xhtml, XHTML + "body")
+ >>> body.text = "Hello World"
+
+ >>> print(etree.tostring(xhtml, pretty_print=True))
+ <html xmlns="http://www.w3.org/1999/xhtml">
+ <body>Hello World</body>
+ </html>
+
+You can also use the ``QName`` helper class to build or split qualified
+tag names:
+
+.. sourcecode:: pycon
+
+ >>> tag = etree.QName('http://www.w3.org/1999/xhtml', 'html')
+ >>> print(tag.localname)
+ html
+ >>> print(tag.namespace)
+ http://www.w3.org/1999/xhtml
+ >>> print(tag.text)
+ {http://www.w3.org/1999/xhtml}html
+
+ >>> tag = etree.QName('{http://www.w3.org/1999/xhtml}html')
+ >>> print(tag.localname)
+ html
+ >>> print(tag.namespace)
+ http://www.w3.org/1999/xhtml
+
+ >>> root = etree.Element('{http://www.w3.org/1999/xhtml}html')
+ >>> tag = etree.QName(root)
+ >>> print(tag.localname)
+ html
+
+ >>> tag = etree.QName(root, 'script')
+ >>> print(tag.text)
+ {http://www.w3.org/1999/xhtml}script
+ >>> tag = etree.QName('{http://www.w3.org/1999/xhtml}html', 'script')
+ >>> print(tag.text)
+ {http://www.w3.org/1999/xhtml}script
+
+lxml.etree allows you to look up the current namespaces defined for a
+node through the ``.nsmap`` property:
+
+.. sourcecode:: pycon
+
+ >>> xhtml.nsmap
+ {None: 'http://www.w3.org/1999/xhtml'}
+
+Note, however, that this includes all prefixes known in the context of
+an Element, not only those that it defines itself.
+
+.. sourcecode:: pycon
+
+ >>> root = etree.Element('root', nsmap={'a': 'http://a.b/c'})
+ >>> child = etree.SubElement(root, 'child',
+ ... nsmap={'b': 'http://b.c/d'})
+ >>> len(root.nsmap)
+ 1
+ >>> len(child.nsmap)
+ 2
+ >>> child.nsmap['a']
+ 'http://a.b/c'
+ >>> child.nsmap['b']
+ 'http://b.c/d'
+
+Therefore, modifying the returned dict cannot have any meaningful
+impact on the Element. Any changes to it are ignored.
+
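+For example, changes to the dict returned for ``child`` above do not show up
+on the Element itself:
+
+.. sourcecode:: pycon
+
+ >>> nsmap = child.nsmap
+ >>> nsmap['c'] = 'http://c.d/e'    # changes only this copy
+ >>> 'c' in child.nsmap
+ False
+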
+Namespaces on attributes work alike, but as of version 2.3, ``lxml.etree``
+will ensure that the attribute uses a prefixed namespace
+declaration. This is because unprefixed attribute names are not
+considered to be in a namespace by the XML namespace specification
+(`section 6.2`_), so they may end up losing their namespace on a
+serialise-parse roundtrip, even if they appear in a namespaced
+element.
+
+.. sourcecode:: pycon
+
+ >>> body.set(XHTML + "bgcolor", "#CCFFAA")
+
+ >>> print(etree.tostring(xhtml, pretty_print=True))
+ <html xmlns="http://www.w3.org/1999/xhtml">
+ <body xmlns:html="http://www.w3.org/1999/xhtml" html:bgcolor="#CCFFAA">Hello World</body>
+ </html>
+
+ >>> print(body.get("bgcolor"))
+ None
+ >>> body.get(XHTML + "bgcolor")
+ '#CCFFAA'
+
+.. _`section 6.2`: http://www.w3.org/TR/2009/REC-xml-names-20091208/#defaulting
+
+You can also use XPath with fully qualified names:
+
+.. sourcecode:: pycon
+
+ >>> find_xhtml_body = etree.ETXPath( # lxml only !
+ ... "//{%s}body" % XHTML_NAMESPACE)
+ >>> results = find_xhtml_body(xhtml)
+
+ >>> print(results[0].tag)
+ {http://www.w3.org/1999/xhtml}body
+
+For convenience, you can use ``"*"`` wildcards in all iterators of ``lxml.etree``,
+both for tag names and namespaces:
+
+.. sourcecode:: pycon
+
+ >>> for el in xhtml.iter('*'): print(el.tag) # any element
+ {http://www.w3.org/1999/xhtml}html
+ {http://www.w3.org/1999/xhtml}body
+ >>> for el in xhtml.iter('{http://www.w3.org/1999/xhtml}*'): print(el.tag)
+ {http://www.w3.org/1999/xhtml}html
+ {http://www.w3.org/1999/xhtml}body
+ >>> for el in xhtml.iter('{*}body'): print(el.tag)
+ {http://www.w3.org/1999/xhtml}body
+
+To look for elements that do not have a namespace, either use the
+plain tag name or provide the empty namespace explicitly:
+
+.. sourcecode:: pycon
+
+ >>> [ el.tag for el in xhtml.iter('{http://www.w3.org/1999/xhtml}body') ]
+ ['{http://www.w3.org/1999/xhtml}body']
+ >>> [ el.tag for el in xhtml.iter('body') ]
+ []
+ >>> [ el.tag for el in xhtml.iter('{}body') ]
+ []
+ >>> [ el.tag for el in xhtml.iter('{}*') ]
+ []
+
+
+The E-factory
+=============
+
+The ``E-factory`` provides a simple and compact syntax for generating XML and
+HTML:
+
+.. sourcecode:: pycon
+
+ >>> from lxml.builder import E
+
+ >>> def CLASS(*args): # class is a reserved word in Python
+ ... return {"class":' '.join(args)}
+
+ >>> html = page = (
+ ... E.html( # create an Element called "html"
+ ... E.head(
+ ... E.title("This is a sample document")
+ ... ),
+ ... E.body(
+ ... E.h1("Hello!", CLASS("title")),
+ ... E.p("This is a paragraph with ", E.b("bold"), " text in it!"),
+ ... E.p("This is another paragraph, with a", "\n ",
+ ... E.a("link", href="http://www.python.org"), "."),
+ ... E.p("Here are some reserved characters: <spam&egg>."),
+ ... etree.XML("<p>And finally an embedded XHTML fragment.</p>"),
+ ... )
+ ... )
+ ... )
+
+ >>> print(etree.tostring(page, pretty_print=True))
+ <html>
+ <head>
+ <title>This is a sample document</title>
+ </head>
+ <body>
+ <h1 class="title">Hello!</h1>
+ <p>This is a paragraph with <b>bold</b> text in it!</p>
+ <p>This is another paragraph, with a
+ <a href="http://www.python.org">link</a>.</p>
+ <p>Here are some reserved characters: &lt;spam&amp;egg&gt;.</p>
+ <p>And finally an embedded XHTML fragment.</p>
+ </body>
+ </html>
+
+Element creation based on attribute access makes it easy to build up a
+simple vocabulary for an XML language:
+
+.. sourcecode:: pycon
+
+ >>> from lxml.builder import ElementMaker # lxml only !
+
+ >>> E = ElementMaker(namespace="http://my.de/fault/namespace",
+ ... nsmap={'p' : "http://my.de/fault/namespace"})
+
+ >>> DOC = E.doc
+ >>> TITLE = E.title
+ >>> SECTION = E.section
+ >>> PAR = E.par
+
+ >>> my_doc = DOC(
+ ... TITLE("The dog and the hog"),
+ ... SECTION(
+ ... TITLE("The dog"),
+ ... PAR("Once upon a time, ..."),
+ ... PAR("And then ...")
+ ... ),
+ ... SECTION(
+ ... TITLE("The hog"),
+ ... PAR("Sooner or later ...")
+ ... )
+ ... )
+
+ >>> print(etree.tostring(my_doc, pretty_print=True))
+ <p:doc xmlns:p="http://my.de/fault/namespace">
+ <p:title>The dog and the hog</p:title>
+ <p:section>
+ <p:title>The dog</p:title>
+ <p:par>Once upon a time, ...</p:par>
+ <p:par>And then ...</p:par>
+ </p:section>
+ <p:section>
+ <p:title>The hog</p:title>
+ <p:par>Sooner or later ...</p:par>
+ </p:section>
+ </p:doc>
+
+One such example is the module ``lxml.html.builder``, which provides a
+vocabulary for HTML.
+
+When dealing with multiple namespaces, it is good practice to define
+one ElementMaker for each namespace URI. Again, note how the above
+example predefines the tag builders in named constants. That makes it
+easy to put all tag declarations of a namespace into one Python module
+and to import/use the tag name constants from there. This avoids
+pitfalls like typos or accidentally missing namespaces.
+
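+As a sketch of this practice (the namespace URIs below are made up for
+illustration), one ElementMaker per namespace keeps the prefixes apart:
+
+.. sourcecode:: pycon
+
+ >>> A = ElementMaker(namespace="http://ns.a/", nsmap={'a': "http://ns.a/"})
+ >>> B = ElementMaker(namespace="http://ns.b/", nsmap={'b': "http://ns.b/"})
+
+ >>> OUTER = A.outer
+ >>> INNER = B.inner
+
+ >>> print(etree.tostring(OUTER(INNER("text")), pretty_print=True))
+ <a:outer xmlns:a="http://ns.a/">
+   <b:inner xmlns:b="http://ns.b/">text</b:inner>
+ </a:outer>
+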
+
+ElementPath
+===========
+
+The ElementTree library comes with a simple XPath-like path language
+called ElementPath_. The main difference is that you can use the
+``{namespace}tag`` notation in ElementPath expressions. However,
+advanced features like value comparison and functions are not
+available.
+
+.. _ElementPath: http://effbot.org/zone/element-xpath.htm
+.. _`full XPath implementation`: xpathxslt.html#xpath
+
+In addition to a `full XPath implementation`_, ``lxml.etree`` supports the
+ElementPath language in the same way ElementTree does, even using
+(almost) the same implementation. The API provides four methods here
+that you can find on Elements and ElementTrees:
+
+* ``iterfind()`` iterates over all Elements that match the path
+ expression
+
+* ``findall()`` returns a list of matching Elements
+
+* ``find()`` efficiently returns only the first match
+
+* ``findtext()`` returns the ``.text`` content of the first match
+
+Here are some examples:
+
+.. sourcecode:: pycon
+
+ >>> root = etree.XML("<root><a x='123'>aText<b/><c/><b/></a></root>")
+
+Find a child of an Element:
+
+.. sourcecode:: pycon
+
+ >>> print(root.find("b"))
+ None
+ >>> print(root.find("a").tag)
+ a
+
+Find an Element anywhere in the tree:
+
+.. sourcecode:: pycon
+
+ >>> print(root.find(".//b").tag)
+ b
+ >>> [ b.tag for b in root.iterfind(".//b") ]
+ ['b', 'b']
+
+Find Elements with a certain attribute:
+
+.. sourcecode:: pycon
+
+ >>> print(root.findall(".//a[@x]")[0].tag)
+ a
+ >>> print(root.findall(".//a[@y]"))
+ []
+
+Since lxml 3.4, there is a helper that generates a structural ElementPath
+expression for an Element:
+
+.. sourcecode:: pycon
+
+ >>> tree = etree.ElementTree(root)
+ >>> a = root[0]
+ >>> print(tree.getelementpath(a[0]))
+ a/b[1]
+ >>> print(tree.getelementpath(a[1]))
+ a/c
+ >>> print(tree.getelementpath(a[2]))
+ a/b[2]
+ >>> tree.find(tree.getelementpath(a[2])) == a[2]
+ True
+
+As long as the tree is not modified, this path expression represents an
+identifier for a given element that can be used to ``find()`` it in the same
+tree later. Compared to XPath, ElementPath expressions have the advantage
+of being self-contained even for documents that use namespaces.
+
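+For instance (a small sketch with a made-up namespace), the generated path
+embeds the namespace directly, so no prefix mapping is needed to resolve it
+later:
+
+.. sourcecode:: pycon
+
+ >>> nsroot = etree.XML('<root xmlns="http://ns/"><a><b/></a></root>')
+ >>> nstree = etree.ElementTree(nsroot)
+ >>> path = nstree.getelementpath(nsroot[0][0])
+ >>> print(path)
+ {http://ns/}a/{http://ns/}b
+ >>> nstree.find(path) == nsroot[0][0]
+ True
+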
+The ``.iter()`` method is a special case that only finds specific tags
+in the tree by their name, not based on a path. That means that the
+following commands are equivalent in the success case:
+
+.. sourcecode:: pycon
+
+ >>> print(root.find(".//b").tag)
+ b
+ >>> print(next(root.iterfind(".//b")).tag)
+ b
+ >>> print(next(root.iter("b")).tag)
+ b
+
+Note that the ``.find()`` method simply returns None if no match is found,
+whereas the other two examples would raise a ``StopIteration`` exception.
diff --git a/doc/valgrind.txt b/doc/valgrind.txt
new file mode 100644
index 0000000..8df7295
--- /dev/null
+++ b/doc/valgrind.txt
@@ -0,0 +1,3 @@
+The command used to run the tests with valgrind:
+
+valgrind --tool=memcheck --leak-check=full --suppressions=valgrind-python.supp python2.7 test.py
diff --git a/doc/validation.txt b/doc/validation.txt
new file mode 100644
index 0000000..af9d007
--- /dev/null
+++ b/doc/validation.txt
@@ -0,0 +1,677 @@
+====================
+Validation with lxml
+====================
+
+Apart from the built-in DTD support in parsers, lxml currently supports three
+schema languages: DTD_, `Relax NG`_ and `XML Schema`_. All three provide
+identical APIs in lxml, represented by validator classes with the obvious
+names.
+
+.. _DTD: http://en.wikipedia.org/wiki/Document_Type_Definition
+.. _`Relax NG`: http://www.relaxng.org/
+.. _`XML Schema`: http://www.w3.org/XML/Schema
+
+lxml also provides support for ISO-`Schematron`_, based on the pure-XSLT
+`skeleton implementation`_ of Schematron:
+
+.. _Schematron: http://www.schematron.com
+.. _`skeleton implementation`: http://www.schematron.com/implementation.html
+
+There is also basic support for `pre-ISO-Schematron`_ through the libxml2
+Schematron features. However, this does not currently support error reporting
+in the validation phase due to insufficiencies in the implementation as of
+libxml2 2.6.30.
+
+.. _`pre-ISO-Schematron`: http://www.ascc.net/xml/schematron
+
+.. contents::
+..
+ 1 Validation at parse time
+ 2 DTD
+ 3 RelaxNG
+ 4 XMLSchema
+ 5 Schematron
+ 6 (Pre-ISO-Schematron)
+
+The usual setup procedure:
+
+.. sourcecode:: pycon
+
+ >>> from lxml import etree
+
+..
+ >>> try: from StringIO import StringIO
+ ... except ImportError:
+ ... from io import BytesIO
+ ... def StringIO(s):
+ ... if isinstance(s, str): s = s.encode("UTF-8")
+ ... return BytesIO(s)
+
+
+Validation at parse time
+------------------------
+
+The parser in lxml can do on-the-fly validation of a document against
+a DTD or an XML schema. The DTD is retrieved automatically based on
+the DOCTYPE of the parsed document. All you have to do is use a
+parser that has DTD validation enabled:
+
+.. sourcecode:: pycon
+
+ >>> parser = etree.XMLParser(dtd_validation=True)
+
+Obviously, a request for validation enables the DTD loading feature.
+There are two other options that enable loading the DTD, but that do
+not perform any validation. The first is the ``load_dtd`` keyword
+option, which simply loads the DTD into the parser and makes it
+available to the document as external subset. You can retrieve the
+DTD from the parsed document using the ``docinfo`` property of the
+result ElementTree object. The internal subset is available as
+``internalDTD``, the external subset is provided as ``externalDTD``.
+
+The second of these options is ``attribute_defaults``, which also loads
+the DTD and weaves
+attribute default values into the document. Again, no validation is
+performed unless explicitly requested.
+
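+For example, a small sketch that loads an internal DTD subset without
+validating and inspects the resulting DTD object via ``docinfo`` (the DTD
+object API is described in the DTD section below):
+
+.. sourcecode:: pycon
+
+ >>> parser = etree.XMLParser(load_dtd=True)
+ >>> tree = etree.parse(StringIO(
+ ...     '<!DOCTYPE a [<!ELEMENT a EMPTY>]><a/>'), parser)
+ >>> print(tree.docinfo.internalDTD.elements()[0].name)
+ a
+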
+XML schema is supported in a similar way, but requires an explicit
+schema to be provided:
+
+.. sourcecode:: pycon
+
+ >>> schema_root = etree.XML('''\
+ ... <xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">
+ ... <xsd:element name="a" type="xsd:integer"/>
+ ... </xsd:schema>
+ ... ''')
+ >>> schema = etree.XMLSchema(schema_root)
+
+ >>> parser = etree.XMLParser(schema = schema)
+ >>> root = etree.fromstring("<a>5</a>", parser)
+
+If the validation fails (be it for a DTD or an XML schema), the parser
+will raise an exception:
+
+.. sourcecode:: pycon
+
+ >>> root = etree.fromstring("<a>no int</a>", parser) # doctest: +ELLIPSIS
+ Traceback (most recent call last):
+ lxml.etree.XMLSyntaxError: Element 'a': 'no int' is not a valid value of the atomic type 'xs:integer'...
+
+If you want the parser to succeed regardless of the outcome of the
+validation, you should use a non-validating parser and run the
+validation separately after parsing the document.
+
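+A minimal sketch of this pattern, reusing the ``schema`` object from above:
+
+.. sourcecode:: pycon
+
+ >>> parser = etree.XMLParser()            # no validation during parsing
+ >>> root = etree.fromstring("<a>no int</a>", parser)
+ >>> schema.validate(root)
+ False
+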
+
+DTD
+---
+
+As described above, the parser support for DTDs depends on internal or
+external subsets of the XML file. This means that the XML file itself
+must either contain a DTD or must reference a DTD to make this work.
+If you want to validate an XML document against a DTD that is not
+referenced by the document itself, you can use the ``DTD`` class.
+
+To use the ``DTD`` class, you must first pass a filename or file-like object
+into the constructor to parse a DTD:
+
+.. sourcecode:: pycon
+
+ >>> f = StringIO("<!ELEMENT b EMPTY>")
+ >>> dtd = etree.DTD(f)
+
+Now you can use it to validate documents:
+
+.. sourcecode:: pycon
+
+ >>> root = etree.XML("<b/>")
+ >>> print(dtd.validate(root))
+ True
+
+ >>> root = etree.XML("<b><a/></b>")
+ >>> print(dtd.validate(root))
+ False
+
+The reason for the validation failure can be found in the error log:
+
+.. sourcecode:: pycon
+
+ >>> print(dtd.error_log.filter_from_errors()[0])
+ <string>:1:0:ERROR:VALID:DTD_NOT_EMPTY: Element b was declared EMPTY this one has content
+
+As an alternative to parsing from a file, you can use the
+``external_id`` keyword argument to parse from a catalog. The
+following example reads the DocBook DTD in version 4.2, if available
+in the system catalog:
+
+.. sourcecode:: python
+
+ dtd = etree.DTD(external_id = "-//OASIS//DTD DocBook XML V4.2//EN")
+
+The DTD information is available as attributes on the DTD object. The method
+``iterelements`` provides an iterator over the element declarations:
+
+.. sourcecode:: pycon
+
+ >>> dtd = etree.DTD(StringIO('<!ELEMENT a EMPTY><!ELEMENT b EMPTY>'))
+ >>> for el in dtd.iterelements():
+ ... print(el.name)
+ a
+ b
+
+The method ``elements`` returns the element declarations as a list:
+
+.. sourcecode:: pycon
+
+ >>> dtd = etree.DTD(StringIO('<!ELEMENT a EMPTY><!ELEMENT b EMPTY>'))
+ >>> len(dtd.elements())
+ 2
+
+An element declaration object provides the following attributes/methods:
+
+ - ``name``: The name of the element;
+
+ - ``type``: The element type, one of "undefined", "empty", "any", "mixed", or "element";
+
+ - ``content``: Element content declaration (see below);
+
+ - ``iterattributes()``: Return an iterator over attribute declarations (see below);
+
+ - ``attributes()``: Return a list of attribute declarations.
+
+The ``content`` attribute contains information about the content model of the element.
+These element content declaration objects form a binary tree (via the ``left`` and ``right``
+attributes), which makes it possible to reconstruct the content model expression. Here's a
+list of all attributes:
+
+ - ``name``: If this object represents an element in the content model expression,
+ ``name`` is the name of the element, otherwise it is ``None``;
+
+ - ``type``: The type of the node: one of "pcdata", "element", "seq", or "or";
+
+ - ``occur``: How often this element (or this combination of elements) may occur:
+ one of "once", "opt", "mult", or "plus"
+
+ - ``left``: The left hand subexpression
+
+ - ``right``: The right hand subexpression
+
+For example, the element declaration ``<!ELEMENT a (a|b)+>`` results
+in the following element content declaration objects:
+
+.. sourcecode:: pycon
+
+ >>> dtd = etree.DTD(StringIO('<!ELEMENT a (a|b)+>'))
+ >>> content = dtd.elements()[0].content
+ >>> content.type, content.occur, content.name
+ ('or', 'plus', None)
+
+ >>> left, right = content.left, content.right
+ >>> left.type, left.occur, left.name
+ ('element', 'once', 'a')
+ >>> right.type, right.occur, right.name
+ ('element', 'once', 'b')
+
+Attribute declarations have the following attributes/methods (see the sketch below):
+
+ - ``name``: The name of the attribute;
+
+ - ``elemname``: The name of the element the attribute belongs to;
+
+ - ``type``: The attribute type, one of "cdata", "id", "idref", "idrefs", "entity",
+ "entities", "nmtoken", "nmtokens", "enumeration", or "notation";
+
+ - ``default``: The type of the default value, one of "none", "required", "implied",
+ or "fixed";
+
+ - ``defaultValue``: The default value;
+
+ - ``itervalues()``: Return an iterator over the allowed attribute values (if the attribute
+ is of type "enumeration");
+
+ - ``values()``: Return a list of allowed attribute values.
+
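+For example, a small sketch (the attribute name and its values are made up
+for illustration):
+
+.. sourcecode:: pycon
+
+ >>> dtd = etree.DTD(StringIO(
+ ...     '<!ELEMENT a EMPTY><!ATTLIST a b (x|y) "x">'))
+ >>> attr = next(dtd.elements()[0].iterattributes())
+ >>> attr.name, attr.elemname, attr.type
+ ('b', 'a', 'enumeration')
+ >>> attr.defaultValue, attr.values()
+ ('x', ['x', 'y'])
+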
+Entity declarations are available via the ``iterentities`` and ``entities`` methods:
+
+.. sourcecode:: pycon
+
+ >>> dtd = etree.DTD(StringIO('<!ENTITY hurz "&#x40;">'))
+ >>> entity = dtd.entities()[0]
+ >>> entity.name, entity.orig, entity.content
+ ('hurz', '&#x40;', '@')
+
+
+RelaxNG
+-------
+
+The ``RelaxNG`` class takes an ElementTree object to construct a Relax NG
+validator:
+
+.. sourcecode:: pycon
+
+ >>> f = StringIO('''\
+ ... <element name="a" xmlns="http://relaxng.org/ns/structure/1.0">
+ ... <zeroOrMore>
+ ... <element name="b">
+ ... <text />
+ ... </element>
+ ... </zeroOrMore>
+ ... </element>
+ ... ''')
+ >>> relaxng_doc = etree.parse(f)
+ >>> relaxng = etree.RelaxNG(relaxng_doc)
+
+Alternatively, pass a filename to the ``file`` keyword argument to parse from
+a file. This also enables correct handling of include files from within the
+RelaxNG parser.
+
+You can then validate some ElementTree document against the schema. You'll get
+back True if the document is valid against the Relax NG schema, and False if
+not:
+
+.. sourcecode:: pycon
+
+ >>> valid = StringIO('<a><b></b></a>')
+ >>> doc = etree.parse(valid)
+ >>> relaxng.validate(doc)
+ True
+
+ >>> invalid = StringIO('<a><c></c></a>')
+ >>> doc2 = etree.parse(invalid)
+ >>> relaxng.validate(doc2)
+ False
+
+Calling the schema object has the same effect as calling its validate
+method. This is sometimes used in conditional statements:
+
+.. sourcecode:: pycon
+
+ >>> invalid = StringIO('<a><c></c></a>')
+ >>> doc2 = etree.parse(invalid)
+ >>> if not relaxng(doc2):
+ ... print("invalid!")
+ invalid!
+
+If you prefer getting an exception when validating, you can use the
+``assert_`` or ``assertValid`` methods:
+
+.. sourcecode:: pycon
+
+ >>> relaxng.assertValid(doc2)
+ Traceback (most recent call last):
+ ...
+ lxml.etree.DocumentInvalid: Did not expect element c there, line 1
+
+ >>> relaxng.assert_(doc2)
+ Traceback (most recent call last):
+ ...
+ AssertionError: Did not expect element c there, line 1
+
+If you want to find out why the validation failed in the second case, you can
+look up the error log of the validation process and check it for relevant
+messages:
+
+.. sourcecode:: pycon
+
+ >>> log = relaxng.error_log
+ >>> print(log.last_error)
+ <string>:1:0:ERROR:RELAXNGV:RELAXNG_ERR_ELEMWRONG: Did not expect element c there
+
+You can see that the error (ERROR) happened during RelaxNG validation
+(RELAXNGV). The message then tells you what went wrong. You can also
+look at the error domain and its type directly:
+
+.. sourcecode:: pycon
+
+ >>> error = log.last_error
+ >>> print(error.domain_name)
+ RELAXNGV
+ >>> print(error.type_name)
+ RELAXNG_ERR_ELEMWRONG
+
+Note that this error log is local to the RelaxNG object. It will only
+contain log entries that appeared during the validation.
+
+Similar to XSLT, there's also a less efficient but easier shortcut method to
+do one-shot RelaxNG validation:
+
+.. sourcecode:: pycon
+
+ >>> doc.relaxng(relaxng_doc)
+ True
+ >>> doc2.relaxng(relaxng_doc)
+ False
+
+libxml2 does not currently support the `RelaxNG Compact Syntax`_.
+However, if `rnc2rng`_ is installed, lxml 3.6 and later can use it
+internally to parse the input schema. It recognises the `.rnc` file
+extension and also allows parsing an RNC schema from a string using
+`RelaxNG.from_rnc_string()`.
+
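+For example, a sketch of the compact equivalent of the schema above
+(this requires rnc2rng to be installed):
+
+.. sourcecode:: python
+
+ relaxng = etree.RelaxNG.from_rnc_string(
+     "element a { element b { text }* }")
+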
+Alternatively, the trang_ translator can convert the compact syntax
+to the XML syntax, which can then be used with lxml.
+
+.. _`rnc2rng`: https://pypi.python.org/pypi/rnc2rng
+.. _`RelaxNG Compact Syntax`: http://relaxng.org/compact-tutorial.html
+.. _trang: http://www.thaiopensource.com/relaxng/trang.html
+
+
+XMLSchema
+---------
+
+lxml.etree also has XML Schema (XSD) support, using the class
+lxml.etree.XMLSchema. The API is very similar to the Relax NG and DTD
+classes. Pass an ElementTree object to construct an XMLSchema validator:
+
+.. sourcecode:: pycon
+
+ >>> f = StringIO('''\
+ ... <xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">
+ ... <xsd:element name="a" type="AType"/>
+ ... <xsd:complexType name="AType">
+ ... <xsd:sequence>
+ ... <xsd:element name="b" type="xsd:string" />
+ ... </xsd:sequence>
+ ... </xsd:complexType>
+ ... </xsd:schema>
+ ... ''')
+ >>> xmlschema_doc = etree.parse(f)
+ >>> xmlschema = etree.XMLSchema(xmlschema_doc)
+
+You can then validate some ElementTree document with this. Like with RelaxNG,
+you'll get back true if the document is valid against the XML schema, and
+false if not:
+
+.. sourcecode:: pycon
+
+ >>> valid = StringIO('<a><b></b></a>')
+ >>> doc = etree.parse(valid)
+ >>> xmlschema.validate(doc)
+ True
+
+ >>> invalid = StringIO('<a><c></c></a>')
+ >>> doc2 = etree.parse(invalid)
+ >>> xmlschema.validate(doc2)
+ False
+
+Calling the schema object has the same effect as calling its validate method.
+This is sometimes used in conditional statements:
+
+.. sourcecode:: pycon
+
+ >>> invalid = StringIO('<a><c></c></a>')
+ >>> doc2 = etree.parse(invalid)
+ >>> if not xmlschema(doc2):
+ ... print("invalid!")
+ invalid!
+
+If you prefer getting an exception when validating, you can use the
+``assert_`` or ``assertValid`` methods:
+
+.. sourcecode:: pycon
+
+ >>> xmlschema.assertValid(doc2)
+ Traceback (most recent call last):
+ ...
+ lxml.etree.DocumentInvalid: Element 'c': This element is not expected. Expected is ( b )., line 1
+
+ >>> xmlschema.assert_(doc2)
+ Traceback (most recent call last):
+ ...
+ AssertionError: Element 'c': This element is not expected. Expected is ( b )., line 1
+
+Error reporting works as for the RelaxNG class:
+
+.. sourcecode:: pycon
+
+ >>> log = xmlschema.error_log
+ >>> error = log.last_error
+ >>> print(error.domain_name)
+ SCHEMASV
+ >>> print(error.type_name)
+ SCHEMAV_ELEMENT_CONTENT
+
+If you were to print this log entry, you would get something like the
+following. Note that the error message depends on the libxml2 version in
+use::
+
+ <string>:1:ERROR::SCHEMAV_ELEMENT_CONTENT: Element 'c': This element is not expected. Expected is ( b ).
+
+Similar to XSLT and RelaxNG, there's also a less efficient but easier shortcut
+method to do XML Schema validation:
+
+.. sourcecode:: pycon
+
+ >>> doc.xmlschema(xmlschema_doc)
+ True
+ >>> doc2.xmlschema(xmlschema_doc)
+ False
+
+Schematron
+----------
+
+From version 2.3 on, lxml features ISO-`Schematron`_ support built on the
+de-facto reference implementation of Schematron, the pure-XSLT-1.0
+`skeleton implementation`_. This is provided by the lxml.isoschematron package,
+which implements the Schematron class with an API compatible with the other
+validators'. Pass an Element or ElementTree object to construct a Schematron
+validator:
+
+.. sourcecode:: pycon
+
+ >>> from lxml import isoschematron
+ >>> f = StringIO('''\
+ ... <schema xmlns="http://purl.oclc.org/dsdl/schematron" >
+ ... <pattern id="sum_equals_100_percent">
+ ... <title>Sum equals 100%.</title>
+ ... <rule context="Total">
+ ... <assert test="sum(//Percent)=100">Sum is not 100%.</assert>
+ ... </rule>
+ ... </pattern>
+ ... </schema>
+ ... ''')
+
+ >>> sct_doc = etree.parse(f)
+ >>> schematron = isoschematron.Schematron(sct_doc)
+
+You can then validate some ElementTree document with this. Just like with
+XMLSchema or RelaxNG, you'll get back true if the document is valid against the
+schema, and false if not:
+
+.. sourcecode:: pycon
+
+ >>> valid = StringIO('''\
+ ... <Total>
+ ... <Percent>20</Percent>
+ ... <Percent>30</Percent>
+ ... <Percent>50</Percent>
+ ... </Total>
+ ... ''')
+
+ >>> doc = etree.parse(valid)
+ >>> schematron.validate(doc)
+ True
+
+ >>> etree.SubElement(doc.getroot(), "Percent").text = "10"
+
+ >>> schematron.validate(doc)
+ False
+
+Calling the schema object has the same effect as calling its validate method.
+This can be useful for conditional statements:
+
+.. sourcecode:: pycon
+
+ >>> is_valid = isoschematron.Schematron(sct_doc)
+
+ >>> if not is_valid(doc):
+ ... print("invalid!")
+ invalid!
+
+Built on a pure-XSLT implementation, the actual validator is created as an
+XSLT 1.0 stylesheet using these steps:
+
+0. (Extract embedded Schematron from XML Schema or RelaxNG schema)
+1. Process inclusions
+2. Process abstract patterns
+3. Compile the schematron schema to XSLT
+
+To allow more control over the individual steps, isoschematron.Schematron
+supports an extended API:
+
+The ``include`` and ``expand`` keyword arguments can be used to switch off
+steps 1) and 2).
+
+To set parameters for steps 1), 2) and 3), dictionaries containing XSLT
+parameters can be provided using the keyword arguments ``include_params``,
+``expand_params`` or ``compile_params``. Schematron automatically converts these
+parameters to stylesheet parameters, so you do not need to quote string
+parameters yourself or to use XSLT.strparam(). If you ever need to pass an
+XPath as an argument to the XSLT stylesheet, you can pass in an etree.XPath
+object (see XPath and XSLT with lxml: Stylesheet-parameters_ for background on
+this).
+
+The ``phase`` parameter of the compile step is additionally exposed as a keyword
+argument. If set, it overrides any occurrence in ``compile_params``. Note that
+isoschematron.Schematron might expose more common parameters as additional keyword
+arguments in the future.
+
+By setting ``store_schematron`` to True, the (included-and-expanded) schematron
+document tree is stored and made available through the ``schematron`` property.
+
+Similarly, setting ``store_xslt`` to True will result in the validation XSLT
+document tree being kept; it can be retrieved through the ``validator_xslt``
+property.
+
+Finally, with ``store_report`` set to True (default: False), the resulting
+validation report document gets stored and can be accessed as the
+``validation_report`` property.
+
+.. _Stylesheet-parameters: xpathxslt.html#stylesheet-parameters
+
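+For instance, a small sketch (reusing the schema and the meanwhile invalid
+document from above) that keeps the SVRL report of the last validation run:
+
+.. sourcecode:: pycon
+
+ >>> schematron = isoschematron.Schematron(sct_doc, store_report=True)
+ >>> schematron.validate(doc)
+ False
+ >>> report_root = schematron.validation_report.getroot()
+ >>> print(etree.QName(report_root).localname)
+ schematron-output
+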
+Using the ``phase`` parameter of isoschematron.Schematron allows for selective
+validation of predefined pattern groups:
+
+.. sourcecode:: pycon
+
+ >>> f = StringIO('''\
+ ... <schema xmlns="http://purl.oclc.org/dsdl/schematron" >
+ ... <phase id="phase.sum_check">
+ ... <active pattern="sum_equals_100_percent"/>
+ ... </phase>
+ ... <phase id="phase.entries_check">
+ ... <active pattern="all_positive"/>
+ ... </phase>
+ ... <pattern id="sum_equals_100_percent">
+ ... <title>Sum equals 100%.</title>
+ ... <rule context="Total">
+ ... <assert test="sum(//Percent)=100">Sum is not 100%.</assert>
+ ... </rule>
+ ... </pattern>
+ ... <pattern id="all_positive">
+ ... <title>All entries must be positive.</title>
+ ... <rule context="Percent">
+ ... <assert test="number(.)>0">Number (<value-of select="."/>) not positive</assert>
+ ... </rule>
+ ... </pattern>
+ ... </schema>
+ ... ''')
+
+ >>> sct_doc = etree.parse(f)
+ >>> schematron = isoschematron.Schematron(sct_doc)
+
+ >>> valid = StringIO('''\
+ ... <Total>
+ ... <Percent>20</Percent>
+ ... <Percent>30</Percent>
+ ... <Percent>50</Percent>
+ ... </Total>
+ ... ''')
+
+ >>> doc = etree.parse(valid)
+ >>> schematron.validate(doc)
+ True
+
+ >>> invalid_positive = StringIO('''\
+ ... <Total>
+ ... <Percent>0</Percent>
+ ... <Percent>50</Percent>
+ ... <Percent>50</Percent>
+ ... </Total>
+ ... ''')
+
+ >>> doc = etree.parse(invalid_positive)
+
+ >>> schematron.validate(doc)
+ False
+
+If the constraint of Percent entries being positive is not of interest in a
+certain validation scenario, it can now be disabled:
+
+.. sourcecode:: pycon
+
+ >>> selective = isoschematron.Schematron(sct_doc, phase="phase.sum_check")
+ >>> selective.validate(doc)
+ True
+
+The usage of validation phases is a unique feature of ISO-Schematron and can be
+a very powerful tool, e.g. for establishing validation stages or for providing
+different validators for different "validation audiences".
+
+(Pre-ISO-Schematron)
+--------------------
+
+Since version 2.0, lxml.etree features `pre-ISO-Schematron`_ support, using the
+class lxml.etree.Schematron. It requires at least libxml2 2.6.21 to
+work. The API is the same as for the other validators. Pass an
+ElementTree object to construct a Schematron validator:
+
+.. sourcecode:: pycon
+
+ >>> f = StringIO('''\
+ ... <schema xmlns="http://www.ascc.net/xml/schematron" >
+ ... <pattern name="Sum equals 100%.">
+ ... <rule context="Total">
+ ... <assert test="sum(//Percent)=100">Sum is not 100%.</assert>
+ ... </rule>
+ ... </pattern>
+ ... </schema>
+ ... ''')
+
+ >>> sct_doc = etree.parse(f)
+ >>> schematron = etree.Schematron(sct_doc)
+
+You can then validate some ElementTree document with this. Like with RelaxNG,
+you'll get back true if the document is valid against the schema, and false if
+not:
+
+.. sourcecode:: pycon
+
+ >>> valid = StringIO('''\
+ ... <Total>
+ ... <Percent>20</Percent>
+ ... <Percent>30</Percent>
+ ... <Percent>50</Percent>
+ ... </Total>
+ ... ''')
+
+ >>> doc = etree.parse(valid)
+ >>> schematron.validate(doc)
+ True
+
+ >>> etree.SubElement(doc.getroot(), "Percent").text = "10"
+
+ >>> schematron.validate(doc)
+ False
+
+Calling the schema object has the same effect as calling its validate method.
+This is sometimes used in conditional statements:
+
+.. sourcecode:: pycon
+
+ >>> is_valid = etree.Schematron(sct_doc)
+
+ >>> if not is_valid(doc):
+ ... print("invalid!")
+ invalid!
+
+Note that libxml2 restricts error reporting to the parsing step (when creating
+the Schematron instance). There is not currently any support for error
+reporting during validation.
diff --git a/doc/xpathxslt.txt b/doc/xpathxslt.txt
new file mode 100644
index 0000000..8b2870e
--- /dev/null
+++ b/doc/xpathxslt.txt
@@ -0,0 +1,785 @@
+========================
+XPath and XSLT with lxml
+========================
+
+lxml supports XPath 1.0, XSLT 1.0 and the EXSLT extensions through
+libxml2 and libxslt in a standards compliant way.
+
+.. contents::
+..
+ 1 XPath
+ 1.1 The ``xpath()`` method
+ 1.2 Namespaces and prefixes
+ 1.3 XPath return values
+ 1.4 Generating XPath expressions
+ 1.5 The ``XPath`` class
+ 1.6 Regular expressions in XPath
+ 1.7 The ``XPathEvaluator`` classes
+ 1.8 ``ETXPath``
+ 1.9 Error handling
+ 2 XSLT
+ 2.1 XSLT result objects
+ 2.2 Stylesheet parameters
+ 2.3 Errors and messages
+ 2.4 The ``xslt()`` tree method
+ 2.5 Dealing with stylesheet complexity
+ 2.6 Profiling
+
+The usual setup procedure:
+
+.. sourcecode:: pycon
+
+ >>> from lxml import etree
+
+..
+ >>> try: from StringIO import StringIO
+ ... except ImportError:
+ ... from io import BytesIO
+ ... def StringIO(s):
+ ... if isinstance(s, str): s = s.encode("UTF-8")
+ ... return BytesIO(s)
+
+ >>> import sys
+ >>> if sys.version_info[0] == 2:
+ ... from __builtin__ import unicode as str
+
+
+XPath
+=====
+
+lxml.etree supports the simple path syntax of the `find, findall and
+findtext`_ methods on ElementTree and Element, as known from the original
+ElementTree library (ElementPath_). As an lxml specific extension, these
+classes also provide an ``xpath()`` method that supports expressions in the
+complete XPath syntax, as well as `custom extension functions`_.
+
+.. _ElementPath: http://effbot.org/zone/element-xpath.htm
+.. _`find, findall and findtext`: http://effbot.org/zone/element.htm#searching-for-subelements
+.. _`custom extension functions`: extensions.html#xpath-extension-functions
+.. _`XSLT extension elements`: extensions.html#xslt-extension-elements
+
+There are also specialized XPath evaluator classes that are more efficient for
+frequent evaluation: ``XPath`` and ``XPathEvaluator``. See the `performance
+comparison`_ to learn when to use which. Their semantics when used on
+Elements and ElementTrees are the same as for the ``xpath()`` method described
+here.
+
+Note that the ``.find*()`` methods are usually faster than the full-blown XPath
+support. They also support incremental tree processing through the ``.iterfind()``
+method, whereas XPath always collects all results before returning them.
+
+.. _`performance comparison`: performance.html#xpath
+
+
+The ``xpath()`` method
+----------------------
+
+For ElementTree, the ``xpath()`` method performs a global XPath query against the
+document (if absolute) or against the root node (if relative):
+
+.. sourcecode:: pycon
+
+ >>> f = StringIO('<foo><bar></bar></foo>')
+ >>> tree = etree.parse(f)
+
+ >>> r = tree.xpath('/foo/bar')
+ >>> len(r)
+ 1
+ >>> r[0].tag
+ 'bar'
+
+ >>> r = tree.xpath('bar')
+ >>> r[0].tag
+ 'bar'
+
+When ``xpath()`` is used on an Element, the XPath expression is evaluated
+against the element (if relative) or against the root tree (if absolute):
+
+.. sourcecode:: pycon
+
+ >>> root = tree.getroot()
+ >>> r = root.xpath('bar')
+ >>> r[0].tag
+ 'bar'
+
+ >>> bar = root[0]
+ >>> r = bar.xpath('/foo/bar')
+ >>> r[0].tag
+ 'bar'
+
+ >>> tree = bar.getroottree()
+ >>> r = tree.xpath('/foo/bar')
+ >>> r[0].tag
+ 'bar'
+
+The ``xpath()`` method has support for XPath variables:
+
+.. sourcecode:: pycon
+
+ >>> expr = "//*[local-name() = $name]"
+
+ >>> print(root.xpath(expr, name = "foo")[0].tag)
+ foo
+
+ >>> print(root.xpath(expr, name = "bar")[0].tag)
+ bar
+
+ >>> print(root.xpath("$text", text = "Hello World!"))
+ Hello World!
+
+
+Namespaces and prefixes
+-----------------------
+
+If your XPath expression uses namespace prefixes, you must define them
+in a prefix mapping. To this end, pass a dictionary to the
+``namespaces`` keyword argument that maps the namespace prefixes used
+in the XPath expression to namespace URIs:
+
+.. sourcecode:: pycon
+
+ >>> f = StringIO('''\
+ ... <a:foo xmlns:a="http://codespeak.net/ns/test1"
+ ... xmlns:b="http://codespeak.net/ns/test2">
+ ... <b:bar>Text</b:bar>
+ ... </a:foo>
+ ... ''')
+ >>> doc = etree.parse(f)
+
+ >>> r = doc.xpath('/x:foo/b:bar',
+ ... namespaces={'x': 'http://codespeak.net/ns/test1',
+ ... 'b': 'http://codespeak.net/ns/test2'})
+ >>> len(r)
+ 1
+ >>> r[0].tag
+ '{http://codespeak.net/ns/test2}bar'
+ >>> r[0].text
+ 'Text'
+
+The prefixes you choose here are not linked to the prefixes used
+inside the XML document. The document may define whatever prefixes it
+likes, including the empty prefix, without breaking the above code.
+
+Note that XPath does not have a notion of a default namespace. The
+empty prefix is therefore undefined for XPath and cannot be used in
+namespace prefix mappings.
+
+There is also an optional ``extensions`` argument which is used to
+define `custom extension functions`_ in Python that are local to this
+evaluation. The namespace prefixes that they use in the XPath
+expression must also be defined in the namespace prefix mapping.
+
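+For example, a small sketch (the extension function and its namespace URI are
+made up for illustration):
+
+.. sourcecode:: pycon
+
+ >>> def shout(context, s):
+ ...     return s.upper()
+
+ >>> print(doc.xpath("my:shout(string(//b:bar))",
+ ...                 namespaces={'my': 'urn:example:functions',
+ ...                             'b': 'http://codespeak.net/ns/test2'},
+ ...                 extensions={('urn:example:functions', 'shout'): shout}))
+ TEXT
+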
+
+XPath return values
+-------------------
+
+The return value types of XPath evaluations vary, depending on the
+XPath expression used:
+
+* True or False, when the XPath expression has a boolean result
+
+* a float, when the XPath expression has a numeric result (integer or float)
+
+* a 'smart' string (as described below), when the XPath expression has
+ a string result.
+
+* a list of items, when the XPath expression has a list as result.
+ The items may include Elements (also comments and processing
+ instructions), strings and tuples. Text nodes and attributes in the
+ result are returned as 'smart' string values. Namespace
+ declarations are returned as tuples of strings: ``(prefix, URI)``.
+
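+For illustration, a quick sketch of these result types:
+
+.. sourcecode:: pycon
+
+ >>> root = etree.XML("<root><a>1</a><a>2</a></root>")
+ >>> root.xpath("count(//a) = 2")
+ True
+ >>> root.xpath("count(//a)")
+ 2.0
+ >>> root.xpath("string(//a[1])")
+ '1'
+ >>> [el.text for el in root.xpath("//a")]
+ ['1', '2']
+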
+XPath string results are 'smart' in that they provide a
+``getparent()`` method that knows their origin:
+
+* for attribute values, ``result.getparent()`` returns the Element
+ that carries them. An example is ``//foo/@attribute``, where the
+ parent would be a ``foo`` Element.
+
+* for the ``text()`` function (as in ``//text()``), it returns the
+ Element that contains the text or tail that was returned.
+
+You can distinguish between different text origins with the boolean
+properties ``is_text``, ``is_tail`` and ``is_attribute``.
+
+Note that ``getparent()`` may not always return an Element. For
+example, the XPath functions ``string()`` and ``concat()`` will
+construct strings that do not have an origin. For them,
+``getparent()`` will return None.
+
+There are certain cases where the smart string behaviour is
+undesirable. For example, it means that the tree will be kept alive
+by the string, which may have a considerable memory impact in the case
+that the string value is the only thing in the tree that is actually
+of interest. For these cases, you can deactivate the parental
+relationship using the keyword argument ``smart_strings``.
+
+.. sourcecode:: pycon
+
+ >>> root = etree.XML("<root><a>TEXT</a></root>")
+
+ >>> find_text = etree.XPath("//text()")
+ >>> text = find_text(root)[0]
+ >>> print(text)
+ TEXT
+ >>> print(text.getparent().text)
+ TEXT
+
+ >>> find_text = etree.XPath("//text()", smart_strings=False)
+ >>> text = find_text(root)[0]
+ >>> print(text)
+ TEXT
+ >>> hasattr(text, 'getparent')
+ False
+
+
+Generating XPath expressions
+----------------------------
+
+ElementTree objects have a method ``getpath(element)``, which returns a
+structural, absolute XPath expression to find that element:
+
+.. sourcecode:: pycon
+
+ >>> a = etree.Element("a")
+ >>> b = etree.SubElement(a, "b")
+ >>> c = etree.SubElement(a, "c")
+ >>> d1 = etree.SubElement(c, "d")
+ >>> d2 = etree.SubElement(c, "d")
+
+ >>> tree = etree.ElementTree(c)
+ >>> print(tree.getpath(d2))
+ /c/d[2]
+ >>> tree.xpath(tree.getpath(d2)) == [d2]
+ True
+
+
+The ``XPath`` class
+-------------------
+
+The ``XPath`` class compiles an XPath expression into a callable function:
+
+.. sourcecode:: pycon
+
+ >>> root = etree.XML("<root><a><b/></a><b/></root>")
+
+ >>> find = etree.XPath("//b")
+ >>> print(find(root)[0].tag)
+ b
+
+The compilation takes as much time as in the ``xpath()`` method, but it is
+done only once per class instantiation. This makes it especially efficient
+for repeated evaluation of the same XPath expression.
+
+Just like the ``xpath()`` method, the ``XPath`` class supports XPath
+variables:
+
+.. sourcecode:: pycon
+
+ >>> count_elements = etree.XPath("count(//*[local-name() = $name])")
+
+ >>> print(count_elements(root, name = "a"))
+ 1.0
+ >>> print(count_elements(root, name = "b"))
+ 2.0
+
+This supports very efficient evaluation of modified versions of an XPath
+expression, as compilation is still only required once.
+
+Prefix-to-namespace mappings can be passed as second parameter:
+
+.. sourcecode:: pycon
+
+ >>> root = etree.XML("<root xmlns='NS'><a><b/></a><b/></root>")
+
+ >>> find = etree.XPath("//n:b", namespaces={'n':'NS'})
+ >>> print(find(root)[0].tag)
+ {NS}b
+
+
+Regular expressions in XPath
+----------------------------
+
+By default, ``XPath`` supports regular expressions in the EXSLT_ namespace:
+
+.. sourcecode:: pycon
+
+ >>> regexpNS = "http://exslt.org/regular-expressions"
+ >>> find = etree.XPath("//*[re:test(., '^abc$', 'i')]",
+ ... namespaces={'re':regexpNS})
+
+ >>> root = etree.XML("<root><a>aB</a><b>aBc</b></root>")
+ >>> print(find(root)[0].text)
+ aBc
+
+.. _EXSLT: http://www.exslt.org/
+
+You can disable this with the boolean keyword argument ``regexp``, which
+defaults to True.
+
+
+The ``XPathEvaluator`` classes
+------------------------------
+
+lxml.etree provides two other efficient XPath evaluators that work on
+ElementTrees or Elements respectively: ``XPathDocumentEvaluator`` and
+``XPathElementEvaluator``. They are automatically selected if you use the
+XPathEvaluator helper for instantiation:
+
+.. sourcecode:: pycon
+
+ >>> root = etree.XML("<root><a><b/></a><b/></root>")
+ >>> xpatheval = etree.XPathEvaluator(root)
+
+ >>> print(isinstance(xpatheval, etree.XPathElementEvaluator))
+ True
+
+ >>> print(xpatheval("//b")[0].tag)
+ b
+
+This class provides efficient support for evaluating different XPath
+expressions on the same Element or ElementTree.
+
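+Passing in an ElementTree instead selects the document evaluator, for example:
+
+.. sourcecode:: pycon
+
+ >>> tree = etree.ElementTree(root)
+ >>> xpatheval = etree.XPathEvaluator(tree)
+
+ >>> print(isinstance(xpatheval, etree.XPathDocumentEvaluator))
+ True
+
+ >>> print(xpatheval("//b")[0].tag)
+ b
+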
+
+``ETXPath``
+-----------
+
+ElementTree supports a language named ElementPath_ in its ``find*()`` methods.
+One of the main differences between XPath and ElementPath is that the XPath
+language requires an indirection through prefixes for namespace support,
+whereas ElementTree uses the Clark notation (``{ns}name``) to avoid prefixes
+completely. The other major difference regards the capabilities of both path
+languages. Where XPath supports various sophisticated ways of restricting the
+result set through functions and boolean expressions, ElementPath only
+supports pure path traversal without nesting or further conditions. So, while
+the ElementPath syntax is self-contained and therefore easier to write and
+handle, XPath is much more powerful and expressive.
+
+lxml.etree bridges this gap through the class ``ETXPath``, which accepts XPath
+expressions with namespaces in Clark notation. It is identical to the
+``XPath`` class, except for the namespace notation. Normally, you would
+write:
+
+.. sourcecode:: pycon
+
+ >>> root = etree.XML("<root xmlns='ns'><a><b/></a><b/></root>")
+
+ >>> find = etree.XPath("//p:b", namespaces={'p' : 'ns'})
+ >>> print(find(root)[0].tag)
+ {ns}b
+
+``ETXPath`` allows you to change this to:
+
+.. sourcecode:: pycon
+
+ >>> find = etree.ETXPath("//{ns}b")
+ >>> print(find(root)[0].tag)
+ {ns}b
+
+
+Error handling
+--------------
+
+lxml.etree raises exceptions when errors occur while parsing or evaluating an
+XPath expression:
+
+.. sourcecode:: pycon
+
+ >>> find = etree.XPath("\\")
+ Traceback (most recent call last):
+ ...
+ lxml.etree.XPathSyntaxError: Invalid expression
+
+lxml will also try to give you a hint what went wrong, so if you pass a more
+complex expression, you may get a somewhat more specific error:
+
+.. sourcecode:: pycon
+
+ >>> find = etree.XPath("//*[1.1.1]")
+ Traceback (most recent call last):
+ ...
+ lxml.etree.XPathSyntaxError: Invalid predicate
+
+During evaluation, lxml will emit an XPathEvalError on errors:
+
+.. sourcecode:: pycon
+
+ >>> find = etree.XPath("//ns:a")
+ >>> find(root)
+ Traceback (most recent call last):
+ ...
+ lxml.etree.XPathEvalError: Undefined namespace prefix
+
+This works for the ``XPath`` class. The other evaluators (including
+the ``xpath()`` method), however, are one-shot operations that do parsing and
+evaluation in one step. They therefore raise evaluation exceptions in all cases:
+
+.. sourcecode:: pycon
+
+ >>> root = etree.Element("test")
+ >>> find = root.xpath("//*[1.1.1]")
+ Traceback (most recent call last):
+ ...
+ lxml.etree.XPathEvalError: Invalid predicate
+
+ >>> find = root.xpath("//ns:a")
+ Traceback (most recent call last):
+ ...
+ lxml.etree.XPathEvalError: Undefined namespace prefix
+
+ >>> find = root.xpath("\\")
+ Traceback (most recent call last):
+ ...
+ lxml.etree.XPathEvalError: Invalid expression
+
+Note that lxml versions before 1.3 always raised an ``XPathSyntaxError`` for
+all errors, including evaluation errors. The best way to support older
+versions is to except on the superclass ``XPathError``.
+
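+A small sketch of catching the common base class:
+
+.. sourcecode:: pycon
+
+ >>> try:
+ ...     root.xpath("//*[1.1.1]")
+ ... except etree.XPathError as exc:
+ ...     print("XPath failed: %s" % exc)
+ XPath failed: Invalid predicate
+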
+
+XSLT
+====
+
+lxml.etree introduces a new class, lxml.etree.XSLT. The class can be
+given an ElementTree or Element object to construct an XSLT
+transformer:
+
+.. sourcecode:: pycon
+
+ >>> xslt_root = etree.XML('''\
+ ... <xsl:stylesheet version="1.0"
+ ... xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ ... <xsl:template match="/">
+ ... <foo><xsl:value-of select="/a/b/text()" /></foo>
+ ... </xsl:template>
+ ... </xsl:stylesheet>''')
+ >>> transform = etree.XSLT(xslt_root)
+
+You can then run the transformation on an ElementTree document by simply
+calling it, and this results in another ElementTree object:
+
+.. sourcecode:: pycon
+
+ >>> f = StringIO('<a><b>Text</b></a>')
+ >>> doc = etree.parse(f)
+ >>> result_tree = transform(doc)
+
+By default, XSLT supports all extension functions from libxslt and
+libexslt as well as Python regular expressions through the `EXSLT
+regexp functions`_. Also see the documentation on `custom extension
+functions`_, `XSLT extension elements`_ and `document resolvers`_.
+There is a separate section on `controlling access`_ to external
+documents and resources.
+
+.. _`EXSLT regexp functions`: http://www.exslt.org/regexp/
+.. _`document resolvers`: resolvers.html
+.. _`controlling access`: resolvers.html#i-o-access-control-in-xslt
+
+
+XSLT result objects
+-------------------
+
+The result of an XSL transformation can be accessed like a normal ElementTree
+document:
+
+.. sourcecode:: pycon
+
+ >>> root = etree.XML('<a><b>Text</b></a>')
+ >>> result = transform(root)
+
+ >>> result.getroot().text
+ 'Text'
+
+but, as opposed to normal ElementTree objects, can also be turned into an (XML
+or text) string by applying the ``bytes()`` function (``str()`` in Python 2):
+
+.. sourcecode:: pycon
+
+ >>> bytes(result)
+ b'<?xml version="1.0"?>\n<foo>Text</foo>\n'
+
+The result is always a plain string, encoded as requested by the ``xsl:output``
+element in the stylesheet. If you want a Python Unicode/Text string instead,
+you should set this encoding to ``UTF-8`` (unless the `ASCII` default
+is sufficient). This allows you to call the builtin ``str()`` function on
+the result (``unicode()`` in Python 2):
+
+.. sourcecode:: pycon
+
+ >>> str(result)
+ u'<?xml version="1.0"?>\n<foo>Text</foo>\n'
+
+You can use other encodings at the cost of multiple recoding. Encodings that
+are not supported by Python will result in an error:
+
+.. sourcecode:: pycon
+
+ >>> xslt_tree = etree.XML('''\
+ ... <xsl:stylesheet version="1.0"
+ ... xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ ... <xsl:output encoding="UCS4"/>
+ ... <xsl:template match="/">
+ ... <foo><xsl:value-of select="/a/b/text()" /></foo>
+ ... </xsl:template>
+ ... </xsl:stylesheet>''')
+ >>> transform = etree.XSLT(xslt_tree)
+
+ >>> result = transform(doc)
+ >>> str(result)
+ Traceback (most recent call last):
+ ...
+ LookupError: unknown encoding: UCS4
+
+While it is possible to use the ``.write()`` method (known from ``ElementTree``
+objects) to serialise the XSLT result into a file, it is better to use the
+``.write_output()`` method. The latter knows about the ``<xsl:output>`` tag
+and writes the expected data into the output file.
+
+.. sourcecode:: pycon
+
+ >>> xslt_root = etree.XML('''\
+ ... <xsl:stylesheet version="1.0"
+ ... xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ ... <xsl:output method="text" encoding="utf8" />
+ ... <xsl:template match="/">
+ ... <foo><xsl:value-of select="/a/b/text()" /></foo>
+ ... </xsl:template>
+ ... </xsl:stylesheet>''')
+ >>> transform = etree.XSLT(xslt_root)
+
+ >>> result = transform(doc)
+ >>> result.write_output("output.txt.gz", compression=9) # doctest: +SKIP
+
+..
+
+ >>> from io import BytesIO
+ >>> out = BytesIO()
+ >>> result.write_output(out)
+ >>> data = out.getvalue()
+ >>> b'Text' in data
+ True
+
+
+Stylesheet parameters
+---------------------
+
+It is possible to pass parameters, in the form of XPath expressions, to the
+XSLT template:
+
+.. sourcecode:: pycon
+
+ >>> xslt_tree = etree.XML('''\
+ ... <xsl:stylesheet version="1.0"
+ ... xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ ... <xsl:param name="a" />
+ ... <xsl:template match="/">
+ ... <foo><xsl:value-of select="$a" /></foo>
+ ... </xsl:template>
+ ... </xsl:stylesheet>''')
+ >>> transform = etree.XSLT(xslt_tree)
+ >>> doc_root = etree.XML('<a><b>Text</b></a>')
+
+The parameters are passed as keyword parameters to the transform call.
+First, let's try passing in a simple integer expression:
+
+.. sourcecode:: pycon
+
+ >>> result = transform(doc_root, a="5")
+ >>> bytes(result)
+ b'<?xml version="1.0"?>\n<foo>5</foo>\n'
+
+You can use any valid XPath expression as parameter value:
+
+.. sourcecode:: pycon
+
+ >>> result = transform(doc_root, a="/a/b/text()")
+ >>> bytes(result)
+ b'<?xml version="1.0"?>\n<foo>Text</foo>\n'
+
+It's also possible to pass an XPath object as a parameter:
+
+.. sourcecode:: pycon
+
+ >>> result = transform(doc_root, a=etree.XPath("/a/b/text()"))
+ >>> bytes(result)
+ b'<?xml version="1.0"?>\n<foo>Text</foo>\n'
+
+Passing a string expression looks like this:
+
+.. sourcecode:: pycon
+
+ >>> result = transform(doc_root, a="'A'")
+ >>> bytes(result)
+ b'<?xml version="1.0"?>\n<foo>A</foo>\n'
+
+To pass a string that (potentially) contains quotes, you can use the
+``.strparam()`` class method. Note that it does not escape the
+string. Instead, it returns an opaque object that keeps the string
+value.
+
+.. sourcecode:: pycon
+
+ >>> plain_string_value = etree.XSLT.strparam(
+ ... """ It's "Monty Python" """)
+ >>> result = transform(doc_root, a=plain_string_value)
+ >>> bytes(result)
+ b'<?xml version="1.0"?>\n<foo> It\'s "Monty Python" </foo>\n'
+
+If you need to pass parameters that are not legal Python identifiers,
+pass them inside of a dictionary:
+
+.. sourcecode:: pycon
+
+ >>> transform = etree.XSLT(etree.XML('''\
+ ... <xsl:stylesheet version="1.0"
+ ... xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ ... <xsl:param name="non-python-identifier" />
+ ... <xsl:template match="/">
+ ... <foo><xsl:value-of select="$non-python-identifier" /></foo>
+ ... </xsl:template>
+ ... </xsl:stylesheet>'''))
+
+ >>> result = transform(doc_root, **{'non-python-identifier': '5'})
+ >>> bytes(result)
+ b'<?xml version="1.0"?>\n<foo>5</foo>\n'
+
+
+
+Errors and messages
+-------------------
+
+Like most of the processing oriented objects in lxml.etree, ``XSLT``
+provides an error log that lists messages and error output from the
+last run. See the `parser documentation`_ for a description of the
+error log.
+
+.. _`parser documentation`: parsing.html#error-log
+
+.. sourcecode:: pycon
+
+ >>> xslt_root = etree.XML('''\
+ ... <xsl:stylesheet version="1.0"
+ ... xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ ... <xsl:template match="/">
+ ... <xsl:message terminate="no">STARTING</xsl:message>
+ ... <foo><xsl:value-of select="/a/b/text()" /></foo>
+ ... <xsl:message terminate="no">DONE</xsl:message>
+ ... </xsl:template>
+ ... </xsl:stylesheet>''')
+ >>> transform = etree.XSLT(xslt_root)
+
+ >>> doc_root = etree.XML('<a><b>Text</b></a>')
+ >>> result = transform(doc_root)
+ >>> bytes(result)
+ b'<?xml version="1.0"?>\n<foo>Text</foo>\n'
+
+ >>> print(transform.error_log)
+ <string>:0:0:ERROR:XSLT:ERR_OK: STARTING
+ <string>:0:0:ERROR:XSLT:ERR_OK: DONE
+
+ >>> for entry in transform.error_log:
+ ... print('message from line %s, col %s: %s' % (
+ ... entry.line, entry.column, entry.message))
+ ... print('domain: %s (%d)' % (entry.domain_name, entry.domain))
+ ... print('type: %s (%d)' % (entry.type_name, entry.type))
+ ... print('level: %s (%d)' % (entry.level_name, entry.level))
+ ... print('filename: %s' % entry.filename)
+ message from line 0, col 0: STARTING
+ domain: XSLT (22)
+ type: ERR_OK (0)
+ level: ERROR (2)
+ filename: <string>
+ message from line 0, col 0: DONE
+ domain: XSLT (22)
+ type: ERR_OK (0)
+ level: ERROR (2)
+ filename: <string>
+
+Note that there is no way in XSLT to distinguish between user
+messages, warnings and error messages that occurred during the
+run. ``libxslt`` simply does not provide this information. You can
+partly work around this limitation by making your own messages
+uniquely identifiable, e.g. with a common text prefix.
+
+
+The ``xslt()`` tree method
+--------------------------
+
+There's also a convenience method on ElementTree objects for doing XSL
+transformations. This is less efficient if you want to apply the same XSL
+transformation to multiple documents, but is shorter to write for one-shot
+operations, as you do not have to instantiate a stylesheet yourself:
+
+.. sourcecode:: pycon
+
+ >>> result = doc.xslt(xslt_tree, a="'A'")
+ >>> bytes(result)
+ b'<?xml version="1.0"?>\n<foo>A</foo>\n'
+
+This is a shortcut for the following code:
+
+.. sourcecode:: pycon
+
+ >>> transform = etree.XSLT(xslt_tree)
+ >>> result = transform(doc, a="'A'")
+ >>> bytes(result)
+ b'<?xml version="1.0"?>\n<foo>A</foo>\n'
+
+
+Dealing with stylesheet complexity
+----------------------------------
+
+Some applications require a larger set of rather diverse stylesheets.
+lxml.etree allows you to deal with this in a number of ways. Here are
+some ideas to try.
+
+The simplest way to reduce the diversity is to use XSLT
+parameters that you pass at call time to configure the stylesheets.
+The ``partial()`` function in the ``functools`` module
+may come in handy here. It allows you to bind a set of keyword
+arguments (i.e. stylesheet parameters) to a reference of a callable
+stylesheet. The same works for instances of the ``XPath()``
+evaluator, obviously.
+
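+For instance, a small sketch that binds a stylesheet parameter to the
+``transform`` object created above:
+
+.. sourcecode:: pycon
+
+ >>> from functools import partial
+
+ >>> transform_a = partial(transform, a="'A'")
+ >>> bytes(transform_a(doc_root))
+ b'<?xml version="1.0"?>\n<foo>A</foo>\n'
+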
+You may also consider creating stylesheets programmatically. Just
+create an XSL tree, e.g. from a parsed template, and then add or
+replace parts as you see fit. Passing an XSL tree into the ``XSLT()``
+constructor multiple times will create independent stylesheets, so
+later modifications of the tree will not be reflected in the already
+created stylesheets. This makes stylesheet generation very
+straightforward.
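+
+As a minimal sketch of this pattern, reuse the message example
+stylesheet from above: create one transformer, then modify the
+template tree and create a second, independent one from it.
+
+.. sourcecode:: pycon
+
+ >>> transform_foo = etree.XSLT(xslt_root)
+ >>> xslt_root.find('.//foo').tag = 'bar'  # rename the literal result element
+ >>> transform_bar = etree.XSLT(xslt_root)
+
+ >>> bytes(transform_foo(doc_root))
+ b'<?xml version="1.0"?>\n<foo>Text</foo>\n'
+ >>> bytes(transform_bar(doc_root))
+ b'<?xml version="1.0"?>\n<bar>Text</bar>\n'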
+
+A third thing to remember is the support for `custom extension
+functions`_ and `XSLT extension elements`_. Some things are much
+easier to express in XSLT than in Python, while for others it is the
+complete opposite. Finding the right mixture of Python code and XSL
+code can help a great deal in keeping applications well designed and
+maintainable.
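+
+As a quick illustration, a global extension function can be registered
+by assigning into a ``FunctionNamespace`` (a minimal sketch; the
+namespace URI and function name are made up here). A stylesheet that
+declares this namespace can then call ``my:shout(...)`` like any other
+XPath function:
+
+.. sourcecode:: pycon
+
+ >>> ns = etree.FunctionNamespace('http://example.org/myfunctions')
+ >>> ns['shout'] = lambda context, s: str(s).upper()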
+
+
+Profiling
+---------
+
+If you want to know how your stylesheet performed, pass the ``profile_run``
+keyword to the transform:
+
+.. sourcecode:: pycon
+
+ >>> result = transform(doc, a="/a/b/text()", profile_run=True)
+ >>> profile = result.xslt_profile
+
+The value of the ``xslt_profile`` property is an ElementTree with profiling
+data about each template, similar to the following:
+
+.. sourcecode:: xml
+
+ <profile>
+ <template rank="1" match="/" name="" mode="" calls="1" time="1" average="1"/>
+ </profile>
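+
+Since the profile is an ordinary element tree, the usual APIs apply,
+e.g. for finding the most expensive template (a minimal sketch; the
+attribute names are those shown above):
+
+.. sourcecode:: pycon
+
+ >>> templates = list(profile.getroot().iter('template'))
+ >>> slowest = max(templates, key=lambda t: int(t.get('time')))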
+
+Note that this is a read-only document. You must not move any of its elements
+to other documents. Please deep-copy the document if you need to modify it.
+If you want to free it from memory, just do:
+
+.. sourcecode:: pycon
+
+ >>> del result.xslt_profile
diff --git a/download_artefacts.py b/download_artefacts.py
new file mode 100755
index 0000000..4502517
--- /dev/null
+++ b/download_artefacts.py
@@ -0,0 +1,136 @@
+#!/usr/bin/python3
+
+import itertools
+import json
+import logging
+import re
+import shutil
+import datetime
+
+from concurrent.futures import ProcessPoolExecutor as Pool, as_completed
+from pathlib import Path
+from urllib.request import urlopen
+from urllib.parse import urljoin
+
+logger = logging.getLogger()
+
+PARALLEL_DOWNLOADS = 6
+GITHUB_PACKAGE_URL = "https://github.com/lxml/lxml-wheels"
+APPVEYOR_PACKAGE_URL = "https://ci.appveyor.com/api/projects/scoder/lxml"
+APPVEYOR_BUILDJOBS_URL = "https://ci.appveyor.com/api/buildjobs"
+
+
+def find_github_files(version, base_package_url=GITHUB_PACKAGE_URL):
+ url = f"{base_package_url}/releases/tag/lxml-{version}"
+ with urlopen(url) as p:
+ page = p.read().decode()
+
+ for wheel_url, _ in itertools.groupby(sorted(re.findall(r'href="([^"]+\.whl)"', page))):
+ yield urljoin(base_package_url, wheel_url)
+
+
+def find_appveyor_files(version, base_package_url=APPVEYOR_PACKAGE_URL, base_job_url=APPVEYOR_BUILDJOBS_URL):
+ url = f"{base_package_url}/history?recordsNumber=20"
+ with urlopen(url) as p:
+ builds = json.load(p)["builds"]
+
+ tag = f"lxml-{version}"
+ for build in builds:
+ if build['isTag'] and build['tag'] == tag:
+ build_id = build['buildId']
+ break
+ else:
+ logger.warning(f"No appveyor build found for tag '{tag}'")
+ return
+
+ build_url = f"{base_package_url}/builds/{build_id}"
+ with urlopen(build_url) as p:
+ jobs = json.load(p)["build"]["jobs"]
+
+ for job in jobs:
+ artifacts_url = f"{base_job_url}/{job['jobId']}/artifacts/"
+
+ with urlopen(artifacts_url) as p:
+ for artifact in json.load(p):
+ yield urljoin(artifacts_url, artifact['fileName'])
+
+
+def download1(wheel_url, dest_dir):
+ wheel_name = wheel_url.rsplit("/", 1)[1]
+ logger.info(f"Downloading {wheel_url} ...")
+ with urlopen(wheel_url) as w:
+ file_path = dest_dir / wheel_name
+ if (file_path.exists()
+ and "Content-Length" in w.headers
+ and file_path.stat().st_size == int(w.headers["Content-Length"])):
+ logger.info(f"Already have {wheel_name}")
+ else:
+ try:
+ with open(file_path, "wb") as f:
+ shutil.copyfileobj(w, f)
+ except:
+ if file_path.exists():
+ file_path.unlink()
+ raise
+ else:
+ logger.info(f"Finished downloading {wheel_name}")
+ return wheel_name
+
+
+def download(urls, dest_dir, jobs=PARALLEL_DOWNLOADS):
+ with Pool(max_workers=jobs) as pool:
+ futures = [pool.submit(download1, url, dest_dir) for url in urls]
+ try:
+ for future in as_completed(futures):
+ wheel_name = future.result()
+ yield wheel_name
+ except KeyboardInterrupt:
+ for future in futures:
+ future.cancel()
+ raise
+
+
+def roundrobin(*iterables):
+ "roundrobin('ABC', 'D', 'EF') --> A D E B F C"
+ # Recipe credited to George Sakkis
+ from itertools import cycle, islice
+ num_active = len(iterables)
+ nexts = cycle(iter(it).__next__ for it in iterables)
+ while num_active:
+ try:
+ for next in nexts:
+ yield next()
+ except StopIteration:
+ # Remove the iterator we just exhausted from the cycle.
+ num_active -= 1
+ nexts = cycle(islice(nexts, num_active))
+
+
+def main(*args):
+ if not args:
+ print("Please pass the version to download")
+ return
+
+ version = args[0]
+ dest_dir = Path("dist") / version
+ if not dest_dir.is_dir():
+ dest_dir.mkdir()
+
+ start_time = datetime.datetime.now().replace(microsecond=0)
+ urls = roundrobin(
+ find_github_files(version),
+ find_appveyor_files(version),
+ )
+ count = sum(1 for _ in enumerate(download(urls, dest_dir)))
+ duration = datetime.datetime.now().replace(microsecond=0) - start_time
+ logger.info(f"Downloaded {count} files in {duration}.")
+
+
+if __name__ == "__main__":
+ import sys
+ logging.basicConfig(
+ stream=sys.stderr,
+ level=logging.INFO,
+ format="%(asctime)-15s %(message)s",
+ )
+ main(*sys.argv[1:])
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..988182b
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+Cython>=0.29.7
diff --git a/samples/simple-ns.xml b/samples/simple-ns.xml
new file mode 100644
index 0000000..f81dda5
--- /dev/null
+++ b/samples/simple-ns.xml
@@ -0,0 +1,5 @@
+<root xmlns='http://namespace/'>
+ <element key='value'>text</element>
+ <element>text</element>tail
+ <empty-element/>
+</root>
diff --git a/samples/simple.xml b/samples/simple.xml
new file mode 100644
index 0000000..69a940b
--- /dev/null
+++ b/samples/simple.xml
@@ -0,0 +1,5 @@
+<root>
+ <element key='value'>text</element>
+ <element>text</element>tail
+ <empty-element/>
+</root>
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..845c0d9
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,252 @@
+import os
+import re
+import sys
+import fnmatch
+import os.path
+
+# for command line options and supported environment variables, please
+# see the end of 'setupinfo.py'
+
+if (2, 7) != sys.version_info[:2] < (3, 5):
+ print("This lxml version requires Python 2.7, 3.5 or later.")
+ sys.exit(1)
+
+try:
+ from setuptools import setup
+except ImportError:
+ from distutils.core import setup
+
+# make sure Cython finds include files in the project directory and not outside
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
+
+import versioninfo
+import setupinfo
+
+# override these and pass --static for a static build. See
+# doc/build.txt for more information. If you do not pass --static
+# changing this will have no effect.
+STATIC_INCLUDE_DIRS = []
+STATIC_LIBRARY_DIRS = []
+STATIC_CFLAGS = []
+STATIC_BINARIES = []
+
+# create lxml-version.h file
+versioninfo.create_version_h()
+lxml_version = versioninfo.version()
+print("Building lxml version %s." % lxml_version)
+
+OPTION_RUN_TESTS = setupinfo.has_option('run-tests')
+
+branch_link = """
+After an official release of a new stable series, bug fixes may become
+available at
+https://github.com/lxml/lxml/tree/lxml-%(branch_version)s .
+Running ``easy_install lxml==%(branch_version)sbugfix`` will install
+the unreleased branch state from
+https://github.com/lxml/lxml/tarball/lxml-%(branch_version)s#egg=lxml-%(branch_version)sbugfix
+as soon as a maintenance branch has been established. Note that this
+requires Cython to be installed at an appropriate version for the build.
+
+"""
+
+if versioninfo.is_pre_release():
+ branch_link = ""
+
+
+extra_options = {}
+if 'setuptools' in sys.modules:
+ extra_options['zip_safe'] = False
+ extra_options['python_requires'] = (
+ # NOTE: keep in sync with Trove classifier list below.
+ '>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*')
+
+ try:
+ import pkg_resources
+ except ImportError:
+ pass
+ else:
+ f = open("requirements.txt", "r")
+ try:
+ deps = [str(req) for req in pkg_resources.parse_requirements(f)]
+ finally:
+ f.close()
+ extra_options['extras_require'] = {
+ 'source': deps,
+ 'cssselect': 'cssselect>=0.7',
+ 'html5': 'html5lib',
+ 'htmlsoup': 'BeautifulSoup4',
+ }
+
+extra_options.update(setupinfo.extra_setup_args())
+
+extra_options['package_data'] = {
+ 'lxml': [
+ 'etree.h',
+ 'etree_api.h',
+ 'lxml.etree.h',
+ 'lxml.etree_api.h',
+ ],
+ 'lxml.includes': [
+ '*.pxd', '*.h'
+ ],
+ 'lxml.isoschematron': [
+ 'resources/rng/iso-schematron.rng',
+ 'resources/xsl/*.xsl',
+ 'resources/xsl/iso-schematron-xslt1/*.xsl',
+ 'resources/xsl/iso-schematron-xslt1/readme.txt'
+ ],
+ }
+
+extra_options['package_dir'] = {
+ '': 'src'
+ }
+
+extra_options['packages'] = [
+ 'lxml', 'lxml.includes', 'lxml.html', 'lxml.isoschematron'
+ ]
+
+
+def setup_extra_options():
+ is_interesting_package = re.compile('^(libxml|libxslt|libexslt)$').match
+ def extract_files(directories, pattern='*'):
+ def get_files(root, dir_path, files):
+ return [ (root, dir_path, filename)
+ for filename in fnmatch.filter(files, pattern) ]
+
+ file_list = []
+ for dir_path in directories:
+ dir_path = os.path.realpath(dir_path)
+ for root, dirs, files in os.walk(dir_path):
+ rel_dir = root[len(dir_path)+1:]
+ if is_interesting_package(rel_dir):
+ file_list.extend(get_files(root, rel_dir, files))
+ return file_list
+
+ def build_packages(files):
+ packages = {}
+ seen = set()
+ for root_path, rel_path, filename in files:
+ if filename in seen:
+ # libxml2/libxslt header filenames are unique
+ continue
+ seen.add(filename)
+ package_path = '.'.join(rel_path.split(os.sep))
+ if package_path in packages:
+ root, package_files = packages[package_path]
+ if root != root_path:
+ print("conflicting directories found for include package '%s': %s and %s"
+ % (package_path, root_path, root))
+ continue
+ else:
+ package_files = []
+ packages[package_path] = (root_path, package_files)
+ package_files.append(filename)
+
+ return packages
+
+ # Copy Global Extra Options
+ extra_opts = dict(extra_options)
+
+ # Build ext modules
+ ext_modules = setupinfo.ext_modules(
+ STATIC_INCLUDE_DIRS, STATIC_LIBRARY_DIRS,
+ STATIC_CFLAGS, STATIC_BINARIES)
+ extra_opts['ext_modules'] = ext_modules
+
+ packages = extra_opts.get('packages', list())
+ package_dir = extra_opts.get('package_dir', dict())
+ package_data = extra_opts.get('package_data', dict())
+
+ # Add lxml.include with (lxml, libxslt headers...)
+ # python setup.py build --static --static-deps install
+ # python setup.py bdist_wininst --static
+ if setupinfo.OPTION_STATIC:
+ include_dirs = [] # keep them in order
+ for extension in ext_modules:
+ for inc_dir in extension.include_dirs:
+ if inc_dir not in include_dirs:
+ include_dirs.append(inc_dir)
+
+ header_packages = build_packages(extract_files(include_dirs))
+
+ for package_path, (root_path, filenames) in header_packages.items():
+ if package_path:
+ package = 'lxml.includes.' + package_path
+ packages.append(package)
+ else:
+ package = 'lxml.includes'
+ package_data[package] = filenames
+ package_dir[package] = root_path
+
+ return extra_opts
+
+setup(
+ name = "lxml",
+ version = lxml_version,
+ author="lxml dev team",
+ author_email="lxml-dev@lxml.de",
+ maintainer="lxml dev team",
+ maintainer_email="lxml-dev@lxml.de",
+ license="BSD",
+ url="https://lxml.de/",
+ # Commented out because this causes distutils to emit warnings
+ # `Unknown distribution option: 'bugtrack_url'`
+ # which distract folks from real causes of problems when troubleshooting
+ # bugtrack_url="https://bugs.launchpad.net/lxml",
+
+ description=(
+ "Powerful and Pythonic XML processing library"
+ " combining libxml2/libxslt with the ElementTree API."
+ ),
+ long_description=((("""\
+lxml is a Pythonic, mature binding for the libxml2 and libxslt libraries. It
+provides safe and convenient access to these libraries using the ElementTree
+API.
+
+It extends the ElementTree API significantly to offer support for XPath,
+RelaxNG, XML Schema, XSLT, C14N and much more.
+
+To contact the project, go to the `project home page
+<https://lxml.de/>`_ or see our bug tracker at
+https://launchpad.net/lxml
+
+In case you want to use the current in-development version of lxml,
+you can get it from the github repository at
+https://github.com/lxml/lxml . Note that this requires Cython to
+build the sources, see the build instructions on the project home
+page. To the same end, running ``easy_install lxml==dev`` will
+install lxml from
+https://github.com/lxml/lxml/tarball/master#egg=lxml-dev if you have
+an appropriate version of Cython installed.
+
+""" + branch_link) % {"branch_version": versioninfo.branch_version()}) +
+ versioninfo.changes()),
+ classifiers=[
+ versioninfo.dev_status(),
+ 'Intended Audience :: Developers',
+ 'Intended Audience :: Information Technology',
+ 'License :: OSI Approved :: BSD License',
+ 'Programming Language :: Cython',
+ # NOTE: keep in sync with 'python_requires' list above.
+ 'Programming Language :: Python :: 2',
+ 'Programming Language :: Python :: 2.7',
+ 'Programming Language :: Python :: 3',
+ 'Programming Language :: Python :: 3.5',
+ 'Programming Language :: Python :: 3.6',
+ 'Programming Language :: Python :: 3.7',
+ 'Programming Language :: Python :: 3.8',
+ 'Programming Language :: Python :: 3.9',
+ 'Programming Language :: C',
+ 'Operating System :: OS Independent',
+ 'Topic :: Text Processing :: Markup :: HTML',
+ 'Topic :: Text Processing :: Markup :: XML',
+ 'Topic :: Software Development :: Libraries :: Python Modules'
+ ],
+
+ **setup_extra_options()
+)
+
+if OPTION_RUN_TESTS:
+ print("Running tests.")
+ import test
+ sys.exit( test.main(sys.argv[:1]) )
diff --git a/setupinfo.py b/setupinfo.py
new file mode 100644
index 0000000..a44de25
--- /dev/null
+++ b/setupinfo.py
@@ -0,0 +1,551 @@
+import sys
+import io
+import os
+import os.path
+import subprocess
+from distutils.core import Extension
+from distutils.errors import CompileError, DistutilsOptionError
+from distutils.command.build_ext import build_ext as _build_ext
+from versioninfo import get_base_dir
+
+try:
+ import Cython.Compiler.Version
+ CYTHON_INSTALLED = True
+except ImportError:
+ CYTHON_INSTALLED = False
+
+EXT_MODULES = ["lxml.etree", "lxml.objectify"]
+COMPILED_MODULES = [
+ "lxml.builder",
+ "lxml._elementpath",
+ "lxml.html.diff",
+ "lxml.html.clean",
+ "lxml.sax",
+]
+HEADER_FILES = ['etree.h', 'etree_api.h']
+
+if hasattr(sys, 'pypy_version_info') or (
+ getattr(sys, 'implementation', None) and sys.implementation.name != 'cpython'):
+ # disable Cython compilation of Python modules in PyPy and other non-CPythons
+ del COMPILED_MODULES[:]
+
+SOURCE_PATH = "src"
+INCLUDE_PACKAGE_PATH = os.path.join(SOURCE_PATH, 'lxml', 'includes')
+
+if sys.version_info[0] >= 3:
+ _system_encoding = sys.getdefaultencoding()
+ if _system_encoding is None:
+ _system_encoding = "iso-8859-1" # :-)
+ def decode_input(data):
+ if isinstance(data, str):
+ return data
+ return data.decode(_system_encoding)
+else:
+ def decode_input(data):
+ return data
+
+def env_var(name):
+ value = os.getenv(name)
+ if value:
+ value = decode_input(value)
+ if sys.platform == 'win32' and ';' in value:
+ return value.split(';')
+ else:
+ return value.split()
+ else:
+ return []
+
+
+def _prefer_reldirs(base_dir, dirs):
+ return [
+ os.path.relpath(path) if path.startswith(base_dir) else path
+ for path in dirs
+ ]
+
+def ext_modules(static_include_dirs, static_library_dirs,
+ static_cflags, static_binaries):
+ global XML2_CONFIG, XSLT_CONFIG
+ if OPTION_BUILD_LIBXML2XSLT:
+ from buildlibxml import build_libxml2xslt, get_prebuilt_libxml2xslt
+ if sys.platform.startswith('win'):
+ get_prebuilt_libxml2xslt(
+ OPTION_DOWNLOAD_DIR, static_include_dirs, static_library_dirs)
+ else:
+ XML2_CONFIG, XSLT_CONFIG = build_libxml2xslt(
+ OPTION_DOWNLOAD_DIR, 'build/tmp',
+ static_include_dirs, static_library_dirs,
+ static_cflags, static_binaries,
+ libiconv_version=OPTION_LIBICONV_VERSION,
+ libxml2_version=OPTION_LIBXML2_VERSION,
+ libxslt_version=OPTION_LIBXSLT_VERSION,
+ zlib_version=OPTION_ZLIB_VERSION,
+ multicore=OPTION_MULTICORE)
+
+ modules = EXT_MODULES + COMPILED_MODULES
+ if OPTION_WITHOUT_OBJECTIFY:
+ modules = [entry for entry in modules if 'objectify' not in entry]
+
+ module_files = list(os.path.join(SOURCE_PATH, *module.split('.')) for module in modules)
+ c_files_exist = [os.path.exists(module + '.c') for module in module_files]
+
+ use_cython = True
+ if CYTHON_INSTALLED and (OPTION_WITH_CYTHON or not all(c_files_exist)):
+ print("Building with Cython %s." % Cython.Compiler.Version.version)
+ # generate module cleanup code
+ from Cython.Compiler import Options
+ Options.generate_cleanup_code = 3
+ Options.clear_to_none = False
+ elif not OPTION_WITHOUT_CYTHON and not all(c_files_exist):
+ for exists, module in zip(c_files_exist, module_files):
+ if not exists:
+ raise RuntimeError(
+ "ERROR: Trying to build without Cython, but pre-generated '%s.c' "
+ "is not available (pass --without-cython to ignore this error)." % module)
+ else:
+ if not all(c_files_exist):
+ for exists, module in zip(c_files_exist, module_files):
+ if not exists:
+ print("WARNING: Trying to build without Cython, but pre-generated "
+ "'%s.c' is not available." % module)
+ use_cython = False
+ print("Building without Cython.")
+
+ if not check_build_dependencies():
+ raise RuntimeError("Dependency missing")
+
+ base_dir = get_base_dir()
+ _include_dirs = _prefer_reldirs(
+ base_dir, include_dirs(static_include_dirs) + [
+ SOURCE_PATH,
+ INCLUDE_PACKAGE_PATH,
+ ])
+ _library_dirs = _prefer_reldirs(base_dir, library_dirs(static_library_dirs))
+ _cflags = cflags(static_cflags)
+ _ldflags = ['-isysroot', get_xcode_isysroot()] if sys.platform == 'darwin' else None
+ _define_macros = define_macros()
+ _libraries = libraries()
+
+ if _library_dirs:
+ message = "Building against libxml2/libxslt in "
+ if len(_library_dirs) > 1:
+ print(message + "one of the following directories:")
+ for dir in _library_dirs:
+ print(" " + dir)
+ else:
+ print(message + "the following directory: " +
+ _library_dirs[0])
+
+ if OPTION_AUTO_RPATH:
+ runtime_library_dirs = _library_dirs
+ else:
+ runtime_library_dirs = []
+
+ if CYTHON_INSTALLED and OPTION_SHOW_WARNINGS:
+ from Cython.Compiler import Errors
+ Errors.LEVEL = 0
+
+ cythonize_directives = {
+ 'binding': True,
+ }
+ if OPTION_WITH_COVERAGE:
+ cythonize_directives['linetrace'] = True
+
+ result = []
+ for module, src_file in zip(modules, module_files):
+ is_py = module in COMPILED_MODULES
+ main_module_source = src_file + (
+ '.c' if not use_cython else '.py' if is_py else '.pyx')
+ result.append(
+ Extension(
+ module,
+ sources = [main_module_source],
+ depends = find_dependencies(module),
+ extra_compile_args = _cflags,
+ extra_link_args = None if is_py else _ldflags,
+ extra_objects = None if is_py else static_binaries,
+ define_macros = _define_macros,
+ include_dirs = _include_dirs,
+ library_dirs = None if is_py else _library_dirs,
+ runtime_library_dirs = None if is_py else runtime_library_dirs,
+ libraries = None if is_py else _libraries,
+ ))
+ if CYTHON_INSTALLED and OPTION_WITH_CYTHON_GDB:
+ for ext in result:
+ ext.cython_gdb = True
+
+ if CYTHON_INSTALLED and use_cython:
+ # build .c files right now and convert Extension() objects
+ from Cython.Build import cythonize
+ result = cythonize(result, compiler_directives=cythonize_directives)
+
+ # for backwards compatibility reasons, provide "etree[_api].h" also as "lxml.etree[_api].h"
+ for header_filename in HEADER_FILES:
+ src_file = os.path.join(SOURCE_PATH, 'lxml', header_filename)
+ dst_file = os.path.join(SOURCE_PATH, 'lxml', 'lxml.' + header_filename)
+ if not os.path.exists(src_file):
+ continue
+ if os.path.exists(dst_file) and os.path.getmtime(dst_file) >= os.path.getmtime(src_file):
+ continue
+
+ with io.open(src_file, 'r', encoding='iso8859-1') as f:
+ content = f.read()
+ for filename in HEADER_FILES:
+ content = content.replace('"%s"' % filename, '"lxml.%s"' % filename)
+ with io.open(dst_file, 'w', encoding='iso8859-1') as f:
+ f.write(content)
+
+ return result
+
+
+def find_dependencies(module):
+ if not CYTHON_INSTALLED or 'lxml.html' in module:
+ return []
+ base_dir = get_base_dir()
+ package_dir = os.path.join(base_dir, SOURCE_PATH, 'lxml')
+ includes_dir = os.path.join(base_dir, INCLUDE_PACKAGE_PATH)
+
+ pxd_files = [
+ os.path.join(INCLUDE_PACKAGE_PATH, filename)
+ for filename in os.listdir(includes_dir)
+ if filename.endswith('.pxd')
+ ]
+
+ if module == 'lxml.etree':
+ pxi_files = [
+ os.path.join(SOURCE_PATH, 'lxml', filename)
+ for filename in os.listdir(package_dir)
+ if filename.endswith('.pxi') and 'objectpath' not in filename
+ ]
+ pxd_files = [
+ filename for filename in pxd_files
+ if 'etreepublic' not in filename
+ ]
+ elif module == 'lxml.objectify':
+ pxi_files = [os.path.join(SOURCE_PATH, 'lxml', 'objectpath.pxi')]
+ else:
+ pxi_files = pxd_files = []
+
+ return pxd_files + pxi_files
+
+
+def extra_setup_args():
+ class CheckLibxml2BuildExt(_build_ext):
+ """Subclass to check whether libxml2 is really available if the build fails"""
+ def run(self):
+ try:
+ _build_ext.run(self) # old-style class in Py2
+ except CompileError as e:
+ print('Compile failed: %s' % e)
+ if not seems_to_have_libxml2():
+ print_libxml_error()
+ raise
+ result = {'cmdclass': {'build_ext': CheckLibxml2BuildExt}}
+ return result
+
+
+def seems_to_have_libxml2():
+ from distutils import ccompiler
+ compiler = ccompiler.new_compiler()
+ return compiler.has_function(
+ 'xmlXPathInit',
+ include_dirs=include_dirs([]) + ['/usr/include/libxml2'],
+ includes=['libxml/xpath.h'],
+ library_dirs=library_dirs([]),
+ libraries=['xml2'])
+
+
+def print_libxml_error():
+ print('*********************************************************************************')
+ print('Could not find function xmlXPathInit in library libxml2. Is libxml2 installed?')
+ if sys.platform in ('darwin',):
+ print('Perhaps try: xcode-select --install')
+ print('*********************************************************************************')
+
+
+def libraries():
+ standard_libs = []
+ if 'linux' in sys.platform:
+ standard_libs.append('rt')
+ if not OPTION_BUILD_LIBXML2XSLT:
+ standard_libs.append('z')
+ standard_libs.append('m')
+
+ if sys.platform in ('win32',):
+ libs = ['libxslt', 'libexslt', 'libxml2', 'iconv']
+ if OPTION_STATIC:
+ libs = ['%s_a' % lib for lib in libs]
+ libs.extend(['zlib', 'WS2_32'])
+ elif OPTION_STATIC:
+ libs = standard_libs
+ else:
+ libs = ['xslt', 'exslt', 'xml2'] + standard_libs
+ return libs
+
+def library_dirs(static_library_dirs):
+ if OPTION_STATIC:
+ if not static_library_dirs:
+ static_library_dirs = env_var('LIBRARY')
+ assert static_library_dirs, "Static build not configured, see doc/build.txt"
+ return static_library_dirs
+ # filter them from xslt-config --libs
+ result = []
+ possible_library_dirs = flags('libs')
+ for possible_library_dir in possible_library_dirs:
+ if possible_library_dir.startswith('-L'):
+ result.append(possible_library_dir[2:])
+ return result
+
+def include_dirs(static_include_dirs):
+ if OPTION_STATIC:
+ if not static_include_dirs:
+ static_include_dirs = env_var('INCLUDE')
+ return static_include_dirs
+ # filter them from xslt-config --cflags
+ result = []
+ possible_include_dirs = flags('cflags')
+ for possible_include_dir in possible_include_dirs:
+ if possible_include_dir.startswith('-I'):
+ result.append(possible_include_dir[2:])
+ return result
+
+def cflags(static_cflags):
+ result = []
+ if not OPTION_SHOW_WARNINGS:
+ result.append('-w')
+ if OPTION_DEBUG_GCC:
+ result.append('-g2')
+
+ if OPTION_STATIC:
+ if not static_cflags:
+ static_cflags = env_var('CFLAGS')
+ result.extend(static_cflags)
+ else:
+ # anything from xslt-config --cflags that doesn't start with -I
+ possible_cflags = flags('cflags')
+ for possible_cflag in possible_cflags:
+ if not possible_cflag.startswith('-I'):
+ result.append(possible_cflag)
+
+ if sys.platform in ('darwin',):
+ for opt in result:
+ if 'flat_namespace' in opt:
+ break
+ else:
+ result.append('-flat_namespace')
+
+ return result
+
+def define_macros():
+ macros = []
+ if OPTION_WITHOUT_ASSERT:
+ macros.append(('PYREX_WITHOUT_ASSERTIONS', None))
+ if OPTION_WITHOUT_THREADING:
+ macros.append(('WITHOUT_THREADING', None))
+ if OPTION_WITH_REFNANNY:
+ macros.append(('CYTHON_REFNANNY', None))
+ if OPTION_WITH_UNICODE_STRINGS:
+ macros.append(('LXML_UNICODE_STRINGS', '1'))
+ if OPTION_WITH_COVERAGE:
+ macros.append(('CYTHON_TRACE_NOGIL', '1'))
+ # Disable showing C lines in tracebacks, unless explicitly requested.
+ macros.append(('CYTHON_CLINE_IN_TRACEBACK', '1' if OPTION_WITH_CLINES else '0'))
+ return macros
+
+
+def run_command(cmd, *args):
+ if not cmd:
+ return ''
+ if args:
+ cmd = ' '.join((cmd,) + args)
+
+ p = subprocess.Popen(cmd, shell=True,
+ stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ stdout_data, errors = p.communicate()
+
+ if errors:
+ return ''
+ return decode_input(stdout_data).strip()
+
+
+def check_min_version(version, min_version, libname):
+ if not version:
+ # this is ok for targets like sdist etc.
+ return True
+ lib_version = tuple(map(int, version.split('.')[:3]))
+ req_version = tuple(map(int, min_version.split('.')[:3]))
+ if lib_version < req_version:
+ print("Minimum required version of %s is %s. Your system has version %s." % (
+ libname, min_version, version))
+ return False
+ return True
+
+
+def get_library_version(prog, libname=None):
+ if libname:
+ return run_command(prog, '--modversion %s' % libname)
+ else:
+ return run_command(prog, '--version')
+
+
+PKG_CONFIG = None
+XML2_CONFIG = None
+XSLT_CONFIG = None
+
+def get_library_versions():
+ global XML2_CONFIG, XSLT_CONFIG
+
+ # Pre-built libraries
+ if XML2_CONFIG and XSLT_CONFIG:
+ xml2_version = get_library_version(XML2_CONFIG)
+ xslt_version = get_library_version(XSLT_CONFIG)
+ return xml2_version, xslt_version
+
+ # Path to xml2-config and xslt-config specified on the command line
+ if OPTION_WITH_XML2_CONFIG:
+ xml2_version = get_library_version(OPTION_WITH_XML2_CONFIG)
+ if xml2_version and OPTION_WITH_XSLT_CONFIG:
+ xslt_version = get_library_version(OPTION_WITH_XSLT_CONFIG)
+ if xslt_version:
+ XML2_CONFIG = OPTION_WITH_XML2_CONFIG
+ XSLT_CONFIG = OPTION_WITH_XSLT_CONFIG
+ return xml2_version, xslt_version
+
+ # Try pkg-config
+ global PKG_CONFIG
+ PKG_CONFIG = os.getenv('PKG_CONFIG', 'pkg-config')
+ xml2_version = get_library_version(PKG_CONFIG, 'libxml-2.0')
+ if xml2_version:
+ xslt_version = get_library_version(PKG_CONFIG, 'libxslt')
+ if xml2_version and xslt_version:
+ return xml2_version, xslt_version
+
+ # Try xml2-config and xslt-config
+ XML2_CONFIG = os.getenv('XML2_CONFIG', 'xml2-config')
+ xml2_version = get_library_version(XML2_CONFIG)
+ if xml2_version:
+ XSLT_CONFIG = os.getenv('XSLT_CONFIG', 'xslt-config')
+ xslt_version = get_library_version(XSLT_CONFIG)
+ if xml2_version and xslt_version:
+ return xml2_version, xslt_version
+
+ # One or both build dependencies not found. Fail on Linux platforms only.
+ if sys.platform.startswith('win'):
+ return '', ''
+ print("Error: Please make sure the libxml2 and libxslt development packages are installed.")
+ sys.exit(1)
+
+
+def check_build_dependencies():
+ xml2_version, xslt_version = get_library_versions()
+
+ xml2_ok = check_min_version(xml2_version, '2.7.0', 'libxml2')
+ xslt_ok = check_min_version(xslt_version, '1.1.23', 'libxslt')
+
+ if xml2_version and xslt_version:
+ print("Building against libxml2 %s and libxslt %s" % (xml2_version, xslt_version))
+ else:
+ print("Building against pre-built libxml2 and libxslt libraries")
+
+ return (xml2_ok and xslt_ok)
+
+
+def get_flags(prog, option, libname=None):
+ if libname:
+ return run_command(prog, '--%s %s' % (option, libname))
+ else:
+ return run_command(prog, '--%s' % option)
+
+
+def flags(option):
+ if XML2_CONFIG:
+ xml2_flags = get_flags(XML2_CONFIG, option)
+ xslt_flags = get_flags(XSLT_CONFIG, option)
+ else:
+ xml2_flags = get_flags(PKG_CONFIG, option, 'libxml-2.0')
+ xslt_flags = get_flags(PKG_CONFIG, option, 'libxslt')
+
+ flag_list = xml2_flags.split()
+ for flag in xslt_flags.split():
+ if flag not in flag_list:
+ flag_list.append(flag)
+ return flag_list
+
+
+def get_xcode_isysroot():
+ return run_command('xcrun', '--show-sdk-path')
+
+
+## Option handling:
+
+def has_option(name):
+ try:
+ sys.argv.remove('--%s' % name)
+ return True
+ except ValueError:
+ pass
+ # allow passing all cmd line options also as environment variables
+ env_val = os.getenv(name.upper().replace('-', '_'), 'false').lower()
+ if env_val == "true":
+ return True
+ return False
+
+
+def option_value(name, deprecated_for=None):
+ for index, option in enumerate(sys.argv):
+ if option == '--' + name:
+ if index+1 >= len(sys.argv):
+ raise DistutilsOptionError(
+ 'The option %s requires a value' % option)
+ value = sys.argv[index+1]
+ sys.argv[index:index+2] = []
+ if deprecated_for:
+ print_deprecated_option(name, deprecated_for)
+ return value
+ if option.startswith('--' + name + '='):
+ value = option[len(name)+3:]
+ sys.argv[index:index+1] = []
+ if deprecated_for:
+ print_deprecated_option(name, deprecated_for)
+ return value
+ env_name = name.upper().replace('-', '_')
+ env_val = os.getenv(env_name)
+ if env_val and deprecated_for:
+ print_deprecated_option(env_name, deprecated_for.upper().replace('-', '_'))
+ return env_val
+
+
+def print_deprecated_option(name, new_name):
+ print("WARN: Option '%s' is deprecated. Use '%s' instead." % (name, new_name))
+
+
+staticbuild = bool(os.environ.get('STATICBUILD', ''))
+# pick up any commandline options and/or env variables
+OPTION_WITHOUT_OBJECTIFY = has_option('without-objectify')
+OPTION_WITH_UNICODE_STRINGS = has_option('with-unicode-strings')
+OPTION_WITHOUT_ASSERT = has_option('without-assert')
+OPTION_WITHOUT_THREADING = has_option('without-threading')
+OPTION_WITHOUT_CYTHON = has_option('without-cython')
+OPTION_WITH_CYTHON = has_option('with-cython')
+OPTION_WITH_CYTHON_GDB = has_option('cython-gdb')
+OPTION_WITH_REFNANNY = has_option('with-refnanny')
+OPTION_WITH_COVERAGE = has_option('with-coverage')
+OPTION_WITH_CLINES = has_option('with-clines')
+if OPTION_WITHOUT_CYTHON:
+ CYTHON_INSTALLED = False
+OPTION_STATIC = staticbuild or has_option('static')
+OPTION_DEBUG_GCC = has_option('debug-gcc')
+OPTION_SHOW_WARNINGS = has_option('warnings')
+OPTION_AUTO_RPATH = has_option('auto-rpath')
+OPTION_BUILD_LIBXML2XSLT = staticbuild or has_option('static-deps')
+if OPTION_BUILD_LIBXML2XSLT:
+ OPTION_STATIC = True
+OPTION_WITH_XML2_CONFIG = option_value('with-xml2-config') or option_value('xml2-config', deprecated_for='with-xml2-config')
+OPTION_WITH_XSLT_CONFIG = option_value('with-xslt-config') or option_value('xslt-config', deprecated_for='with-xslt-config')
+OPTION_LIBXML2_VERSION = option_value('libxml2-version')
+OPTION_LIBXSLT_VERSION = option_value('libxslt-version')
+OPTION_LIBICONV_VERSION = option_value('libiconv-version')
+OPTION_ZLIB_VERSION = option_value('zlib-version')
+OPTION_MULTICORE = option_value('multicore')
+OPTION_DOWNLOAD_DIR = option_value('download-dir')
+if OPTION_DOWNLOAD_DIR is None:
+ OPTION_DOWNLOAD_DIR = 'libs'
diff --git a/src/lxml/ElementInclude.py b/src/lxml/ElementInclude.py
new file mode 100644
index 0000000..2188433
--- /dev/null
+++ b/src/lxml/ElementInclude.py
@@ -0,0 +1,244 @@
+#
+# ElementTree
+# $Id: ElementInclude.py 1862 2004-06-18 07:31:02Z Fredrik $
+#
+# limited xinclude support for element trees
+#
+# history:
+# 2003-08-15 fl created
+# 2003-11-14 fl fixed default loader
+#
+# Copyright (c) 2003-2004 by Fredrik Lundh. All rights reserved.
+#
+# fredrik@pythonware.com
+# http://www.pythonware.com
+#
+# --------------------------------------------------------------------
+# The ElementTree toolkit is
+#
+# Copyright (c) 1999-2004 by Fredrik Lundh
+#
+# By obtaining, using, and/or copying this software and/or its
+# associated documentation, you agree that you have read, understood,
+# and will comply with the following terms and conditions:
+#
+# Permission to use, copy, modify, and distribute this software and
+# its associated documentation for any purpose and without fee is
+# hereby granted, provided that the above copyright notice appears in
+# all copies, and that both that copyright notice and this permission
+# notice appear in supporting documentation, and that the name of
+# Secret Labs AB or the author not be used in advertising or publicity
+# pertaining to distribution of the software without specific, written
+# prior permission.
+#
+# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
+# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
+# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
+# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
+# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+# OF THIS SOFTWARE.
+# --------------------------------------------------------------------
+
+"""
+Limited XInclude support for the ElementTree package.
+
+While lxml.etree has full support for XInclude (see
+`etree.ElementTree.xinclude()`), this module provides a simpler, pure
+Python, ElementTree compatible implementation that supports a simple
+form of custom URL resolvers.
+"""
+
+from lxml import etree
+try:
+ from urlparse import urljoin
+ from urllib2 import urlopen
+except ImportError:
+ # Python 3
+ from urllib.parse import urljoin
+ from urllib.request import urlopen
+
+XINCLUDE = "{http://www.w3.org/2001/XInclude}"
+
+XINCLUDE_INCLUDE = XINCLUDE + "include"
+XINCLUDE_FALLBACK = XINCLUDE + "fallback"
+XINCLUDE_ITER_TAG = XINCLUDE + "*"
+
+# For security reasons, the inclusion depth is limited to this read-only value by default.
+DEFAULT_MAX_INCLUSION_DEPTH = 6
+
+
+##
+# Fatal include error.
+
+class FatalIncludeError(etree.LxmlSyntaxError):
+ pass
+
+
+class LimitedRecursiveIncludeError(FatalIncludeError):
+ pass
+
+
+##
+# ET compatible default loader.
+# This loader reads an included resource from disk.
+#
+# @param href Resource reference.
+# @param parse Parse mode. Either "xml" or "text".
+# @param encoding Optional text encoding.
+# @return The expanded resource. If the parse mode is "xml", this
+# is an ElementTree instance. If the parse mode is "text", this
+# is a Unicode string. If the loader fails, it can return None
+# or raise an IOError exception.
+# @throws IOError If the loader fails to load the resource.
+
+def default_loader(href, parse, encoding=None):
+ file = open(href, 'rb')
+ if parse == "xml":
+ data = etree.parse(file).getroot()
+ else:
+ data = file.read()
+ if not encoding:
+ encoding = 'utf-8'
+ data = data.decode(encoding)
+ file.close()
+ return data
+
+
+##
+# Default loader used by lxml.etree - handles custom resolvers properly
+#
+
+def _lxml_default_loader(href, parse, encoding=None, parser=None):
+ if parse == "xml":
+ data = etree.parse(href, parser).getroot()
+ else:
+ if "://" in href:
+ f = urlopen(href)
+ else:
+ f = open(href, 'rb')
+ data = f.read()
+ f.close()
+ if not encoding:
+ encoding = 'utf-8'
+ data = data.decode(encoding)
+ return data
+
+
+##
+# Wrapper for ET compatibility - drops the parser
+
+def _wrap_et_loader(loader):
+ def load(href, parse, encoding=None, parser=None):
+ return loader(href, parse, encoding)
+ return load
+
+
+##
+# Expand XInclude directives.
+#
+# @param elem Root element.
+# @param loader Optional resource loader. If omitted, it defaults
+# to {@link default_loader}. If given, it should be a callable
+# that implements the same interface as <b>default_loader</b>.
+# @param base_url The base URL of the original file, to resolve
+# relative include file references.
+# @param max_depth The maximum number of recursive inclusions.
+# Limited to reduce the risk of malicious content explosion.
+# Pass None to disable the limitation.
+# @throws LimitedRecursiveIncludeError If the {@link max_depth} was exceeded.
+# @throws FatalIncludeError If the function fails to include a given
+# resource, or if the tree contains malformed XInclude elements.
+# @throws IOError If the function fails to load a given resource.
+# @returns the node or its replacement if it was an XInclude node
+
+def include(elem, loader=None, base_url=None,
+ max_depth=DEFAULT_MAX_INCLUSION_DEPTH):
+ if max_depth is None:
+ max_depth = -1
+ elif max_depth < 0:
+ raise ValueError("expected non-negative depth or None for 'max_depth', got %r" % max_depth)
+
+ if base_url is None:
+ if hasattr(elem, 'getroot'):
+ tree = elem
+ elem = elem.getroot()
+ else:
+ tree = elem.getroottree()
+ if hasattr(tree, 'docinfo'):
+ base_url = tree.docinfo.URL
+ elif hasattr(elem, 'getroot'):
+ elem = elem.getroot()
+ _include(elem, loader, base_url, max_depth)
+
+
+def _include(elem, loader=None, base_url=None,
+ max_depth=DEFAULT_MAX_INCLUSION_DEPTH, _parent_hrefs=None):
+ if loader is not None:
+ load_include = _wrap_et_loader(loader)
+ else:
+ load_include = _lxml_default_loader
+
+ if _parent_hrefs is None:
+ _parent_hrefs = set()
+
+ parser = elem.getroottree().parser
+
+ include_elements = list(
+ elem.iter(XINCLUDE_ITER_TAG))
+
+ for e in include_elements:
+ if e.tag == XINCLUDE_INCLUDE:
+ # process xinclude directive
+ href = urljoin(base_url, e.get("href"))
+ parse = e.get("parse", "xml")
+ parent = e.getparent()
+ if parse == "xml":
+ if href in _parent_hrefs:
+ raise FatalIncludeError(
+ "recursive include of %r detected" % href
+ )
+ if max_depth == 0:
+ raise LimitedRecursiveIncludeError(
+ "maximum xinclude depth reached when including file %s" % href)
+ node = load_include(href, parse, parser=parser)
+ if node is None:
+ raise FatalIncludeError(
+ "cannot load %r as %r" % (href, parse)
+ )
+ node = _include(node, loader, href, max_depth - 1, {href} | _parent_hrefs)
+ if e.tail:
+ node.tail = (node.tail or "") + e.tail
+ if parent is None:
+ return node # replaced the root node!
+ parent.replace(e, node)
+ elif parse == "text":
+ text = load_include(href, parse, encoding=e.get("encoding"))
+ if text is None:
+ raise FatalIncludeError(
+ "cannot load %r as %r" % (href, parse)
+ )
+ predecessor = e.getprevious()
+ if predecessor is not None:
+ predecessor.tail = (predecessor.tail or "") + text
+ elif parent is None:
+ return text # replaced the root node!
+ else:
+ parent.text = (parent.text or "") + text + (e.tail or "")
+ parent.remove(e)
+ else:
+ raise FatalIncludeError(
+ "unknown parse type in xi:include tag (%r)" % parse
+ )
+ elif e.tag == XINCLUDE_FALLBACK:
+ parent = e.getparent()
+ if parent is not None and parent.tag != XINCLUDE_INCLUDE:
+ raise FatalIncludeError(
+ "xi:fallback tag must be child of xi:include (%r)" % e.tag
+ )
+ else:
+ raise FatalIncludeError(
+ "Invalid element found in XInclude namespace (%r)" % e.tag
+ )
+ return elem
diff --git a/src/lxml/__init__.pxd b/src/lxml/__init__.pxd
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/lxml/__init__.pxd
diff --git a/src/lxml/__init__.py b/src/lxml/__init__.py
new file mode 100644
index 0000000..c569544
--- /dev/null
+++ b/src/lxml/__init__.py
@@ -0,0 +1,23 @@
+# this is a package
+
+__version__ = "4.6.3"
+
+
+def get_include():
+ """
+ Returns a list of header include paths (for lxml itself, libxml2
+ and libxslt) needed to compile C code against lxml if it was built
+ with statically linked libraries.
+ """
+ import os
+ lxml_path = __path__[0]
+ include_path = os.path.join(lxml_path, 'includes')
+ includes = [include_path, lxml_path]
+
+ for name in os.listdir(include_path):
+ path = os.path.join(include_path, name)
+ if os.path.isdir(path):
+ includes.append(path)
+
+ return includes
+
diff --git a/src/lxml/_elementpath.py b/src/lxml/_elementpath.py
new file mode 100644
index 0000000..eabd81c
--- /dev/null
+++ b/src/lxml/_elementpath.py
@@ -0,0 +1,345 @@
+# cython: language_level=2
+
+#
+# ElementTree
+# $Id: ElementPath.py 3375 2008-02-13 08:05:08Z fredrik $
+#
+# limited xpath support for element trees
+#
+# history:
+# 2003-05-23 fl created
+# 2003-05-28 fl added support for // etc
+# 2003-08-27 fl fixed parsing of periods in element names
+# 2007-09-10 fl new selection engine
+# 2007-09-12 fl fixed parent selector
+# 2007-09-13 fl added iterfind; changed findall to return a list
+# 2007-11-30 fl added namespaces support
+# 2009-10-30 fl added child element value filter
+#
+# Copyright (c) 2003-2009 by Fredrik Lundh. All rights reserved.
+#
+# fredrik@pythonware.com
+# http://www.pythonware.com
+#
+# --------------------------------------------------------------------
+# The ElementTree toolkit is
+#
+# Copyright (c) 1999-2009 by Fredrik Lundh
+#
+# By obtaining, using, and/or copying this software and/or its
+# associated documentation, you agree that you have read, understood,
+# and will comply with the following terms and conditions:
+#
+# Permission to use, copy, modify, and distribute this software and
+# its associated documentation for any purpose and without fee is
+# hereby granted, provided that the above copyright notice appears in
+# all copies, and that both that copyright notice and this permission
+# notice appear in supporting documentation, and that the name of
+# Secret Labs AB or the author not be used in advertising or publicity
+# pertaining to distribution of the software without specific, written
+# prior permission.
+#
+# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
+# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
+# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
+# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
+# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+# OF THIS SOFTWARE.
+# --------------------------------------------------------------------
+
+##
+# Implementation module for XPath support. There's usually no reason
+# to import this module directly; the <b>ElementTree</b> does this for
+# you, if needed.
+##
+
+from __future__ import absolute_import
+
+import re
+
+xpath_tokenizer_re = re.compile(
+ "("
+ "'[^']*'|\"[^\"]*\"|"
+ "::|"
+ "//?|"
+ r"\.\.|"
+ r"\(\)|"
+ r"[/.*:\[\]\(\)@=])|"
+ r"((?:\{[^}]+\})?[^/\[\]\(\)@=\s]+)|"
+ r"\s+"
+ )
+
+def xpath_tokenizer(pattern, namespaces=None):
+ # ElementTree uses '', lxml used None originally.
+ default_namespace = (namespaces.get(None) or namespaces.get('')) if namespaces else None
+ parsing_attribute = False
+ for token in xpath_tokenizer_re.findall(pattern):
+ ttype, tag = token
+ if tag and tag[0] != "{":
+ if ":" in tag:
+ prefix, uri = tag.split(":", 1)
+ try:
+ if not namespaces:
+ raise KeyError
+ yield ttype, "{%s}%s" % (namespaces[prefix], uri)
+ except KeyError:
+ raise SyntaxError("prefix %r not found in prefix map" % prefix)
+ elif default_namespace and not parsing_attribute:
+ yield ttype, "{%s}%s" % (default_namespace, tag)
+ else:
+ yield token
+ parsing_attribute = False
+ else:
+ yield token
+ parsing_attribute = ttype == '@'
+
+
+def prepare_child(next, token):
+ tag = token[1]
+ def select(result):
+ for elem in result:
+ for e in elem.iterchildren(tag):
+ yield e
+ return select
+
+def prepare_star(next, token):
+ def select(result):
+ for elem in result:
+ for e in elem.iterchildren('*'):
+ yield e
+ return select
+
+def prepare_self(next, token):
+ def select(result):
+ return result
+ return select
+
+def prepare_descendant(next, token):
+ token = next()
+ if token[0] == "*":
+ tag = "*"
+ elif not token[0]:
+ tag = token[1]
+ else:
+ raise SyntaxError("invalid descendant")
+ def select(result):
+ for elem in result:
+ for e in elem.iterdescendants(tag):
+ yield e
+ return select
+
+def prepare_parent(next, token):
+ def select(result):
+ for elem in result:
+ parent = elem.getparent()
+ if parent is not None:
+ yield parent
+ return select
+
+def prepare_predicate(next, token):
+ # FIXME: replace with real parser!!! refs:
+ # http://effbot.org/zone/simple-iterator-parser.htm
+ # http://javascript.crockford.com/tdop/tdop.html
+ signature = ''
+ predicate = []
+ while 1:
+ token = next()
+ if token[0] == "]":
+ break
+ if token == ('', ''):
+ # ignore whitespace
+ continue
+ if token[0] and token[0][:1] in "'\"":
+ token = "'", token[0][1:-1]
+ signature += token[0] or "-"
+ predicate.append(token[1])
+
+ # use signature to determine predicate type
+ if signature == "@-":
+ # [@attribute] predicate
+ key = predicate[1]
+ def select(result):
+ for elem in result:
+ if elem.get(key) is not None:
+ yield elem
+ return select
+ if signature == "@-='":
+ # [@attribute='value']
+ key = predicate[1]
+ value = predicate[-1]
+ def select(result):
+ for elem in result:
+ if elem.get(key) == value:
+ yield elem
+ return select
+ if signature == "-" and not re.match(r"-?\d+$", predicate[0]):
+ # [tag]
+ tag = predicate[0]
+ def select(result):
+ for elem in result:
+ for _ in elem.iterchildren(tag):
+ yield elem
+ break
+ return select
+ if signature == ".='" or (signature == "-='" and not re.match(r"-?\d+$", predicate[0])):
+ # [.='value'] or [tag='value']
+ tag = predicate[0]
+ value = predicate[-1]
+ if tag:
+ def select(result):
+ for elem in result:
+ for e in elem.iterchildren(tag):
+ if "".join(e.itertext()) == value:
+ yield elem
+ break
+ else:
+ def select(result):
+ for elem in result:
+ if "".join(elem.itertext()) == value:
+ yield elem
+ return select
+ if signature == "-" or signature == "-()" or signature == "-()-":
+ # [index] or [last()] or [last()-index]
+ if signature == "-":
+ # [index]
+ index = int(predicate[0]) - 1
+ if index < 0:
+ if index == -1:
+ raise SyntaxError(
+ "indices in path predicates are 1-based, not 0-based")
+ else:
+ raise SyntaxError("path index >= 1 expected")
+ else:
+ if predicate[0] != "last":
+ raise SyntaxError("unsupported function")
+ if signature == "-()-":
+ try:
+ index = int(predicate[2]) - 1
+ except ValueError:
+ raise SyntaxError("unsupported expression")
+ else:
+ index = -1
+ def select(result):
+ for elem in result:
+ parent = elem.getparent()
+ if parent is None:
+ continue
+ try:
+ # FIXME: what if the selector is "*" ?
+ elems = list(parent.iterchildren(elem.tag))
+ if elems[index] is elem:
+ yield elem
+ except IndexError:
+ pass
+ return select
+ raise SyntaxError("invalid predicate")
+
+ops = {
+ "": prepare_child,
+ "*": prepare_star,
+ ".": prepare_self,
+ "..": prepare_parent,
+ "//": prepare_descendant,
+ "[": prepare_predicate,
+}
+
+
+# --------------------------------------------------------------------
+
+_cache = {}
+
+
+def _build_path_iterator(path, namespaces):
+ """compile selector pattern"""
+ if path[-1:] == "/":
+ path += "*" # implicit all (FIXME: keep this?)
+
+ cache_key = (path,)
+ if namespaces:
+ # lxml originally used None for the default namespace but ElementTree uses the
+ # more convenient (all-strings-dict) empty string, so we support both here,
+ # preferring the more convenient '', as long as they aren't ambiguous.
+ if None in namespaces:
+ if '' in namespaces and namespaces[None] != namespaces['']:
+ raise ValueError("Ambiguous default namespace provided: %r versus %r" % (
+ namespaces[None], namespaces['']))
+ cache_key += (namespaces[None],) + tuple(sorted(
+ item for item in namespaces.items() if item[0] is not None))
+ else:
+ cache_key += tuple(sorted(namespaces.items()))
+
+ try:
+ return _cache[cache_key]
+ except KeyError:
+ pass
+ if len(_cache) > 100:
+ _cache.clear()
+
+ if path[:1] == "/":
+ raise SyntaxError("cannot use absolute path on element")
+ stream = iter(xpath_tokenizer(path, namespaces))
+ try:
+ _next = stream.next
+ except AttributeError:
+ # Python 3
+ _next = stream.__next__
+ try:
+ token = _next()
+ except StopIteration:
+ raise SyntaxError("empty path expression")
+ selector = []
+ while 1:
+ try:
+ selector.append(ops[token[0]](_next, token))
+ except StopIteration:
+ raise SyntaxError("invalid path")
+ try:
+ token = _next()
+ if token[0] == "/":
+ token = _next()
+ except StopIteration:
+ break
+ _cache[cache_key] = selector
+ return selector
+
+
+##
+# Iterate over the matching nodes
+
+def iterfind(elem, path, namespaces=None):
+ selector = _build_path_iterator(path, namespaces)
+ result = iter((elem,))
+ for select in selector:
+ result = select(result)
+ return result
+
+
+##
+# Find first matching object.
+
+def find(elem, path, namespaces=None):
+ it = iterfind(elem, path, namespaces)
+ try:
+ return next(it)
+ except StopIteration:
+ return None
+
+
+##
+# Find all matching objects.
+
+def findall(elem, path, namespaces=None):
+ return list(iterfind(elem, path, namespaces))
+
+
+##
+# Find text for first matching object.
+
+def findtext(elem, path, default=None, namespaces=None):
+ el = find(elem, path, namespaces)
+ if el is None:
+ return default
+ else:
+ return el.text or ''
diff --git a/src/lxml/apihelpers.pxi b/src/lxml/apihelpers.pxi
new file mode 100644
index 0000000..5eb3416
--- /dev/null
+++ b/src/lxml/apihelpers.pxi
@@ -0,0 +1,1799 @@
+# Private/public helper functions for API functions
+
+from lxml.includes cimport uri
+
+
+cdef void displayNode(xmlNode* c_node, indent):
+ # to help with debugging
+ cdef xmlNode* c_child
+ try:
+ print indent * u' ', <long>c_node
+ c_child = c_node.children
+ while c_child is not NULL:
+ displayNode(c_child, indent + 1)
+ c_child = c_child.next
+ finally:
+ return # swallow any exceptions
+
+cdef inline int _assertValidNode(_Element element) except -1:
+ assert element._c_node is not NULL, u"invalid Element proxy at %s" % id(element)
+
+cdef inline int _assertValidDoc(_Document doc) except -1:
+ assert doc._c_doc is not NULL, u"invalid Document proxy at %s" % id(doc)
+
+cdef _Document _documentOrRaise(object input):
+ u"""Call this to get the document of a _Document, _ElementTree or _Element
+ object, or to raise an exception if it can't be determined.
+
+ Should be used in all API functions for consistency.
+ """
+ cdef _Document doc
+ if isinstance(input, _ElementTree):
+ if (<_ElementTree>input)._context_node is not None:
+ doc = (<_ElementTree>input)._context_node._doc
+ else:
+ doc = None
+ elif isinstance(input, _Element):
+ doc = (<_Element>input)._doc
+ elif isinstance(input, _Document):
+ doc = <_Document>input
+ else:
+ raise TypeError, f"Invalid input object: {python._fqtypename(input).decode('utf8')}"
+ if doc is None:
+ raise ValueError, f"Input object has no document: {python._fqtypename(input).decode('utf8')}"
+ _assertValidDoc(doc)
+ return doc
+
+cdef _Element _rootNodeOrRaise(object input):
+ u"""Call this to get the root node of a _Document, _ElementTree or
+ _Element object, or to raise an exception if it can't be determined.
+
+ Should be used in all API functions for consistency.
+ """
+ cdef _Element node
+ if isinstance(input, _ElementTree):
+ node = (<_ElementTree>input)._context_node
+ elif isinstance(input, _Element):
+ node = <_Element>input
+ elif isinstance(input, _Document):
+ node = (<_Document>input).getroot()
+ else:
+ raise TypeError, f"Invalid input object: {python._fqtypename(input).decode('utf8')}"
+ if (node is None or not node._c_node or
+ node._c_node.type != tree.XML_ELEMENT_NODE):
+ raise ValueError, f"Input object is not an XML element: {python._fqtypename(input).decode('utf8')}"
+ _assertValidNode(node)
+ return node
+
+cdef bint _isAncestorOrSame(xmlNode* c_ancestor, xmlNode* c_node):
+ while c_node:
+ if c_node is c_ancestor:
+ return True
+ c_node = c_node.parent
+ return False
+
+cdef _Element _makeElement(tag, xmlDoc* c_doc, _Document doc,
+ _BaseParser parser, text, tail, attrib, nsmap,
+ dict extra_attrs):
+ u"""Create a new element and initialize text content, namespaces and
+ attributes.
+
+ This helper function will reuse as much of the existing document as
+ possible:
+
+ If 'parser' is None, the parser will be inherited from 'doc' or the
+ default parser will be used.
+
+ If 'doc' is None, 'c_doc' is used to create a new _Document and the new
+ element is made its root node.
+
+ If 'c_doc' is also NULL, a new xmlDoc will be created.
+ """
+ cdef xmlNode* c_node
+ if doc is not None:
+ c_doc = doc._c_doc
+ ns_utf, name_utf = _getNsTag(tag)
+ if parser is not None and parser._for_html:
+ _htmlTagValidOrRaise(name_utf)
+ if c_doc is NULL:
+ c_doc = _newHTMLDoc()
+ else:
+ _tagValidOrRaise(name_utf)
+ if c_doc is NULL:
+ c_doc = _newXMLDoc()
+ c_node = _createElement(c_doc, name_utf)
+ if c_node is NULL:
+ if doc is None and c_doc is not NULL:
+ tree.xmlFreeDoc(c_doc)
+ raise MemoryError()
+ try:
+ if doc is None:
+ tree.xmlDocSetRootElement(c_doc, c_node)
+ doc = _documentFactory(c_doc, parser)
+ if text is not None:
+ _setNodeText(c_node, text)
+ if tail is not None:
+ _setTailText(c_node, tail)
+ # add namespaces to node if necessary
+ _setNodeNamespaces(c_node, doc, ns_utf, nsmap)
+ _initNodeAttributes(c_node, doc, attrib, extra_attrs)
+ return _elementFactory(doc, c_node)
+ except:
+ # free allocated c_node/c_doc unless Python does it for us
+ if c_node.doc is not c_doc:
+ # node not yet in document => will not be freed by document
+ if tail is not None:
+ _removeText(c_node.next) # tail
+ tree.xmlFreeNode(c_node)
+ if doc is None:
+ # c_doc will not be freed by doc
+ tree.xmlFreeDoc(c_doc)
+ raise
+
+cdef int _initNewElement(_Element element, bint is_html, name_utf, ns_utf,
+ _BaseParser parser, attrib, nsmap, dict extra_attrs) except -1:
+ u"""Initialise a new Element object.
+
+ This is used when users instantiate a Python Element subclass
+ directly, without it being mapped to an existing XML node.
+ """
+ cdef xmlDoc* c_doc
+ cdef xmlNode* c_node
+ cdef _Document doc
+ if is_html:
+ _htmlTagValidOrRaise(name_utf)
+ c_doc = _newHTMLDoc()
+ else:
+ _tagValidOrRaise(name_utf)
+ c_doc = _newXMLDoc()
+ c_node = _createElement(c_doc, name_utf)
+ if c_node is NULL:
+ if c_doc is not NULL:
+ tree.xmlFreeDoc(c_doc)
+ raise MemoryError()
+ tree.xmlDocSetRootElement(c_doc, c_node)
+ doc = _documentFactory(c_doc, parser)
+ # add namespaces to node if necessary
+ _setNodeNamespaces(c_node, doc, ns_utf, nsmap)
+ _initNodeAttributes(c_node, doc, attrib, extra_attrs)
+ _registerProxy(element, doc, c_node)
+ element._init()
+ return 0
+
+cdef _Element _makeSubElement(_Element parent, tag, text, tail,
+ attrib, nsmap, dict extra_attrs):
+ u"""Create a new child element and initialize text content, namespaces and
+ attributes.
+ """
+ cdef xmlNode* c_node
+ cdef xmlDoc* c_doc
+ if parent is None or parent._doc is None:
+ return None
+ _assertValidNode(parent)
+ ns_utf, name_utf = _getNsTag(tag)
+ c_doc = parent._doc._c_doc
+
+ if parent._doc._parser is not None and parent._doc._parser._for_html:
+ _htmlTagValidOrRaise(name_utf)
+ else:
+ _tagValidOrRaise(name_utf)
+
+ c_node = _createElement(c_doc, name_utf)
+ if c_node is NULL:
+ raise MemoryError()
+ tree.xmlAddChild(parent._c_node, c_node)
+
+ try:
+ if text is not None:
+ _setNodeText(c_node, text)
+ if tail is not None:
+ _setTailText(c_node, tail)
+
+ # add namespaces to node if necessary
+ _setNodeNamespaces(c_node, parent._doc, ns_utf, nsmap)
+ _initNodeAttributes(c_node, parent._doc, attrib, extra_attrs)
+ return _elementFactory(parent._doc, c_node)
+ except:
+ # make sure we clean up in case of an error
+ _removeNode(parent._doc, c_node)
+ raise
+
+
+cdef int _setNodeNamespaces(xmlNode* c_node, _Document doc,
+ object node_ns_utf, object nsmap) except -1:
+ u"""Lookup current namespace prefixes, then set namespace structure for
+ node (if 'node_ns_utf' was provided) and register new ns-prefix mappings.
+
+ 'node_ns_utf' should only be passed for a newly created node.
+ """
+ cdef xmlNs* c_ns
+ cdef list nsdefs
+
+ if nsmap:
+ for prefix, href in _iter_nsmap(nsmap):
+ href_utf = _utf8(href)
+ _uriValidOrRaise(href_utf)
+ c_href = _xcstr(href_utf)
+ if prefix is not None:
+ prefix_utf = _utf8(prefix)
+ _prefixValidOrRaise(prefix_utf)
+ c_prefix = _xcstr(prefix_utf)
+ else:
+ c_prefix = <const_xmlChar*>NULL
+ # add namespace with prefix if it is not already known
+ c_ns = tree.xmlSearchNs(doc._c_doc, c_node, c_prefix)
+ if c_ns is NULL or \
+ c_ns.href is NULL or \
+ tree.xmlStrcmp(c_ns.href, c_href) != 0:
+ c_ns = tree.xmlNewNs(c_node, c_href, c_prefix)
+ if href_utf == node_ns_utf:
+ tree.xmlSetNs(c_node, c_ns)
+ node_ns_utf = None
+
+ if node_ns_utf is not None:
+ _uriValidOrRaise(node_ns_utf)
+ doc._setNodeNs(c_node, _xcstr(node_ns_utf))
+ return 0
+
+
+cdef dict _build_nsmap(xmlNode* c_node):
+ """
+ Namespace prefix->URI mapping known in the context of this Element.
+ This includes all namespace declarations of the parents.
+ """
+ cdef xmlNs* c_ns
+ nsmap = {}
+ while c_node is not NULL and c_node.type == tree.XML_ELEMENT_NODE:
+ c_ns = c_node.nsDef
+ while c_ns is not NULL:
+ prefix = funicodeOrNone(c_ns.prefix)
+ if prefix not in nsmap:
+ nsmap[prefix] = funicodeOrNone(c_ns.href)
+ c_ns = c_ns.next
+ c_node = c_node.parent
+ return nsmap
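+
+# A rough illustration of the mapping built above (the URIs below are
+# made-up examples): for a tree like
+#   <root xmlns:a="http://a"><child xmlns:b="http://b" xmlns:a="http://a2"/></root>
+# the nsmap built for the <child> node would be
+#   {'b': 'http://b', 'a': 'http://a2'}
+# since the innermost declaration of each prefix is seen first while walking
+# towards the root, and parent declarations are only added for prefixes that
+# are not in the mapping yet.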
+
+
+cdef _iter_nsmap(nsmap):
+ """
+ Create a reproducibly ordered iterable from an nsmap mapping.
+ Tries to preserve an existing order and sorts if it assumes no order.
+
+ The difference from _iter_attrib() is that None doesn't sort together with
+ strings in Py3.x.
+ """
+ if python.PY_VERSION_HEX >= 0x03060000:
+ # dicts are insertion-ordered in Py3.6+ => keep the user provided order.
+ if isinstance(nsmap, dict):
+ return nsmap.items()
+ if len(nsmap) <= 1:
+ return nsmap.items()
+ # nsmap will usually be a plain unordered dict => avoid type checking overhead
+ if type(nsmap) is not dict and isinstance(nsmap, OrderedDict):
+ return nsmap.items() # keep existing order
+ if None not in nsmap:
+ return sorted(nsmap.items())
+
+ # Move the default namespace to the end. This makes sure libxml2
+ # prefers a prefix if the ns is defined redundantly on the same
+ # element. That way, users can work around a problem themselves
+ # where default namespace attributes on non-default namespaced
+ # elements serialise without prefix (i.e. into the non-default
+ # namespace).
+ default_ns = nsmap[None]
+ nsdefs = [(k, v) for k, v in nsmap.items() if k is not None]
+ nsdefs.sort()
+ nsdefs.append((None, default_ns))
+ return nsdefs
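+
+# A rough illustration of the resulting order (made-up example values):
+# for a mapping without a guaranteed order (e.g. a plain dict on Python
+# versions before 3.6) such as
+#   {'b': 'http://b', None: 'http://default', 'a': 'http://a'}
+# the items would come back sorted by prefix with the default namespace
+# entry moved to the end:
+#   [('a', 'http://a'), ('b', 'http://b'), (None, 'http://default')]
+# whereas a plain dict on Python 3.6+ keeps its insertion order unchanged.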
+
+
+cdef _iter_attrib(attrib):
+ """
+ Create a reproducibly ordered iterable from an attrib mapping.
+ Tries to preserve an existing order and sorts if it assumes no order.
+ """
+ # dicts are insertion-ordered in Py3.6+ => keep the user provided order.
+ if python.PY_VERSION_HEX >= 0x03060000 and isinstance(attrib, dict) or (
+ isinstance(attrib, (_Attrib, OrderedDict))):
+ return attrib.items()
+ # assume it's an unordered mapping of some kind
+ return sorted(attrib.items())
+
+
+cdef _initNodeAttributes(xmlNode* c_node, _Document doc, attrib, dict extra):
+ u"""Initialise the attributes of an element node.
+ """
+ cdef bint is_html
+ cdef xmlNs* c_ns
+ if attrib is not None and not hasattr(attrib, u'items'):
+ raise TypeError, f"Invalid attribute dictionary: {python._fqtypename(attrib).decode('utf8')}"
+ if not attrib and not extra:
+ return # nothing to do
+ is_html = doc._parser._for_html
+ seen = set()
+ if extra:
+ if python.PY_VERSION_HEX >= 0x03060000:
+ for name, value in extra.items():
+ _addAttributeToNode(c_node, doc, is_html, name, value, seen)
+ else:
+ for name, value in sorted(extra.items()):
+ _addAttributeToNode(c_node, doc, is_html, name, value, seen)
+ if attrib:
+ for name, value in _iter_attrib(attrib):
+ _addAttributeToNode(c_node, doc, is_html, name, value, seen)
+
+
+cdef int _addAttributeToNode(xmlNode* c_node, _Document doc, bint is_html,
+ name, value, set seen_tags) except -1:
+ ns_utf, name_utf = tag = _getNsTag(name)
+ if tag in seen_tags:
+ return 0
+ seen_tags.add(tag)
+ if not is_html:
+ _attributeValidOrRaise(name_utf)
+ value_utf = _utf8(value)
+ if ns_utf is None:
+ tree.xmlNewProp(c_node, _xcstr(name_utf), _xcstr(value_utf))
+ else:
+ _uriValidOrRaise(ns_utf)
+ c_ns = doc._findOrBuildNodeNs(c_node, _xcstr(ns_utf), NULL, 1)
+ tree.xmlNewNsProp(c_node, c_ns,
+ _xcstr(name_utf), _xcstr(value_utf))
+ return 0
+
+
+ctypedef struct _ns_node_ref:
+ xmlNs* ns
+ xmlNode* node
+
+
+cdef int _collectNsDefs(xmlNode* c_element, _ns_node_ref **_c_ns_list,
+ size_t *_c_ns_list_len, size_t *_c_ns_list_size) except -1:
+ c_ns_list = _c_ns_list[0]
+ cdef size_t c_ns_list_len = _c_ns_list_len[0]
+ cdef size_t c_ns_list_size = _c_ns_list_size[0]
+
+ c_nsdef = c_element.nsDef
+ while c_nsdef is not NULL:
+ if c_ns_list_len >= c_ns_list_size:
+ if c_ns_list is NULL:
+ c_ns_list_size = 20
+ else:
+ c_ns_list_size *= 2
+ c_nsref_ptr = <_ns_node_ref*> python.lxml_realloc(
+ c_ns_list, c_ns_list_size, sizeof(_ns_node_ref))
+ if c_nsref_ptr is NULL:
+ if c_ns_list is not NULL:
+ python.lxml_free(c_ns_list)
+ _c_ns_list[0] = NULL
+ raise MemoryError()
+ c_ns_list = c_nsref_ptr
+
+ c_ns_list[c_ns_list_len] = _ns_node_ref(c_nsdef, c_element)
+ c_ns_list_len += 1
+ c_nsdef = c_nsdef.next
+
+ _c_ns_list_size[0] = c_ns_list_size
+ _c_ns_list_len[0] = c_ns_list_len
+ _c_ns_list[0] = c_ns_list
+
+
+cdef int _removeUnusedNamespaceDeclarations(xmlNode* c_element, set prefixes_to_keep) except -1:
+ u"""Remove any namespace declarations from a subtree that are not used by
+ any of its elements (or attributes).
+
+ If 'prefixes_to_keep' is provided, it must be a set of prefixes.
+ Any corresponding namespace mappings will not be removed as part of the cleanup.
+ """
+ cdef xmlNode* c_node
+ cdef _ns_node_ref* c_ns_list = NULL
+ cdef size_t c_ns_list_size = 0
+ cdef size_t c_ns_list_len = 0
+ cdef size_t i
+
+ if c_element.parent and c_element.parent.type == tree.XML_DOCUMENT_NODE:
+ # include declarations on the document node
+ _collectNsDefs(c_element.parent, &c_ns_list, &c_ns_list_len, &c_ns_list_size)
+
+ tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_element, c_element, 1)
+ # collect all new namespace declarations into the ns list
+ if c_element.nsDef:
+ _collectNsDefs(c_element, &c_ns_list, &c_ns_list_len, &c_ns_list_size)
+
+ # remove all namespace declarations from the list that are referenced
+ if c_ns_list_len and c_element.type == tree.XML_ELEMENT_NODE:
+ c_node = c_element
+ while c_node and c_ns_list_len:
+ if c_node.ns:
+ for i in range(c_ns_list_len):
+ if c_node.ns is c_ns_list[i].ns:
+ c_ns_list_len -= 1
+ c_ns_list[i] = c_ns_list[c_ns_list_len]
+ #c_ns_list[c_ns_list_len] = _ns_node_ref(NULL, NULL)
+ break
+ if c_node is c_element:
+ # continue with attributes
+ c_node = <xmlNode*>c_element.properties
+ else:
+ c_node = c_node.next
+ tree.END_FOR_EACH_ELEMENT_FROM(c_element)
+
+ if c_ns_list is NULL:
+ return 0
+
+ # free all namespace declarations that remained in the list,
+ # except for those we should keep explicitly
+ cdef xmlNs* c_nsdef
+ for i in range(c_ns_list_len):
+ if prefixes_to_keep is not None:
+ if c_ns_list[i].ns.prefix and c_ns_list[i].ns.prefix in prefixes_to_keep:
+ continue
+ c_node = c_ns_list[i].node
+ c_nsdef = c_node.nsDef
+ if c_nsdef is c_ns_list[i].ns:
+ c_node.nsDef = c_node.nsDef.next
+ else:
+ while c_nsdef.next is not c_ns_list[i].ns:
+ c_nsdef = c_nsdef.next
+ c_nsdef.next = c_nsdef.next.next
+ tree.xmlFreeNs(c_ns_list[i].ns)
+
+ if c_ns_list is not NULL:
+ python.lxml_free(c_ns_list)
+ return 0
+
+cdef xmlNs* _searchNsByHref(xmlNode* c_node, const_xmlChar* c_href, bint is_attribute):
+ u"""Search a namespace declaration that covers a node (element or
+ attribute).
+
+ For attributes, try to find a prefixed namespace declaration
+ instead of the default namespaces. This helps in supporting
+ round-trips for attributes on elements with a different namespace.
+ """
+ cdef xmlNs* c_ns
+ cdef xmlNs* c_default_ns = NULL
+ cdef xmlNode* c_element
+ if c_href is NULL or c_node is NULL or c_node.type == tree.XML_ENTITY_REF_NODE:
+ return NULL
+ if tree.xmlStrcmp(c_href, tree.XML_XML_NAMESPACE) == 0:
+ # no special cases here, let libxml2 handle this
+ return tree.xmlSearchNsByHref(c_node.doc, c_node, c_href)
+ if c_node.type == tree.XML_ATTRIBUTE_NODE:
+ is_attribute = 1
+ while c_node is not NULL and c_node.type != tree.XML_ELEMENT_NODE:
+ c_node = c_node.parent
+ c_element = c_node
+ while c_node is not NULL:
+ if c_node.type == tree.XML_ELEMENT_NODE:
+ c_ns = c_node.nsDef
+ while c_ns is not NULL:
+ if c_ns.href is not NULL and tree.xmlStrcmp(c_href, c_ns.href) == 0:
+ if c_ns.prefix is NULL and is_attribute:
+ # for attributes, continue searching a named
+ # prefix, but keep the first default namespace
+ # declaration that we found
+ if c_default_ns is NULL:
+ c_default_ns = c_ns
+ elif tree.xmlSearchNs(
+ c_element.doc, c_element, c_ns.prefix) is c_ns:
+ # start node is in namespace scope => found!
+ return c_ns
+ c_ns = c_ns.next
+ if c_node is not c_element and c_node.ns is not NULL:
+ # optimise: the node may have the namespace itself
+ c_ns = c_node.ns
+ if c_ns.href is not NULL and tree.xmlStrcmp(c_href, c_ns.href) == 0:
+ if c_ns.prefix is NULL and is_attribute:
+ # for attributes, continue searching a named
+ # prefix, but keep the first default namespace
+ # declaration that we found
+ if c_default_ns is NULL:
+ c_default_ns = c_ns
+ elif tree.xmlSearchNs(
+ c_element.doc, c_element, c_ns.prefix) is c_ns:
+ # start node is in namespace scope => found!
+ return c_ns
+ c_node = c_node.parent
+ # nothing found => use a matching default namespace or fail
+ if c_default_ns is not NULL:
+ if tree.xmlSearchNs(c_element.doc, c_element, NULL) is c_default_ns:
+ return c_default_ns
+ return NULL
+
+cdef int _replaceNodeByChildren(_Document doc, xmlNode* c_node) except -1:
+ # NOTE: this does not deallocate the node, just unlink it!
+ cdef xmlNode* c_parent
+ cdef xmlNode* c_child
+ if c_node.children is NULL:
+ tree.xmlUnlinkNode(c_node)
+ return 0
+
+ c_parent = c_node.parent
+ # fix parent links of children
+ c_child = c_node.children
+ while c_child is not NULL:
+ c_child.parent = c_parent
+ c_child = c_child.next
+
+ # fix namespace references of children if their parent's namespace
+ # declarations get lost
+ if c_node.nsDef is not NULL:
+ c_child = c_node.children
+ while c_child is not NULL:
+ moveNodeToDocument(doc, doc._c_doc, c_child)
+ c_child = c_child.next
+
+ # fix sibling links to/from child slice
+ if c_node.prev is NULL:
+ c_parent.children = c_node.children
+ else:
+ c_node.prev.next = c_node.children
+ c_node.children.prev = c_node.prev
+ if c_node.next is NULL:
+ c_parent.last = c_node.last
+ else:
+ c_node.next.prev = c_node.last
+ c_node.last.next = c_node.next
+
+ # unlink c_node
+ c_node.children = c_node.last = NULL
+ c_node.parent = c_node.next = c_node.prev = NULL
+ return 0
+
+cdef object _attributeValue(xmlNode* c_element, xmlAttr* c_attrib_node):
+ c_href = _getNs(<xmlNode*>c_attrib_node)
+ value = tree.xmlGetNsProp(c_element, c_attrib_node.name, c_href)
+ try:
+ result = funicode(value)
+ finally:
+ tree.xmlFree(value)
+ return result
+
+cdef object _attributeValueFromNsName(xmlNode* c_element,
+ const_xmlChar* c_href, const_xmlChar* c_name):
+ c_result = tree.xmlGetNsProp(c_element, c_name, c_href)
+ if c_result is NULL:
+ return None
+ try:
+ result = funicode(c_result)
+ finally:
+ tree.xmlFree(c_result)
+ return result
+
+cdef object _getNodeAttributeValue(xmlNode* c_node, key, default):
+ ns, tag = _getNsTag(key)
+ c_href = <const_xmlChar*>NULL if ns is None else _xcstr(ns)
+ c_result = tree.xmlGetNsProp(c_node, _xcstr(tag), c_href)
+ if c_result is NULL:
+ # XXX free namespace that is not in use..?
+ return default
+ try:
+ result = funicode(c_result)
+ finally:
+ tree.xmlFree(c_result)
+ return result
+
+cdef inline object _getAttributeValue(_Element element, key, default):
+ return _getNodeAttributeValue(element._c_node, key, default)
+
+cdef int _setAttributeValue(_Element element, key, value) except -1:
+ cdef const_xmlChar* c_value
+ cdef xmlNs* c_ns
+ ns, tag = _getNsTag(key)
+ is_html = element._doc._parser._for_html
+ if not is_html:
+ _attributeValidOrRaise(tag)
+ c_tag = _xcstr(tag)
+ if value is None and is_html:
+ c_value = NULL
+ else:
+ if isinstance(value, QName):
+ value = _resolveQNameText(element, value)
+ else:
+ value = _utf8(value)
+ c_value = _xcstr(value)
+ if ns is None:
+ c_ns = NULL
+ else:
+ c_ns = element._doc._findOrBuildNodeNs(element._c_node, _xcstr(ns), NULL, 1)
+ tree.xmlSetNsProp(element._c_node, c_ns, c_tag, c_value)
+ return 0
+
+cdef int _delAttribute(_Element element, key) except -1:
+ ns, tag = _getNsTag(key)
+ c_href = <const_xmlChar*>NULL if ns is None else _xcstr(ns)
+ if _delAttributeFromNsName(element._c_node, c_href, _xcstr(tag)):
+ raise KeyError, key
+ return 0
+
+cdef int _delAttributeFromNsName(xmlNode* c_node, const_xmlChar* c_href, const_xmlChar* c_name):
+ c_attr = tree.xmlHasNsProp(c_node, c_name, c_href)
+ if c_attr is NULL:
+ # XXX free namespace that is not in use..?
+ return -1
+ tree.xmlRemoveProp(c_attr)
+ return 0
+
+cdef list _collectAttributes(xmlNode* c_node, int collecttype):
+ u"""Collect all attributes of a node in a list. Depending on collecttype,
+ it collects either the names (1), the values (2), or name-value tuples (otherwise).
+ """
+ cdef Py_ssize_t count
+ c_attr = c_node.properties
+ count = 0
+ while c_attr is not NULL:
+ if c_attr.type == tree.XML_ATTRIBUTE_NODE:
+ count += 1
+ c_attr = c_attr.next
+
+ if not count:
+ return []
+
+ attributes = [None] * count
+ c_attr = c_node.properties
+ count = 0
+ while c_attr is not NULL:
+ if c_attr.type == tree.XML_ATTRIBUTE_NODE:
+ if collecttype == 1:
+ item = _namespacedName(<xmlNode*>c_attr)
+ elif collecttype == 2:
+ item = _attributeValue(c_node, c_attr)
+ else:
+ item = (_namespacedName(<xmlNode*>c_attr),
+ _attributeValue(c_node, c_attr))
+ attributes[count] = item
+ count += 1
+ c_attr = c_attr.next
+ return attributes
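+
+# A rough illustration (attribute names/values are made-up examples): for an
+# element like <a href="x" id="y"/>, collecttype 1 would yield
+# ['href', 'id'], collecttype 2 would yield ['x', 'y'], and any other value
+# would yield [('href', 'x'), ('id', 'y')]; namespaced attribute names come
+# back in Clark notation, e.g. '{http://ns}attr'.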
+
+cdef object __RE_XML_ENCODING = re.compile(
+ ur'^(<\?xml[^>]+)\s+encoding\s*=\s*["\'][^"\']*["\'](\s*\?>|)', re.U)
+
+cdef object __REPLACE_XML_ENCODING = __RE_XML_ENCODING.sub
+cdef object __HAS_XML_ENCODING = __RE_XML_ENCODING.match
+
+cdef object _stripEncodingDeclaration(object xml_string):
+ # this is a hack to remove the XML encoding declaration from unicode
+ return __REPLACE_XML_ENCODING(ur'\g<1>\g<2>', xml_string)
+
+cdef bint _hasEncodingDeclaration(object xml_string) except -1:
+ # check if a (unicode) string has an XML encoding declaration
+ return __HAS_XML_ENCODING(xml_string) is not None
+
+cdef inline bint _hasText(xmlNode* c_node):
+ return c_node is not NULL and _textNodeOrSkip(c_node.children) is not NULL
+
+cdef inline bint _hasTail(xmlNode* c_node):
+ return c_node is not NULL and _textNodeOrSkip(c_node.next) is not NULL
+
+cdef inline bint _hasNonWhitespaceTail(xmlNode* c_node):
+ return _hasNonWhitespaceText(c_node, tail=True)
+
+cdef bint _hasNonWhitespaceText(xmlNode* c_node, bint tail=False):
+ c_text_node = c_node and _textNodeOrSkip(c_node.next if tail else c_node.children)
+ if c_text_node is NULL:
+ return False
+ while c_text_node is not NULL:
+ if c_text_node.content[0] != c'\0' and not _collectText(c_text_node).isspace():
+ return True
+ c_text_node = _textNodeOrSkip(c_text_node.next)
+ return False
+
+cdef _collectText(xmlNode* c_node):
+ u"""Collect all text nodes and return them as a unicode string.
+
+ Start collecting at c_node.
+
+ If there was no text to collect, return None
+ """
+ cdef Py_ssize_t scount
+ cdef xmlChar* c_text
+ cdef xmlNode* c_node_cur
+ # check for multiple text nodes
+ scount = 0
+ c_text = NULL
+ c_node_cur = c_node = _textNodeOrSkip(c_node)
+ while c_node_cur is not NULL:
+ if c_node_cur.content[0] != c'\0':
+ c_text = c_node_cur.content
+ scount += 1
+ c_node_cur = _textNodeOrSkip(c_node_cur.next)
+
+ # handle two most common cases first
+ if c_text is NULL:
+ return '' if scount > 0 else None
+ if scount == 1:
+ return funicode(c_text)
+
+ # the rest is not performance critical anymore
+ result = b''
+ while c_node is not NULL:
+ result += <unsigned char*>c_node.content
+ c_node = _textNodeOrSkip(c_node.next)
+ return funicode(<const_xmlChar*><unsigned char*>result)
+
+cdef void _removeText(xmlNode* c_node):
+ u"""Remove all text nodes.
+
+ Start removing at c_node.
+ """
+ cdef xmlNode* c_next
+ c_node = _textNodeOrSkip(c_node)
+ while c_node is not NULL:
+ c_next = _textNodeOrSkip(c_node.next)
+ tree.xmlUnlinkNode(c_node)
+ tree.xmlFreeNode(c_node)
+ c_node = c_next
+
+cdef xmlNode* _createTextNode(xmlDoc* doc, value) except NULL:
+ cdef xmlNode* c_text_node
+ if isinstance(value, CDATA):
+ c_text_node = tree.xmlNewCDataBlock(
+ doc, _xcstr((<CDATA>value)._utf8_data),
+ python.PyBytes_GET_SIZE((<CDATA>value)._utf8_data))
+ else:
+ text = _utf8(value)
+ c_text_node = tree.xmlNewDocText(doc, _xcstr(text))
+ if not c_text_node:
+ raise MemoryError()
+ return c_text_node
+
+cdef int _setNodeText(xmlNode* c_node, value) except -1:
+ # remove all text nodes at the start first
+ _removeText(c_node.children)
+ if value is None:
+ return 0
+ # now add new text node with value at start
+ c_text_node = _createTextNode(c_node.doc, value)
+ if c_node.children is NULL:
+ tree.xmlAddChild(c_node, c_text_node)
+ else:
+ tree.xmlAddPrevSibling(c_node.children, c_text_node)
+ return 0
+
+cdef int _setTailText(xmlNode* c_node, value) except -1:
+ # remove any existing tail text nodes first
+ _removeText(c_node.next)
+ if value is None:
+ return 0
+ # now append new text node with value
+ c_text_node = _createTextNode(c_node.doc, value)
+ tree.xmlAddNextSibling(c_node, c_text_node)
+ return 0
+
+cdef bytes _resolveQNameText(_Element element, value):
+ cdef xmlNs* c_ns
+ ns, tag = _getNsTag(value)
+ if ns is None:
+ return tag
+ else:
+ c_ns = element._doc._findOrBuildNodeNs(
+ element._c_node, _xcstr(ns), NULL, 0)
+ return python.PyBytes_FromFormat('%s:%s', c_ns.prefix, _cstr(tag))
+
+cdef inline bint _hasChild(xmlNode* c_node):
+ return c_node is not NULL and _findChildForwards(c_node, 0) is not NULL
+
+cdef inline Py_ssize_t _countElements(xmlNode* c_node):
+ u"Counts the elements within the following siblings and the node itself."
+ cdef Py_ssize_t count
+ count = 0
+ while c_node is not NULL:
+ if _isElement(c_node):
+ count += 1
+ c_node = c_node.next
+ return count
+
+cdef int _findChildSlice(
+ slice sliceobject, xmlNode* c_parent,
+ xmlNode** c_start_node, Py_ssize_t* c_step, Py_ssize_t* c_length) except -1:
+ u"""Resolve a children slice.
+
+ Returns the start node, step size and the slice length in the
+ pointer arguments.
+ """
+ cdef Py_ssize_t start = 0, stop = 0, childcount
+ childcount = _countElements(c_parent.children)
+ if childcount == 0:
+ c_start_node[0] = NULL
+ c_length[0] = 0
+ if sliceobject.step is None:
+ c_step[0] = 1
+ else:
+ python._PyEval_SliceIndex(sliceobject.step, c_step)
+ return 0
+ python.PySlice_GetIndicesEx(
+ sliceobject, childcount, &start, &stop, c_step, c_length)
+ if start > childcount / 2:
+ c_start_node[0] = _findChildBackwards(c_parent, childcount - start - 1)
+ else:
+ c_start_node[0] = _findChild(c_parent, start)
+ return 0
+
+cdef bint _isFullSlice(slice sliceobject) except -1:
+ u"""Conservative guess if this slice is a full slice as in ``s[:]``.
+ """
+ cdef Py_ssize_t step = 0
+ if sliceobject is None:
+ return 0
+ if sliceobject.start is None and \
+ sliceobject.stop is None:
+ if sliceobject.step is None:
+ return 1
+ python._PyEval_SliceIndex(sliceobject.step, &step)
+ if step == 1:
+ return 1
+ return 0
+ return 0
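+
+# A rough illustration: slices written as s[:], s[::] or s[::1] count as
+# full slices here (start and stop unset, step unset or 1), while slices
+# like s[1:], s[:-1] or s[::2] do not.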
+
+cdef _collectChildren(_Element element):
+ cdef xmlNode* c_node
+ cdef list result = []
+ c_node = element._c_node.children
+ if c_node is not NULL:
+ if not _isElement(c_node):
+ c_node = _nextElement(c_node)
+ while c_node is not NULL:
+ result.append(_elementFactory(element._doc, c_node))
+ c_node = _nextElement(c_node)
+ return result
+
+cdef inline xmlNode* _findChild(xmlNode* c_node, Py_ssize_t index):
+ if index < 0:
+ return _findChildBackwards(c_node, -index - 1)
+ else:
+ return _findChildForwards(c_node, index)
+
+cdef inline xmlNode* _findChildForwards(xmlNode* c_node, Py_ssize_t index):
+ u"""Return child element of c_node with index, or return NULL if not found.
+ """
+ cdef xmlNode* c_child
+ cdef Py_ssize_t c
+ c_child = c_node.children
+ c = 0
+ while c_child is not NULL:
+ if _isElement(c_child):
+ if c == index:
+ return c_child
+ c += 1
+ c_child = c_child.next
+ return NULL
+
+cdef inline xmlNode* _findChildBackwards(xmlNode* c_node, Py_ssize_t index):
+ u"""Return child element of c_node with index, or return NULL if not found.
+ Search from the end.
+ """
+ cdef xmlNode* c_child
+ cdef Py_ssize_t c
+ c_child = c_node.last
+ c = 0
+ while c_child is not NULL:
+ if _isElement(c_child):
+ if c == index:
+ return c_child
+ c += 1
+ c_child = c_child.prev
+ return NULL
+
+cdef inline xmlNode* _textNodeOrSkip(xmlNode* c_node) nogil:
+ u"""Return the node if it's a text node. Skip over ignorable nodes in a
+ series of text nodes. Return NULL if a non-ignorable node is found.
+
+ This is used to skip over XInclude nodes when collecting adjacent text
+ nodes.
+ """
+ while c_node is not NULL:
+ if c_node.type == tree.XML_TEXT_NODE or \
+ c_node.type == tree.XML_CDATA_SECTION_NODE:
+ return c_node
+ elif c_node.type == tree.XML_XINCLUDE_START or \
+ c_node.type == tree.XML_XINCLUDE_END:
+ c_node = c_node.next
+ else:
+ return NULL
+ return NULL
+
+cdef inline xmlNode* _nextElement(xmlNode* c_node):
+ u"""Given a node, find the next sibling that is an element.
+ """
+ if c_node is NULL:
+ return NULL
+ c_node = c_node.next
+ while c_node is not NULL:
+ if _isElement(c_node):
+ return c_node
+ c_node = c_node.next
+ return NULL
+
+cdef inline xmlNode* _previousElement(xmlNode* c_node):
+ u"""Given a node, find the next sibling that is an element.
+ """
+ if c_node is NULL:
+ return NULL
+ c_node = c_node.prev
+ while c_node is not NULL:
+ if _isElement(c_node):
+ return c_node
+ c_node = c_node.prev
+ return NULL
+
+cdef inline xmlNode* _parentElement(xmlNode* c_node):
+ u"Given a node, find the parent element."
+ if c_node is NULL or not _isElement(c_node):
+ return NULL
+ c_node = c_node.parent
+ if c_node is NULL or not _isElement(c_node):
+ return NULL
+ return c_node
+
+cdef inline bint _tagMatches(xmlNode* c_node, const_xmlChar* c_href, const_xmlChar* c_name):
+ u"""Tests if the node matches namespace URI and tag name.
+
+ A node matches if it matches both c_href and c_name.
+
+ A node matches c_href if any of the following is true:
+ * c_href is NULL
+ * its namespace is NULL and c_href is the empty string
+ * its namespace string equals the c_href string
+
+ A node matches c_name if any of the following is true:
+ * c_name is NULL
+ * its name string equals the c_name string
+ """
+ if c_node is NULL:
+ return 0
+ if c_node.type != tree.XML_ELEMENT_NODE:
+ # not an element, only succeed if we match everything
+ return c_name is NULL and c_href is NULL
+ if c_name is NULL:
+ if c_href is NULL:
+ # always match
+ return 1
+ else:
+ c_node_href = _getNs(c_node)
+ if c_node_href is NULL:
+ return c_href[0] == c'\0'
+ else:
+ return tree.xmlStrcmp(c_node_href, c_href) == 0
+ elif c_href is NULL:
+ if _getNs(c_node) is not NULL:
+ return 0
+ return c_node.name == c_name or tree.xmlStrcmp(c_node.name, c_name) == 0
+ elif c_node.name == c_name or tree.xmlStrcmp(c_node.name, c_name) == 0:
+ c_node_href = _getNs(c_node)
+ if c_node_href is NULL:
+ return c_href[0] == c'\0'
+ else:
+ return tree.xmlStrcmp(c_node_href, c_href) == 0
+ else:
+ return 0
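+
+# A rough illustration of the matching rules above (made-up example values):
+# c_href=NULL, c_name=NULL matches every element; c_href=b'' restricts the
+# match to elements without a namespace; and c_href=b'http://ns',
+# c_name=b'tag' matches only <tag> elements bound to that namespace URI.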
+
+cdef inline bint _tagMatchesExactly(xmlNode* c_node, qname* c_qname):
+ u"""Tests if the node matches namespace URI and tag name.
+
+ This differs from _tagMatches() in that it does not consider a
+ NULL value in qname.href a wildcard, and that it expects the c_name
+ to be taken from the doc dict, i.e. it only compares the names by
+ address.
+
+ A node matches if it matches both href and c_name of the qname.
+
+ A node matches c_href if any of the following is true:
+ * its namespace is NULL and c_href is the empty string
+ * its namespace string equals the c_href string
+
+ A node matches c_name if any of the following is true:
+ * c_name is NULL
+ * its name string points to the same address (!) as c_name
+ """
+ return _nsTagMatchesExactly(_getNs(c_node), c_node.name, c_qname)
+
+cdef inline bint _nsTagMatchesExactly(const_xmlChar* c_node_href,
+ const_xmlChar* c_node_name,
+ qname* c_qname):
+ u"""Tests if name and namespace URI match those of c_qname.
+
+ This differs from _tagMatches() in that it does not consider a
+ NULL value in qname.href a wildcard, and that it expects the c_name
+ to be taken from the doc dict, i.e. it only compares the names by
+ address.
+
+ A node matches if it matches both href and c_name of the qname.
+
+ A node matches c_href if any of the following is true:
+ * its namespace is NULL and c_href is the empty string
+ * its namespace string equals the c_href string
+
+ A node matches c_name if any of the following is true:
+ * c_name is NULL
+ * its name string points to the same address (!) as c_name
+ """
+ cdef char* c_href
+ if c_qname.c_name is not NULL and c_qname.c_name is not c_node_name:
+ return 0
+ if c_qname.href is NULL:
+ return 1
+ c_href = python.__cstr(c_qname.href)
+ if c_href[0] == '\0':
+ return c_node_href is NULL or c_node_href[0] == '\0'
+ elif c_node_href is NULL:
+ return 0
+ else:
+ return tree.xmlStrcmp(<const_xmlChar*>c_href, c_node_href) == 0
+
+cdef Py_ssize_t _mapTagsToQnameMatchArray(xmlDoc* c_doc, list ns_tags,
+ qname* c_ns_tags, bint force_into_dict) except -1:
+ u"""Map a sequence of (name, namespace) pairs to a qname array for efficient
+ matching with _tagMatchesExactly() above.
+
+ Note that each qname struct in the array owns its href byte string object
+ if it is not NULL.
+ """
+ cdef Py_ssize_t count = 0, i
+ cdef bytes ns, tag
+ for ns, tag in ns_tags:
+ if tag is None:
+ c_tag = <const_xmlChar*>NULL
+ elif force_into_dict:
+ c_tag = tree.xmlDictLookup(c_doc.dict, _xcstr(tag), len(tag))
+ if c_tag is NULL:
+ # clean up before raising the error
+ for i in xrange(count):
+ cpython.ref.Py_XDECREF(c_ns_tags[i].href)
+ raise MemoryError()
+ else:
+ c_tag = tree.xmlDictExists(c_doc.dict, _xcstr(tag), len(tag))
+ if c_tag is NULL:
+ # not in the dict => not in the document
+ continue
+ c_ns_tags[count].c_name = c_tag
+ if ns is None:
+ c_ns_tags[count].href = NULL
+ else:
+ cpython.ref.Py_INCREF(ns) # keep an owned reference!
+ c_ns_tags[count].href = <python.PyObject*>ns
+ count += 1
+ return count
+
+cdef int _removeNode(_Document doc, xmlNode* c_node) except -1:
+ u"""Unlink and free a node and subnodes if possible. Otherwise, make sure
+ it's self-contained.
+ """
+ cdef xmlNode* c_next
+ c_next = c_node.next
+ tree.xmlUnlinkNode(c_node)
+ _moveTail(c_next, c_node)
+ if not attemptDeallocation(c_node):
+ # make namespaces absolute
+ moveNodeToDocument(doc, c_node.doc, c_node)
+ return 0
+
+cdef int _removeSiblings(xmlNode* c_element, tree.xmlElementType node_type, bint with_tail) except -1:
+ cdef xmlNode* c_node
+ cdef xmlNode* c_next
+ c_node = c_element.next
+ while c_node is not NULL:
+ c_next = _nextElement(c_node)
+ if c_node.type == node_type:
+ if with_tail:
+ _removeText(c_node.next)
+ tree.xmlUnlinkNode(c_node)
+ attemptDeallocation(c_node)
+ c_node = c_next
+ c_node = c_element.prev
+ while c_node is not NULL:
+ c_next = _previousElement(c_node)
+ if c_node.type == node_type:
+ if with_tail:
+ _removeText(c_node.next)
+ tree.xmlUnlinkNode(c_node)
+ attemptDeallocation(c_node)
+ c_node = c_next
+ return 0
+
+cdef void _moveTail(xmlNode* c_tail, xmlNode* c_target):
+ cdef xmlNode* c_next
+ # tail support: look for any text nodes trailing this node and
+ # move them too
+ c_tail = _textNodeOrSkip(c_tail)
+ while c_tail is not NULL:
+ c_next = _textNodeOrSkip(c_tail.next)
+ c_target = tree.xmlAddNextSibling(c_target, c_tail)
+ c_tail = c_next
+
+cdef int _copyTail(xmlNode* c_tail, xmlNode* c_target) except -1:
+ cdef xmlNode* c_new_tail
+ # tail copying support: look for any text nodes trailing this node and
+ # copy them to the target node
+ c_tail = _textNodeOrSkip(c_tail)
+ while c_tail is not NULL:
+ if c_target.doc is not c_tail.doc:
+ c_new_tail = tree.xmlDocCopyNode(c_tail, c_target.doc, 0)
+ else:
+ c_new_tail = tree.xmlCopyNode(c_tail, 0)
+ if c_new_tail is NULL:
+ raise MemoryError()
+ c_target = tree.xmlAddNextSibling(c_target, c_new_tail)
+ c_tail = _textNodeOrSkip(c_tail.next)
+ return 0
+
+cdef int _copyNonElementSiblings(xmlNode* c_node, xmlNode* c_target) except -1:
+ cdef xmlNode* c_copy
+ cdef xmlNode* c_sibling = c_node
+ while c_sibling.prev != NULL and \
+ (c_sibling.prev.type == tree.XML_PI_NODE or
+ c_sibling.prev.type == tree.XML_COMMENT_NODE or
+ c_sibling.prev.type == tree.XML_DTD_NODE):
+ c_sibling = c_sibling.prev
+ while c_sibling != c_node:
+ if c_sibling.type == tree.XML_DTD_NODE:
+ c_copy = <xmlNode*>_copyDtd(<tree.xmlDtd*>c_sibling)
+ if c_sibling == <xmlNode*>c_node.doc.intSubset:
+ c_target.doc.intSubset = <tree.xmlDtd*>c_copy
+ else: # c_sibling == c_node.doc.extSubset
+ c_target.doc.extSubset = <tree.xmlDtd*>c_copy
+ else:
+ c_copy = tree.xmlDocCopyNode(c_sibling, c_target.doc, 1)
+ if c_copy is NULL:
+ raise MemoryError()
+ tree.xmlAddPrevSibling(c_target, c_copy)
+ c_sibling = c_sibling.next
+ while c_sibling.next != NULL and \
+ (c_sibling.next.type == tree.XML_PI_NODE or
+ c_sibling.next.type == tree.XML_COMMENT_NODE):
+ c_sibling = c_sibling.next
+ c_copy = tree.xmlDocCopyNode(c_sibling, c_target.doc, 1)
+ if c_copy is NULL:
+ raise MemoryError()
+ tree.xmlAddNextSibling(c_target, c_copy)
+
+cdef int _deleteSlice(_Document doc, xmlNode* c_node,
+ Py_ssize_t count, Py_ssize_t step) except -1:
+ u"""Delete slice, ``count`` items starting with ``c_node`` with a step
+ width of ``step``.
+ """
+ cdef xmlNode* c_next
+ cdef Py_ssize_t c, i
+ cdef _node_to_node_function next_element
+ if c_node is NULL:
+ return 0
+ if step > 0:
+ next_element = _nextElement
+ else:
+ step = -step
+ next_element = _previousElement
+ # now start deleting nodes
+ c = 0
+ c_next = c_node
+ while c_node is not NULL and c < count:
+ for i in range(step):
+ c_next = next_element(c_next)
+ if c_next is NULL:
+ break
+ _removeNode(doc, c_node)
+ c += 1
+ c_node = c_next
+ return 0
+
+cdef int _replaceSlice(_Element parent, xmlNode* c_node,
+ Py_ssize_t slicelength, Py_ssize_t step,
+ bint left_to_right, elements) except -1:
+ u"""Replace the slice of ``count`` elements starting at ``c_node`` with
+ positive step width ``step`` by the Elements in ``elements``. The
+ direction is given by the boolean argument ``left_to_right``.
+
+ ``c_node`` may be NULL to indicate the end of the children list.
+ """
+ cdef xmlNode* c_orig_neighbour
+ cdef xmlNode* c_next
+ cdef xmlDoc* c_source_doc
+ cdef _Element element
+ cdef Py_ssize_t seqlength, i, c
+ cdef _node_to_node_function next_element
+ assert step > 0
+ if left_to_right:
+ next_element = _nextElement
+ else:
+ next_element = _previousElement
+
+ if not isinstance(elements, (list, tuple)):
+ elements = list(elements)
+
+ if step != 1 or not left_to_right:
+ # *replacing* children stepwise with list => check size!
+ seqlength = len(elements)
+ if seqlength != slicelength:
+ raise ValueError, f"attempt to assign sequence of size {seqlength} " \
+ f"to extended slice of size {slicelength}"
+
+ if c_node is NULL:
+ # no children yet => add all elements straight away
+ if left_to_right:
+ for element in elements:
+ assert element is not None, u"Node must not be None"
+ _appendChild(parent, element)
+ else:
+ for element in elements:
+ assert element is not None, u"Node must not be None"
+ _prependChild(parent, element)
+ return 0
+
+ # remove the elements first as some might be re-added
+ if left_to_right:
+ # L->R, remember left neighbour
+ c_orig_neighbour = _previousElement(c_node)
+ else:
+ # R->L, remember right neighbour
+ c_orig_neighbour = _nextElement(c_node)
+
+ # We remove the original slice elements one by one. Since we hold
+ # a Python reference to all elements that we will insert, it is
+ # safe to let _removeNode() try (and fail) to free them even if
+ # the element itself or one of its descendants will be reinserted.
+ c = 0
+ c_next = c_node
+ while c_node is not NULL and c < slicelength:
+ for i in range(step):
+ c_next = next_element(c_next)
+ if c_next is NULL:
+ break
+ _removeNode(parent._doc, c_node)
+ c += 1
+ c_node = c_next
+
+ # make sure each element is inserted only once
+ elements = iter(elements)
+
+ # find the first node right of the new insertion point
+ if left_to_right:
+ if c_orig_neighbour is not NULL:
+ c_node = next_element(c_orig_neighbour)
+ else:
+ # before the first element
+ c_node = _findChildForwards(parent._c_node, 0)
+ elif c_orig_neighbour is NULL:
+ # at the end, but reversed stepping
+ # append one element and go to the next insertion point
+ for element in elements:
+ assert element is not None, u"Node must not be None"
+ _appendChild(parent, element)
+ c_node = element._c_node
+ if slicelength > 0:
+ slicelength -= 1
+ for i in range(1, step):
+ c_node = next_element(c_node)
+ if c_node is NULL:
+ break
+ break
+ else:
+ c_node = c_orig_neighbour
+
+ if left_to_right:
+ # adjust step size after removing slice as we are not stepping
+ # over the newly inserted elements
+ step -= 1
+
+ # now insert elements where we removed them
+ if c_node is not NULL:
+ for element in elements:
+ assert element is not None, u"Node must not be None"
+ _assertValidNode(element)
+ # move element and tail over
+ c_source_doc = element._c_node.doc
+ c_next = element._c_node.next
+ tree.xmlAddPrevSibling(c_node, element._c_node)
+ _moveTail(c_next, element._c_node)
+
+ # integrate element into new document
+ moveNodeToDocument(parent._doc, c_source_doc, element._c_node)
+
+ # stop at the end of the slice
+ if slicelength > 0:
+ slicelength -= 1
+ for i in range(step):
+ c_node = next_element(c_node)
+ if c_node is NULL:
+ break
+ if c_node is NULL:
+ break
+ else:
+ # everything inserted
+ return 0
+
+ # append the remaining elements at the respective end
+ if left_to_right:
+ for element in elements:
+ assert element is not None, u"Node must not be None"
+ _assertValidNode(element)
+ _appendChild(parent, element)
+ else:
+ for element in elements:
+ assert element is not None, u"Node must not be None"
+ _assertValidNode(element)
+ _prependChild(parent, element)
+
+ return 0
+
+
+cdef int _linkChild(xmlNode* c_parent, xmlNode* c_node) except -1:
+ """Adaptation of 'xmlAddChild()' that deep-fix the document links iteratively.
+ """
+ assert _isElement(c_node)
+ c_node.parent = c_parent
+ if c_parent.children is NULL:
+ c_parent.children = c_parent.last = c_node
+ else:
+ c_node.prev = c_parent.last
+ c_parent.last.next = c_node
+ c_parent.last = c_node
+
+ _setTreeDoc(c_node, c_parent.doc)
+ return 0
+
+
+cdef int _appendChild(_Element parent, _Element child) except -1:
+ u"""Append a new child to a parent element.
+ """
+ c_node = child._c_node
+ c_source_doc = c_node.doc
+ # prevent cycles
+ if _isAncestorOrSame(c_node, parent._c_node):
+ raise ValueError("cannot append parent to itself")
+ # store possible text node
+ c_next = c_node.next
+ # move node itself
+ tree.xmlUnlinkNode(c_node)
+ # do not call xmlAddChild() here since it would deep-traverse the tree
+ _linkChild(parent._c_node, c_node)
+ _moveTail(c_next, c_node)
+ # uh oh, elements may be pointing to different doc when
+ # parent element has moved; change them too..
+ moveNodeToDocument(parent._doc, c_source_doc, c_node)
+ return 0
+
+cdef int _prependChild(_Element parent, _Element child) except -1:
+ u"""Prepend a new child to a parent element.
+ """
+ c_node = child._c_node
+ c_source_doc = c_node.doc
+ # prevent cycles
+ if _isAncestorOrSame(c_node, parent._c_node):
+ raise ValueError("cannot append parent to itself")
+ # store possible text node
+ c_next = c_node.next
+ # move node itself
+ c_child = _findChildForwards(parent._c_node, 0)
+ if c_child is NULL:
+ tree.xmlUnlinkNode(c_node)
+ # do not call xmlAddChild() here since it would deep-traverse the tree
+ _linkChild(parent._c_node, c_node)
+ else:
+ tree.xmlAddPrevSibling(c_child, c_node)
+ _moveTail(c_next, c_node)
+ # uh oh, elements may be pointing to different doc when
+ # parent element has moved; change them too..
+ moveNodeToDocument(parent._doc, c_source_doc, c_node)
+ return 0
+
+cdef int _appendSibling(_Element element, _Element sibling) except -1:
+ u"""Add a new sibling behind an element.
+ """
+ return _addSibling(element, sibling, as_next=True)
+
+cdef int _prependSibling(_Element element, _Element sibling) except -1:
+ u"""Add a new sibling before an element.
+ """
+ return _addSibling(element, sibling, as_next=False)
+
+cdef int _addSibling(_Element element, _Element sibling, bint as_next) except -1:
+ c_node = sibling._c_node
+ c_source_doc = c_node.doc
+ # prevent cycles
+ if _isAncestorOrSame(c_node, element._c_node):
+ if element._c_node is c_node:
+ return 0 # nothing to do
+ raise ValueError("cannot add ancestor as sibling, please break cycle first")
+ # store possible text node
+ c_next = c_node.next
+ # move node itself
+ if as_next:
+ tree.xmlAddNextSibling(element._c_node, c_node)
+ else:
+ tree.xmlAddPrevSibling(element._c_node, c_node)
+ _moveTail(c_next, c_node)
+ # uh oh, elements may be pointing to different doc when
+ # parent element has moved; change them too..
+ moveNodeToDocument(element._doc, c_source_doc, c_node)
+ return 0
+
+cdef inline bint isutf8(const_xmlChar* s):
+ cdef xmlChar c = s[0]
+ while c != c'\0':
+ if c & 0x80:
+ return True
+ s += 1
+ c = s[0]
+ return False
+
+cdef bint isutf8l(const_xmlChar* s, size_t length):
+ """
+ Search for non-ASCII characters in the string, knowing its length in advance.
+ """
+ cdef unsigned int i
+ cdef unsigned long non_ascii_mask
+ cdef const unsigned long *lptr = <const unsigned long*> s
+
+ cdef const unsigned long *end = lptr + length // sizeof(unsigned long)
+ if length >= sizeof(non_ascii_mask):
+ # Build constant 0x80808080... mask (and let the C compiler fold it).
+ non_ascii_mask = 0
+ for i in range(sizeof(non_ascii_mask) // 2):
+ non_ascii_mask = (non_ascii_mask << 16) | 0x8080
+
+ # Advance to long-aligned character before we start reading longs.
+ while (<size_t>s) % sizeof(unsigned long) and s < <const_xmlChar *>end:
+ if s[0] & 0x80:
+ return True
+ s += 1
+
+ # Read one long at a time
+ lptr = <const unsigned long*> s
+ while lptr < end:
+ if lptr[0] & non_ascii_mask:
+ return True
+ lptr += 1
+ s = <const_xmlChar *>lptr
+
+ while s < (<const_xmlChar *>end + length % sizeof(unsigned long)):
+ if s[0] & 0x80:
+ return True
+ s += 1
+
+ return False
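+
+# A rough illustration of the word-at-a-time check above: with 8-byte longs
+# the mask folds to 0x8080808080808080, so a long read from purely ASCII
+# bytes ANDs to zero, while any UTF-8 lead or continuation byte (high bit
+# set) leaves a non-zero result.  In plain Python terms (made-up example):
+#   int.from_bytes(b'hello wo', 'little') & 0x8080808080808080 == 0
+#   int.from_bytes('héllo wo'.encode('utf8')[:8], 'little') & 0x8080808080808080 != 0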
+
+cdef int _is_valid_xml_ascii(bytes pystring):
+ """Check if a string is XML ascii content."""
+ cdef signed char ch
+ # When ch is a *signed* char, non-ascii characters are negative integers
+ # and xmlIsChar_ch does not accept them.
+ for ch in pystring:
+ if not tree.xmlIsChar_ch(ch):
+ return 0
+ return 1
+
+cdef bint _is_valid_xml_utf8(bytes pystring):
+ u"""Check if a string is like valid UTF-8 XML content."""
+ cdef const_xmlChar* s = _xcstr(pystring)
+ cdef const_xmlChar* c_end = s + len(pystring)
+ cdef unsigned long next3 = 0
+ if s < c_end - 2:
+ next3 = (s[0] << 8) | (s[1])
+
+ while s < c_end - 2:
+ next3 = 0x00ffffff & ((next3 << 8) | s[2])
+ if s[0] & 0x80:
+ # 0xefbfbe and 0xefbfbf are utf-8 encodings of
+ # forbidden characters \ufffe and \uffff
+ if next3 == 0x00efbfbe or next3 == 0x00efbfbf:
+ return 0
+ # 0xeda080 and 0xedbfbf are utf-8 encodings of
+ # \ud800 and \udfff. Anything between them (inclusive)
+ # is forbidden, because they are surrogate blocks in utf-16.
+ if 0x00eda080 <= next3 <= 0x00edbfbf:
+ return 0
+ elif not tree.xmlIsChar_ch(s[0]):
+ return 0 # invalid ascii char
+ s += 1
+
+ while s < c_end:
+ if not s[0] & 0x80 and not tree.xmlIsChar_ch(s[0]):
+ return 0 # invalid ascii char
+ s += 1
+
+ return 1
+
+cdef inline object funicodeOrNone(const_xmlChar* s):
+ return funicode(s) if s is not NULL else None
+
+cdef inline object funicodeOrEmpty(const_xmlChar* s):
+ return funicode(s) if s is not NULL else ''
+
+cdef object funicode(const_xmlChar* s):
+ cdef Py_ssize_t slen
+ cdef const_xmlChar* spos
+ cdef bint is_non_ascii
+ if python.LXML_UNICODE_STRINGS:
+ return s.decode('UTF-8')
+ spos = s
+ is_non_ascii = 0
+ while spos[0] != c'\0':
+ if spos[0] & 0x80:
+ is_non_ascii = 1
+ break
+ spos += 1
+ slen = spos - s
+ if spos[0] != c'\0':
+ slen += cstring_h.strlen(<const char*> spos)
+ if is_non_ascii:
+ return s[:slen].decode('UTF-8')
+ return <bytes>s[:slen]
+
+cdef bytes _utf8(object s):
+ """Test if a string is valid user input and encode it to UTF-8.
+ Reject all bytes/unicode input that contains non-XML characters.
+ Reject all bytes input that contains non-ASCII characters.
+ """
+ cdef int valid
+ cdef bytes utf8_string
+ if python.IS_PYTHON2 and type(s) is bytes:
+ utf8_string = <bytes>s
+ valid = _is_valid_xml_ascii(utf8_string)
+ elif isinstance(s, unicode):
+ utf8_string = (<unicode>s).encode('utf8')
+ valid = _is_valid_xml_utf8(utf8_string)
+ elif isinstance(s, (bytes, bytearray)):
+ utf8_string = bytes(s)
+ valid = _is_valid_xml_ascii(utf8_string)
+ else:
+ raise TypeError("Argument must be bytes or unicode, got '%.200s'" % type(s).__name__)
+ if not valid:
+ raise ValueError(
+ "All strings must be XML compatible: Unicode or ASCII, no NULL bytes or control characters")
+ return utf8_string
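+
+# A rough sketch of the validation rules above (made-up example values):
+#   _utf8(u'abc')       -> b'abc'
+#   _utf8(u'\xe9')      -> b'\xc3\xa9'        (text input may be non-ASCII)
+#   _utf8(b'\xc3\xa9')  raises ValueError     (bytes input must be ASCII)
+#   _utf8(u'a\x00b')    raises ValueError     (NULL bytes/control characters)
+#   _utf8(42)           raises TypeError      (not a string type)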
+
+
+cdef bytes _utf8orNone(object s):
+ return _utf8(s) if s is not None else None
+
+
+cdef strrepr(s):
+ """Build a representation of strings which we can use in __repr__
+ methods, e.g. _Element.__repr__().
+ """
+ return s.encode('unicode-escape') if python.IS_PYTHON2 else s
+
+
+cdef enum:
+ NO_FILE_PATH = 0
+ ABS_UNIX_FILE_PATH = 1
+ ABS_WIN_FILE_PATH = 2
+ REL_FILE_PATH = 3
+
+
+cdef bint _isFilePath(const_xmlChar* c_path):
+ u"simple heuristic to see if a path is a filename"
+ cdef xmlChar c
+ # test if it looks like an absolute Unix path or a Windows network path
+ if c_path[0] == c'/':
+ return ABS_UNIX_FILE_PATH
+
+ # test if it looks like an absolute Windows path or URL
+ if c'a' <= c_path[0] <= c'z' or c'A' <= c_path[0] <= c'Z':
+ c_path += 1
+ if c_path[0] == c':' and c_path[1] in b'\0\\':
+ return ABS_WIN_FILE_PATH # C: or C:\...
+
+ # test if it looks like a URL with scheme://
+ while c'a' <= c_path[0] <= c'z' or c'A' <= c_path[0] <= c'Z':
+ c_path += 1
+ if c_path[0] == c':' and c_path[1] == c'/' and c_path[2] == c'/':
+ return NO_FILE_PATH
+
+ # assume it's a relative path
+ return REL_FILE_PATH
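+
+# A rough illustration of the heuristic (made-up example paths):
+#   '/etc/catalog.xml'           -> ABS_UNIX_FILE_PATH
+#   'C:\data\doc.xml'            -> ABS_WIN_FILE_PATH
+#   'http://example.com/doc.xml' -> NO_FILE_PATH   (URL with a scheme)
+#   'data/doc.xml'               -> REL_FILE_PATH  (fallback assumption)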
+
+cdef object _encodeFilename(object filename):
+ u"""Make sure a filename is 8-bit encoded (or None).
+ """
+ if filename is None:
+ return None
+ elif isinstance(filename, bytes):
+ return filename
+ elif isinstance(filename, unicode):
+ filename8 = (<unicode>filename).encode('utf8')
+ if _isFilePath(<unsigned char*>filename8):
+ try:
+ return python.PyUnicode_AsEncodedString(
+ filename, _C_FILENAME_ENCODING, NULL)
+ except UnicodeEncodeError:
+ pass
+ return filename8
+ else:
+ raise TypeError("Argument must be string or unicode.")
+
+cdef object _decodeFilename(const_xmlChar* c_path):
+ u"""Make the filename a unicode string if we are in Py3.
+ """
+ return _decodeFilenameWithLength(c_path, tree.xmlStrlen(c_path))
+
+cdef object _decodeFilenameWithLength(const_xmlChar* c_path, size_t c_len):
+ u"""Make the filename a unicode string if we are in Py3.
+ """
+ if _isFilePath(c_path):
+ try:
+ return python.PyUnicode_Decode(
+ <const_char*>c_path, c_len, _C_FILENAME_ENCODING, NULL)
+ except UnicodeDecodeError:
+ pass
+ try:
+ return (<unsigned char*>c_path)[:c_len].decode('UTF-8')
+ except UnicodeDecodeError:
+ # this is a stupid fallback, but it might still work...
+ return (<unsigned char*>c_path)[:c_len].decode('latin-1', 'replace')
+
+cdef object _encodeFilenameUTF8(object filename):
+ u"""Recode filename as UTF-8. Tries ASCII, local filesystem encoding and
+ UTF-8 as source encoding.
+ """
+ cdef char* c_filename
+ if filename is None:
+ return None
+ elif isinstance(filename, bytes):
+ if not isutf8l(<bytes>filename, len(<bytes>filename)):
+ # plain ASCII!
+ return filename
+ c_filename = _cstr(<bytes>filename)
+ try:
+ # try to decode with default encoding
+ filename = python.PyUnicode_Decode(
+ c_filename, len(<bytes>filename),
+ _C_FILENAME_ENCODING, NULL)
+ except UnicodeDecodeError as decode_exc:
+ try:
+ # try if it's proper UTF-8
+ (<bytes>filename).decode('utf8')
+ return filename
+ except UnicodeDecodeError:
+ raise decode_exc # otherwise re-raise original exception
+ if isinstance(filename, unicode):
+ return (<unicode>filename).encode('utf8')
+ else:
+ raise TypeError("Argument must be string or unicode.")
+
+cdef tuple _getNsTag(tag):
+ u"""Given a tag, find namespace URI and tag name.
+ Return None as the namespace URI if no namespace URI was provided.
+ """
+ return __getNsTag(tag, 0)
+
+cdef tuple _getNsTagWithEmptyNs(tag):
+ u"""Given a tag, find namespace URI and tag name. Return None for NS uri
+ if no namespace URI provided, or the empty string if namespace
+ part is '{}'.
+ """
+ return __getNsTag(tag, 1)
+
+cdef tuple __getNsTag(tag, bint empty_ns):
+ cdef char* c_tag
+ cdef char* c_ns_end
+ cdef Py_ssize_t taglen
+ cdef Py_ssize_t nslen
+ cdef bytes ns = None
+ # _isString() is much faster than isinstance()
+ if not _isString(tag) and isinstance(tag, QName):
+ tag = (<QName>tag).text
+ tag = _utf8(tag)
+ c_tag = _cstr(tag)
+ if c_tag[0] == c'{':
+ c_tag += 1
+ c_ns_end = cstring_h.strchr(c_tag, c'}')
+ if c_ns_end is NULL:
+ raise ValueError, u"Invalid tag name"
+ nslen = c_ns_end - c_tag
+ taglen = python.PyBytes_GET_SIZE(tag) - nslen - 2
+ if taglen == 0:
+ raise ValueError, u"Empty tag name"
+ if nslen > 0:
+ ns = <bytes>c_tag[:nslen]
+ elif empty_ns:
+ ns = b''
+ tag = <bytes>c_ns_end[1:taglen+1]
+ elif python.PyBytes_GET_SIZE(tag) == 0:
+ raise ValueError, u"Empty tag name"
+ return ns, tag
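+
+# A rough illustration of the Clark notation parsing above (made-up URIs):
+#   _getNsTag('{http://ns}tag')    -> (b'http://ns', b'tag')
+#   _getNsTag('tag')               -> (None, b'tag')
+#   _getNsTag('{}tag')             -> (None, b'tag')
+#   _getNsTagWithEmptyNs('{}tag')  -> (b'', b'tag')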
+
+cdef inline int _pyXmlNameIsValid(name_utf8):
+ return _xmlNameIsValid(_xcstr(name_utf8)) and b':' not in name_utf8
+
+cdef inline int _pyHtmlNameIsValid(name_utf8):
+ return _htmlNameIsValid(_xcstr(name_utf8))
+
+cdef inline int _xmlNameIsValid(const_xmlChar* c_name):
+ return tree.xmlValidateNameValue(c_name)
+
+cdef int _htmlNameIsValid(const_xmlChar* c_name):
+ if c_name is NULL or c_name[0] == c'\0':
+ return 0
+ while c_name[0] != c'\0':
+ if c_name[0] in b'&<>/"\'\t\n\x0B\x0C\r ':
+ return 0
+ c_name += 1
+ return 1
+
+cdef bint _characterReferenceIsValid(const_xmlChar* c_name):
+ cdef bint is_hex
+ if c_name[0] == c'x':
+ c_name += 1
+ is_hex = 1
+ else:
+ is_hex = 0
+ if c_name[0] == c'\0':
+ return 0
+ while c_name[0] != c'\0':
+ if c_name[0] < c'0' or c_name[0] > c'9':
+ if not is_hex:
+ return 0
+ if not (c'a' <= c_name[0] <= c'f'):
+ if not (c'A' <= c_name[0] <= c'F'):
+ return 0
+ c_name += 1
+ return 1
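+
+# A rough illustration: the checked name is the part after '&#' in a
+# character reference, so '1234' (decimal) and 'x1A2F' (hexadecimal) are
+# accepted, while 'xZZ', 'x' and the empty string are rejected.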
+
+cdef int _tagValidOrRaise(tag_utf) except -1:
+ if not _pyXmlNameIsValid(tag_utf):
+ raise ValueError(f"Invalid tag name {(<bytes>tag_utf).decode('utf8')!r}")
+ return 0
+
+cdef int _htmlTagValidOrRaise(tag_utf) except -1:
+ if not _pyHtmlNameIsValid(tag_utf):
+ raise ValueError(f"Invalid HTML tag name {(<bytes>tag_utf).decode('utf8')!r}")
+ return 0
+
+cdef int _attributeValidOrRaise(name_utf) except -1:
+ if not _pyXmlNameIsValid(name_utf):
+ raise ValueError(f"Invalid attribute name {(<bytes>name_utf).decode('utf8')!r}")
+ return 0
+
+cdef int _prefixValidOrRaise(tag_utf) except -1:
+ if not _pyXmlNameIsValid(tag_utf):
+ raise ValueError(f"Invalid namespace prefix {(<bytes>tag_utf).decode('utf8')!r}")
+ return 0
+
+cdef int _uriValidOrRaise(uri_utf) except -1:
+ cdef uri.xmlURI* c_uri = uri.xmlParseURI(_cstr(uri_utf))
+ if c_uri is NULL:
+ raise ValueError(f"Invalid namespace URI {(<bytes>uri_utf).decode('utf8')!r}")
+ uri.xmlFreeURI(c_uri)
+ return 0
+
+cdef inline object _namespacedName(xmlNode* c_node):
+ return _namespacedNameFromNsName(_getNs(c_node), c_node.name)
+
+cdef object _namespacedNameFromNsName(const_xmlChar* href, const_xmlChar* name):
+ if href is NULL:
+ return funicode(name)
+ elif not python.IS_PYPY and (python.LXML_UNICODE_STRINGS or isutf8(name) or isutf8(href)):
+ return python.PyUnicode_FromFormat("{%s}%s", href, name)
+ else:
+ s = python.PyBytes_FromFormat("{%s}%s", href, name)
+ if python.IS_PYPY and (python.LXML_UNICODE_STRINGS or isutf8l(s, len(s))):
+ return (<bytes>s).decode('utf8')
+ else:
+ return s
+
+cdef _getFilenameForFile(source):
+ u"""Given a Python File or Gzip object, give filename back.
+
+ Returns None if not a file object.
+ """
+ # urllib2 provides a geturl() method
+ try:
+ return source.geturl()
+ except:
+ pass
+ # file instances have a name attribute
+ try:
+ filename = source.name
+ if _isString(filename):
+ return os_path_abspath(filename)
+ except:
+ pass
+ # gzip file instances have a filename attribute (before Py3k)
+ try:
+ filename = source.filename
+ if _isString(filename):
+ return os_path_abspath(filename)
+ except:
+ pass
+ # can't determine filename
+ return None
diff --git a/src/lxml/builder.pxd b/src/lxml/builder.pxd
new file mode 100644
index 0000000..f6b2fb5
--- /dev/null
+++ b/src/lxml/builder.pxd
@@ -0,0 +1,10 @@
+# cython: language_level=2
+
+cdef object ET
+cdef object partial
+
+cdef class ElementMaker:
+ cdef readonly dict _nsmap
+ cdef readonly dict _typemap
+ cdef readonly object _namespace
+ cdef readonly object _makeelement
diff --git a/src/lxml/builder.py b/src/lxml/builder.py
new file mode 100644
index 0000000..a288845
--- /dev/null
+++ b/src/lxml/builder.py
@@ -0,0 +1,239 @@
+# cython: language_level=2
+
+#
+# Element generator factory by Fredrik Lundh.
+#
+# Source:
+# http://online.effbot.org/2006_11_01_archive.htm#et-builder
+# http://effbot.python-hosting.com/file/stuff/sandbox/elementlib/builder.py
+#
+# --------------------------------------------------------------------
+# The ElementTree toolkit is
+#
+# Copyright (c) 1999-2004 by Fredrik Lundh
+#
+# By obtaining, using, and/or copying this software and/or its
+# associated documentation, you agree that you have read, understood,
+# and will comply with the following terms and conditions:
+#
+# Permission to use, copy, modify, and distribute this software and
+# its associated documentation for any purpose and without fee is
+# hereby granted, provided that the above copyright notice appears in
+# all copies, and that both that copyright notice and this permission
+# notice appear in supporting documentation, and that the name of
+# Secret Labs AB or the author not be used in advertising or publicity
+# pertaining to distribution of the software without specific, written
+# prior permission.
+#
+# SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
+# TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
+# ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
+# BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
+# DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+# OF THIS SOFTWARE.
+# --------------------------------------------------------------------
+
+"""
+The ``E`` Element factory for generating XML documents.
+"""
+
+from __future__ import absolute_import
+
+import lxml.etree as ET
+
+from functools import partial
+
+try:
+ basestring
+except NameError:
+ basestring = str
+
+try:
+ unicode
+except NameError:
+ unicode = str
+
+
+class ElementMaker(object):
+ """Element generator factory.
+
+ Unlike the ordinary Element factory, the E factory allows you to pass in
+ more than just a tag and some optional attributes; you can also pass in
+ text and other elements. The text is added as either text or tail
+ attributes, and elements are inserted at the right spot. Some small
+ examples::
+
+ >>> from lxml import etree as ET
+ >>> from lxml.builder import E
+
+ >>> ET.tostring(E("tag"))
+ '<tag/>'
+ >>> ET.tostring(E("tag", "text"))
+ '<tag>text</tag>'
+ >>> ET.tostring(E("tag", "text", key="value"))
+ '<tag key="value">text</tag>'
+ >>> ET.tostring(E("tag", E("subtag", "text"), "tail"))
+ '<tag><subtag>text</subtag>tail</tag>'
+
+ For simple tags, the factory also allows you to write ``E.tag(...)`` instead
+ of ``E('tag', ...)``::
+
+ >>> ET.tostring(E.tag())
+ '<tag/>'
+ >>> ET.tostring(E.tag("text"))
+ '<tag>text</tag>'
+ >>> ET.tostring(E.tag(E.subtag("text"), "tail"))
+ '<tag><subtag>text</subtag>tail</tag>'
+
+ Here's a somewhat larger example; this shows how to generate HTML
+ documents, using a mix of prepared factory functions for inline elements,
+ nested ``E.tag`` calls, and embedded XHTML fragments::
+
+ # some common inline elements
+ A = E.a
+ I = E.i
+ B = E.b
+
+ def CLASS(v):
+ # helper function, 'class' is a reserved word
+ return {'class': v}
+
+ page = (
+ E.html(
+ E.head(
+ E.title("This is a sample document")
+ ),
+ E.body(
+ E.h1("Hello!", CLASS("title")),
+ E.p("This is a paragraph with ", B("bold"), " text in it!"),
+ E.p("This is another paragraph, with a ",
+ A("link", href="http://www.python.org"), "."),
+ E.p("Here are some reserved characters: <spam&egg>."),
+ ET.XML("<p>And finally, here is an embedded XHTML fragment.</p>"),
+ )
+ )
+ )
+
+ print ET.tostring(page)
+
+ Here's a prettyprinted version of the output from the above script::
+
+ <html>
+ <head>
+ <title>This is a sample document</title>
+ </head>
+ <body>
+ <h1 class="title">Hello!</h1>
+ <p>This is a paragraph with <b>bold</b> text in it!</p>
+ <p>This is another paragraph, with a <a href="http://www.python.org">link</a>.</p>
+ <p>Here are some reserved characters: &lt;spam&amp;egg&gt;.</p>
+ <p>And finally, here is an embedded XHTML fragment.</p>
+ </body>
+ </html>
+
+ For namespace support, you can pass a namespace map (``nsmap``)
+ and/or a specific target ``namespace`` to the ElementMaker class::
+
+ >>> E = ElementMaker(namespace="http://my.ns/")
+ >>> print(ET.tostring( E.test ))
+ <test xmlns="http://my.ns/"/>
+
+ >>> E = ElementMaker(namespace="http://my.ns/", nsmap={'p':'http://my.ns/'})
+ >>> print(ET.tostring( E.test ))
+ <p:test xmlns:p="http://my.ns/"/>
+ """
+
+ def __init__(self, typemap=None,
+ namespace=None, nsmap=None, makeelement=None):
+ if namespace is not None:
+ self._namespace = '{' + namespace + '}'
+ else:
+ self._namespace = None
+
+ if nsmap:
+ self._nsmap = dict(nsmap)
+ else:
+ self._nsmap = None
+
+ if makeelement is not None:
+ assert callable(makeelement)
+ self._makeelement = makeelement
+ else:
+ self._makeelement = ET.Element
+
+ # initialize type map for this element factory
+
+ if typemap:
+ typemap = dict(typemap)
+ else:
+ typemap = {}
+
+ def add_text(elem, item):
+ try:
+ elem[-1].tail = (elem[-1].tail or "") + item
+ except IndexError:
+ elem.text = (elem.text or "") + item
+
+ def add_cdata(elem, cdata):
+ if elem.text:
+ raise ValueError("Can't add a CDATA section. Element already has some text: %r" % elem.text)
+ elem.text = cdata
+
+ if str not in typemap:
+ typemap[str] = add_text
+ if unicode not in typemap:
+ typemap[unicode] = add_text
+ if ET.CDATA not in typemap:
+ typemap[ET.CDATA] = add_cdata
+
+ def add_dict(elem, item):
+ attrib = elem.attrib
+ for k, v in item.items():
+ if isinstance(v, basestring):
+ attrib[k] = v
+ else:
+ attrib[k] = typemap[type(v)](None, v)
+ if dict not in typemap:
+ typemap[dict] = add_dict
+
+ self._typemap = typemap
+
+ def __call__(self, tag, *children, **attrib):
+ typemap = self._typemap
+
+ if self._namespace is not None and tag[0] != '{':
+ tag = self._namespace + tag
+ elem = self._makeelement(tag, nsmap=self._nsmap)
+ if attrib:
+ typemap[dict](elem, attrib)
+
+ for item in children:
+ if callable(item):
+ item = item()
+ t = typemap.get(type(item))
+ if t is None:
+ if ET.iselement(item):
+ elem.append(item)
+ continue
+ for basetype in type(item).__mro__:
+ # See if the typemap knows of any of this type's bases.
+ t = typemap.get(basetype)
+ if t is not None:
+ break
+ else:
+ raise TypeError("bad argument type: %s(%r)" %
+ (type(item).__name__, item))
+ v = t(elem, item)
+ if v:
+ typemap.get(type(v))(elem, v)
+
+ return elem
+
+ def __getattr__(self, tag):
+ return partial(self, tag)
+
+
+# create factory object
+E = ElementMaker()
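+
+# A rough usage sketch of the dispatch in __call__ above (made-up tag and
+# attribute names): string children become text or tail, dict children
+# update the attributes, and element children are appended, so
+#   E.div(E.span('hi'), ' tail text', {'class': 'box'}, id='x')
+# would build <div id="x" class="box"><span>hi</span> tail text</div>.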
diff --git a/src/lxml/classlookup.pxi b/src/lxml/classlookup.pxi
new file mode 100644
index 0000000..137e111
--- /dev/null
+++ b/src/lxml/classlookup.pxi
@@ -0,0 +1,563 @@
+# Configurable Element class lookup
+
+################################################################################
+# Custom Element classes
+
+cdef public class ElementBase(_Element) [ type LxmlElementBaseType,
+ object LxmlElementBase ]:
+ u"""ElementBase(*children, attrib=None, nsmap=None, **_extra)
+
+ The public Element class. All custom Element classes must inherit
+ from this one. To create an Element, use the `Element()` factory.
+
+ BIG FAT WARNING: Subclasses *must not* override __init__ or
+ __new__ as it is absolutely undefined when these objects will be
+ created or destroyed. All persistent state of Elements must be
+ stored in the underlying XML. If you really need to initialize
+ the object after creation, you can implement an ``_init(self)``
+ method that will be called directly after object creation.
+
+ Subclasses of this class can be instantiated to create a new
+ Element. By default, the tag name will be the class name and the
+ namespace will be empty. You can modify this with the following
+ class attributes:
+
+ * TAG - the tag name, possibly containing a namespace in Clark
+ notation
+
+ * NAMESPACE - the default namespace URI, unless provided as part
+ of the TAG attribute.
+
+ * HTML - flag if the class is an HTML tag, as opposed to an XML
+ tag. This only applies to un-namespaced tags and defaults to
+ false (i.e. XML).
+
+ * PARSER - the parser that provides the configuration for the
+ newly created document. Providing an HTML parser here will
+ default to creating an HTML element.
+
+ In user code, the latter three are commonly inherited in class
+ hierarchies that implement a common namespace.
+ """
+ def __init__(self, *children, attrib=None, nsmap=None, **_extra):
+ u"""ElementBase(*children, attrib=None, nsmap=None, **_extra)
+ """
+ cdef bint is_html = 0
+ cdef _BaseParser parser
+ cdef _Element last_child
+ # don't use normal attribute access as it might be overridden
+ _getattr = object.__getattribute__
+ try:
+ namespace = _utf8(_getattr(self, 'NAMESPACE'))
+ except AttributeError:
+ namespace = None
+ try:
+ ns, tag = _getNsTag(_getattr(self, 'TAG'))
+ if ns is not None:
+ namespace = ns
+ except AttributeError:
+ tag = _utf8(_getattr(_getattr(self, '__class__'), '__name__'))
+ if b'.' in tag:
+ tag = tag.split(b'.')[-1]
+ try:
+ parser = _getattr(self, 'PARSER')
+ except AttributeError:
+ parser = None
+ for child in children:
+ if isinstance(child, _Element):
+ parser = (<_Element>child)._doc._parser
+ break
+ if isinstance(parser, HTMLParser):
+ is_html = 1
+ if namespace is None:
+ try:
+ is_html = _getattr(self, 'HTML')
+ except AttributeError:
+ pass
+ _initNewElement(self, is_html, tag, namespace, parser,
+ attrib, nsmap, _extra)
+ last_child = None
+ for child in children:
+ if _isString(child):
+ if last_child is None:
+ _setNodeText(self._c_node,
+ (_collectText(self._c_node.children) or '') + child)
+ else:
+ _setTailText(last_child._c_node,
+ (_collectText(last_child._c_node.next) or '') + child)
+ elif isinstance(child, _Element):
+ last_child = child
+ _appendChild(self, last_child)
+ elif isinstance(child, type) and issubclass(child, ElementBase):
+ last_child = child()
+ _appendChild(self, last_child)
+ else:
+ raise TypeError, f"Invalid child type: {type(child)!r}"
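+
+# A minimal subclassing sketch (assuming the public ``lxml.etree`` API, where
+# this class is exposed as ``etree.ElementBase``; names are illustrative):
+#
+#     from lxml import etree
+#
+#     class Point(etree.ElementBase):
+#         TAG = 'point'
+#         NAMESPACE = 'http://geo.example.org/ns'
+#
+#     p = Point('1.0', attrib={'unit': 'm'})
+#     p.tag           # -> '{http://geo.example.org/ns}point'
+#     p.text          # -> '1.0'
+#     p.get('unit')   # -> 'm'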
+
+cdef class CommentBase(_Comment):
+ u"""All custom Comment classes must inherit from this one.
+
+ To create an XML Comment instance, use the ``Comment()`` factory.
+
+ Subclasses *must not* override __init__ or __new__ as it is
+ absolutely undefined when these objects will be created or
+ destroyed. All persistent state of Comments must be stored in the
+ underlying XML. If you really need to initialize the object after
+ creation, you can implement an ``_init(self)`` method that will be
+ called after object creation.
+ """
+ def __init__(self, text):
+ # copied from Comment() factory
+ cdef _Document doc
+ cdef xmlDoc* c_doc
+ if text is None:
+ text = b''
+ else:
+ text = _utf8(text)
+ c_doc = _newXMLDoc()
+ doc = _documentFactory(c_doc, None)
+ self._c_node = _createComment(c_doc, _xcstr(text))
+ if self._c_node is NULL:
+ raise MemoryError()
+ tree.xmlAddChild(<xmlNode*>c_doc, self._c_node)
+ _registerProxy(self, doc, self._c_node)
+ self._init()
+
+cdef class PIBase(_ProcessingInstruction):
+ u"""All custom Processing Instruction classes must inherit from this one.
+
+ To create an XML ProcessingInstruction instance, use the ``PI()``
+ factory.
+
+ Subclasses *must not* override __init__ or __new__ as it is
+ absolutely undefined when these objects will be created or
+ destroyed. All persistent state of PIs must be stored in the
+ underlying XML. If you really need to initialize the object after
+ creation, you can implement an ``_init(self)`` method that will be
+ called after object creation.
+ """
+ def __init__(self, target, text=None):
+ # copied from PI() factory
+ cdef _Document doc
+ cdef xmlDoc* c_doc
+ target = _utf8(target)
+ if text is None:
+ text = b''
+ else:
+ text = _utf8(text)
+ c_doc = _newXMLDoc()
+ doc = _documentFactory(c_doc, None)
+ self._c_node = _createPI(c_doc, _xcstr(target), _xcstr(text))
+ if self._c_node is NULL:
+ raise MemoryError()
+ tree.xmlAddChild(<xmlNode*>c_doc, self._c_node)
+ _registerProxy(self, doc, self._c_node)
+ self._init()
+
+cdef class EntityBase(_Entity):
+ u"""All custom Entity classes must inherit from this one.
+
+ To create an XML Entity instance, use the ``Entity()`` factory.
+
+ Subclasses *must not* override __init__ or __new__ as it is
+ absolutely undefined when these objects will be created or
+ destroyed. All persistent state of Entities must be stored in the
+ underlying XML. If you really need to initialize the object after
+ creation, you can implement an ``_init(self)`` method that will be
+ called after object creation.
+ """
+ def __init__(self, name):
+ cdef _Document doc
+ cdef xmlDoc* c_doc
+ name_utf = _utf8(name)
+ c_name = _xcstr(name_utf)
+ if c_name[0] == c'#':
+ if not _characterReferenceIsValid(c_name + 1):
+ raise ValueError, f"Invalid character reference: '{name}'"
+ elif not _xmlNameIsValid(c_name):
+ raise ValueError, f"Invalid entity reference: '{name}'"
+ c_doc = _newXMLDoc()
+ doc = _documentFactory(c_doc, None)
+ self._c_node = _createEntity(c_doc, c_name)
+ if self._c_node is NULL:
+ raise MemoryError()
+ tree.xmlAddChild(<xmlNode*>c_doc, self._c_node)
+ _registerProxy(self, doc, self._c_node)
+ self._init()
+
+
+cdef int _validateNodeClass(xmlNode* c_node, cls) except -1:
+ if c_node.type == tree.XML_ELEMENT_NODE:
+ expected = ElementBase
+ elif c_node.type == tree.XML_COMMENT_NODE:
+ expected = CommentBase
+ elif c_node.type == tree.XML_ENTITY_REF_NODE:
+ expected = EntityBase
+ elif c_node.type == tree.XML_PI_NODE:
+ expected = PIBase
+ else:
+ assert False, f"Unknown node type: {c_node.type}"
+
+ if not (isinstance(cls, type) and issubclass(cls, expected)):
+ raise TypeError(
+ f"result of class lookup must be subclass of {type(expected)}, got {type(cls)}")
+ return 0
+
+
+################################################################################
+# Element class lookup
+
+ctypedef public object (*_element_class_lookup_function)(object, _Document, xmlNode*)
+
+# class to store element class lookup functions
+cdef public class ElementClassLookup [ type LxmlElementClassLookupType,
+ object LxmlElementClassLookup ]:
+ u"""ElementClassLookup(self)
+ Superclass of Element class lookups.
+ """
+ cdef _element_class_lookup_function _lookup_function
+
+
+cdef public class FallbackElementClassLookup(ElementClassLookup) \
+ [ type LxmlFallbackElementClassLookupType,
+ object LxmlFallbackElementClassLookup ]:
+ u"""FallbackElementClassLookup(self, fallback=None)
+
+ Superclass of Element class lookups with additional fallback.
+ """
+ cdef readonly ElementClassLookup fallback
+ cdef _element_class_lookup_function _fallback_function
+ def __cinit__(self):
+ # fall back to default lookup
+ self._fallback_function = _lookupDefaultElementClass
+
+ def __init__(self, ElementClassLookup fallback=None):
+ if fallback is not None:
+ self._setFallback(fallback)
+ else:
+ self._fallback_function = _lookupDefaultElementClass
+
+ cdef void _setFallback(self, ElementClassLookup lookup):
+ u"""Sets the fallback scheme for this lookup method.
+ """
+ self.fallback = lookup
+ self._fallback_function = lookup._lookup_function
+ if self._fallback_function is NULL:
+ self._fallback_function = _lookupDefaultElementClass
+
+ def set_fallback(self, ElementClassLookup lookup not None):
+ u"""set_fallback(self, lookup)
+
+ Sets the fallback scheme for this lookup method.
+ """
+ self._setFallback(lookup)
+
+cdef inline object _callLookupFallback(FallbackElementClassLookup lookup,
+ _Document doc, xmlNode* c_node):
+ return lookup._fallback_function(lookup.fallback, doc, c_node)
+
+
+################################################################################
+# default lookup scheme
+
+cdef class ElementDefaultClassLookup(ElementClassLookup):
+ u"""ElementDefaultClassLookup(self, element=None, comment=None, pi=None, entity=None)
+ Element class lookup scheme that always returns the default Element
+ class.
+
+ The keyword arguments ``element``, ``comment``, ``pi`` and ``entity``
+ accept the respective Element classes.
+ """
+ cdef readonly object element_class
+ cdef readonly object comment_class
+ cdef readonly object pi_class
+ cdef readonly object entity_class
+ def __cinit__(self):
+ self._lookup_function = _lookupDefaultElementClass
+
+ def __init__(self, element=None, comment=None, pi=None, entity=None):
+ if element is None:
+ self.element_class = _Element
+ elif issubclass(element, ElementBase):
+ self.element_class = element
+ else:
+ raise TypeError, u"element class must be subclass of ElementBase"
+
+ if comment is None:
+ self.comment_class = _Comment
+ elif issubclass(comment, CommentBase):
+ self.comment_class = comment
+ else:
+ raise TypeError, u"comment class must be subclass of CommentBase"
+
+ if entity is None:
+ self.entity_class = _Entity
+ elif issubclass(entity, EntityBase):
+ self.entity_class = entity
+ else:
+ raise TypeError, u"Entity class must be subclass of EntityBase"
+
+ if pi is None:
+ self.pi_class = None # special case, see below
+ elif issubclass(pi, PIBase):
+ self.pi_class = pi
+ else:
+ raise TypeError, u"PI class must be subclass of PIBase"
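+
+# A short usage sketch (assuming the public parser API, i.e.
+# ``XMLParser.set_element_class_lookup()``; class names are illustrative):
+#
+#     from lxml import etree
+#
+#     class MyElement(etree.ElementBase):
+#         pass
+#
+#     parser = etree.XMLParser()
+#     parser.set_element_class_lookup(
+#         etree.ElementDefaultClassLookup(element=MyElement))
+#     root = etree.fromstring('<root/>', parser)
+#     type(root)      # -> <class '...MyElement'>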
+
+cdef object _lookupDefaultElementClass(state, _Document _doc, xmlNode* c_node):
+ u"Trivial class lookup function that always returns the default class."
+ if c_node.type == tree.XML_ELEMENT_NODE:
+ if state is not None:
+ return (<ElementDefaultClassLookup>state).element_class
+ else:
+ return _Element
+ elif c_node.type == tree.XML_COMMENT_NODE:
+ if state is not None:
+ return (<ElementDefaultClassLookup>state).comment_class
+ else:
+ return _Comment
+ elif c_node.type == tree.XML_ENTITY_REF_NODE:
+ if state is not None:
+ return (<ElementDefaultClassLookup>state).entity_class
+ else:
+ return _Entity
+ elif c_node.type == tree.XML_PI_NODE:
+ if state is None or (<ElementDefaultClassLookup>state).pi_class is None:
+ # special case XSLT-PI
+ if c_node.name is not NULL and c_node.content is not NULL:
+ if tree.xmlStrcmp(c_node.name, <unsigned char*>"xml-stylesheet") == 0:
+ if tree.xmlStrstr(c_node.content, <unsigned char*>"text/xsl") is not NULL or \
+ tree.xmlStrstr(c_node.content, <unsigned char*>"text/xml") is not NULL:
+ return _XSLTProcessingInstruction
+ return _ProcessingInstruction
+ else:
+ return (<ElementDefaultClassLookup>state).pi_class
+ else:
+ assert False, f"Unknown node type: {c_node.type}"
+
+
+################################################################################
+# attribute based lookup scheme
+
+cdef class AttributeBasedElementClassLookup(FallbackElementClassLookup):
+ u"""AttributeBasedElementClassLookup(self, attribute_name, class_mapping, fallback=None)
+ Checks an attribute of an Element and looks up the value in a
+ class dictionary.
+
+ Arguments:
+ - attribute name - '{ns}name' style string
+ - class mapping - Python dict mapping attribute values to Element classes
+ - fallback - optional fallback lookup mechanism
+
+    If the attribute is missing, the ``None`` key of the class mapping
+    is looked up instead.
+ """
+ cdef object _class_mapping
+ cdef tuple _pytag
+ cdef const_xmlChar* _c_ns
+ cdef const_xmlChar* _c_name
+ def __cinit__(self):
+ self._lookup_function = _attribute_class_lookup
+
+ def __init__(self, attribute_name, class_mapping,
+ ElementClassLookup fallback=None):
+ self._pytag = _getNsTag(attribute_name)
+ ns, name = self._pytag
+ if ns is None:
+ self._c_ns = NULL
+ else:
+ self._c_ns = _xcstr(ns)
+ self._c_name = _xcstr(name)
+ self._class_mapping = dict(class_mapping)
+
+ FallbackElementClassLookup.__init__(self, fallback)
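+
+# A short usage sketch (assuming ``XMLParser.set_element_class_lookup()`` from
+# the public API; the attribute and class names are made up for illustration):
+#
+#     from lxml import etree
+#
+#     class HonkElement(etree.ElementBase):
+#         pass
+#
+#     lookup = etree.AttributeBasedElementClassLookup(
+#         'honking', {'true': HonkElement, None: etree.ElementBase})
+#     parser = etree.XMLParser()
+#     parser.set_element_class_lookup(lookup)
+#     root = etree.fromstring('<root honking="true"><child/></root>', parser)
+#     type(root).__name__       # -> 'HonkElement'
+#     type(root[0]).__name__    # -> 'ElementBase'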
+
+cdef object _attribute_class_lookup(state, _Document doc, xmlNode* c_node):
+ cdef AttributeBasedElementClassLookup lookup
+ cdef python.PyObject* dict_result
+
+ lookup = <AttributeBasedElementClassLookup>state
+ if c_node.type == tree.XML_ELEMENT_NODE:
+ value = _attributeValueFromNsName(
+ c_node, lookup._c_ns, lookup._c_name)
+ dict_result = python.PyDict_GetItem(lookup._class_mapping, value)
+ if dict_result is not NULL:
+ cls = <object>dict_result
+ _validateNodeClass(c_node, cls)
+ return cls
+ return _callLookupFallback(lookup, doc, c_node)
+
+
+################################################################################
+# per-parser lookup scheme
+
+cdef class ParserBasedElementClassLookup(FallbackElementClassLookup):
+ u"""ParserBasedElementClassLookup(self, fallback=None)
+ Element class lookup based on the XML parser.
+ """
+ def __cinit__(self):
+ self._lookup_function = _parser_class_lookup
+
+cdef object _parser_class_lookup(state, _Document doc, xmlNode* c_node):
+ if doc._parser._class_lookup is not None:
+ return doc._parser._class_lookup._lookup_function(
+ doc._parser._class_lookup, doc, c_node)
+ return _callLookupFallback(<FallbackElementClassLookup>state, doc, c_node)
+
+
+################################################################################
+# custom class lookup based on node type, namespace, name
+
+cdef class CustomElementClassLookup(FallbackElementClassLookup):
+ u"""CustomElementClassLookup(self, fallback=None)
+ Element class lookup based on a subclass method.
+
+ You can inherit from this class and override the method::
+
+ lookup(self, type, doc, namespace, name)
+
+    to look up the element class for a node.  Arguments of the method:
+ * type: one of 'element', 'comment', 'PI', 'entity'
+ * doc: document that the node is in
+ * namespace: namespace URI of the node (or None for comments/PIs/entities)
+ * name: name of the element/entity, None for comments, target for PIs
+
+ If you return None from this method, the fallback will be called.
+ """
+ def __cinit__(self):
+ self._lookup_function = _custom_class_lookup
+
+ def lookup(self, type, doc, namespace, name):
+ u"lookup(self, type, doc, namespace, name)"
+ return None
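+
+# A short subclassing sketch (class names are made up; assumes the lookup is
+# installed via ``XMLParser.set_element_class_lookup()`` as usual):
+#
+#     from lxml import etree
+#
+#     class BlockElement(etree.ElementBase):
+#         pass
+#
+#     class MyLookup(etree.CustomElementClassLookup):
+#         def lookup(self, node_type, document, namespace, name):
+#             if node_type == 'element' and name == 'block':
+#                 return BlockElement
+#             return None    # delegate to the fallback
+#
+#     parser = etree.XMLParser()
+#     parser.set_element_class_lookup(MyLookup())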
+
+cdef object _custom_class_lookup(state, _Document doc, xmlNode* c_node):
+ cdef CustomElementClassLookup lookup
+
+ lookup = <CustomElementClassLookup>state
+
+ if c_node.type == tree.XML_ELEMENT_NODE:
+ element_type = u"element"
+ elif c_node.type == tree.XML_COMMENT_NODE:
+ element_type = u"comment"
+ elif c_node.type == tree.XML_PI_NODE:
+ element_type = u"PI"
+ elif c_node.type == tree.XML_ENTITY_REF_NODE:
+ element_type = u"entity"
+ else:
+ element_type = u"element"
+ if c_node.name is NULL:
+ name = None
+ else:
+ name = funicode(c_node.name)
+ c_str = tree._getNs(c_node)
+ ns = funicode(c_str) if c_str is not NULL else None
+
+ cls = lookup.lookup(element_type, doc, ns, name)
+ if cls is not None:
+ _validateNodeClass(c_node, cls)
+ return cls
+ return _callLookupFallback(lookup, doc, c_node)
+
+
+################################################################################
+# read-only tree based class lookup
+
+cdef class PythonElementClassLookup(FallbackElementClassLookup):
+ u"""PythonElementClassLookup(self, fallback=None)
+ Element class lookup based on a subclass method.
+
+ This class lookup scheme allows access to the entire XML tree in
+ read-only mode. To use it, re-implement the ``lookup(self, doc,
+ root)`` method in a subclass::
+
+ from lxml import etree, pyclasslookup
+
+ class MyElementClass(etree.ElementBase):
+ honkey = True
+
+ class MyLookup(pyclasslookup.PythonElementClassLookup):
+ def lookup(self, doc, root):
+ if root.tag == "sometag":
+ return MyElementClass
+ else:
+ for child in root:
+ if child.tag == "someothertag":
+ return MyElementClass
+ # delegate to default
+ return None
+
+ If you return None from this method, the fallback will be called.
+
+ The first argument is the opaque document instance that contains
+ the Element. The second argument is a lightweight Element proxy
+ implementation that is only valid during the lookup. Do not try
+ to keep a reference to it. Once the lookup is done, the proxy
+ will be invalid.
+
+    Also, you cannot wrap such a read-only Element in an ElementTree,
+    and you must take care not to keep a reference to it outside of
+    the `lookup()` method.
+
+ Note that the API of the Element objects is not complete. It is
+ purely read-only and does not support all features of the normal
+ `lxml.etree` API (such as XPath, extended slicing or some
+ iteration methods).
+
+ See https://lxml.de/element_classes.html
+ """
+ def __cinit__(self):
+ self._lookup_function = _python_class_lookup
+
+ def lookup(self, doc, element):
+ u"""lookup(self, doc, element)
+
+ Override this method to implement your own lookup scheme.
+ """
+ return None
+
+cdef object _python_class_lookup(state, _Document doc, tree.xmlNode* c_node):
+ cdef PythonElementClassLookup lookup
+ cdef _ReadOnlyProxy proxy
+ lookup = <PythonElementClassLookup>state
+
+ proxy = _newReadOnlyProxy(None, c_node)
+ cls = lookup.lookup(doc, proxy)
+ _freeReadOnlyProxies(proxy)
+
+ if cls is not None:
+ _validateNodeClass(c_node, cls)
+ return cls
+ return _callLookupFallback(lookup, doc, c_node)
+
+################################################################################
+# Global setup
+
+cdef _element_class_lookup_function LOOKUP_ELEMENT_CLASS
+cdef object ELEMENT_CLASS_LOOKUP_STATE
+
+cdef void _setElementClassLookupFunction(
+ _element_class_lookup_function function, object state):
+ global LOOKUP_ELEMENT_CLASS, ELEMENT_CLASS_LOOKUP_STATE
+ if function is NULL:
+ state = DEFAULT_ELEMENT_CLASS_LOOKUP
+ function = DEFAULT_ELEMENT_CLASS_LOOKUP._lookup_function
+
+ ELEMENT_CLASS_LOOKUP_STATE = state
+ LOOKUP_ELEMENT_CLASS = function
+
+def set_element_class_lookup(ElementClassLookup lookup = None):
+ u"""set_element_class_lookup(lookup = None)
+
+ Set the global default element class lookup method.
+ """
+ if lookup is None or lookup._lookup_function is NULL:
+ _setElementClassLookupFunction(NULL, None)
+ else:
+ _setElementClassLookupFunction(lookup._lookup_function, lookup)
+
+# default setup: parser delegation
+cdef ParserBasedElementClassLookup DEFAULT_ELEMENT_CLASS_LOOKUP
+DEFAULT_ELEMENT_CLASS_LOOKUP = ParserBasedElementClassLookup()
+
+set_element_class_lookup(DEFAULT_ELEMENT_CLASS_LOOKUP)
diff --git a/src/lxml/cleanup.pxi b/src/lxml/cleanup.pxi
new file mode 100644
index 0000000..ca9f5c6
--- /dev/null
+++ b/src/lxml/cleanup.pxi
@@ -0,0 +1,215 @@
+# functions for tree cleanup and removing elements from subtrees
+
+def cleanup_namespaces(tree_or_element, top_nsmap=None, keep_ns_prefixes=None):
+ u"""cleanup_namespaces(tree_or_element, top_nsmap=None, keep_ns_prefixes=None)
+
+ Remove all namespace declarations from a subtree that are not used
+ by any of the elements or attributes in that tree.
+
+ If a 'top_nsmap' is provided, it must be a mapping from prefixes
+ to namespace URIs. These namespaces will be declared on the top
+ element of the subtree before running the cleanup, which allows
+ moving namespace declarations to the top of the tree.
+
+ If a 'keep_ns_prefixes' is provided, it must be a list of prefixes.
+ These prefixes will not be removed as part of the cleanup.
+ """
+ element = _rootNodeOrRaise(tree_or_element)
+ c_element = element._c_node
+
+ if top_nsmap:
+ doc = element._doc
+ # declare namespaces from nsmap, then apply them to the subtree
+ _setNodeNamespaces(c_element, doc, None, top_nsmap)
+ moveNodeToDocument(doc, c_element.doc, c_element)
+
+ keep_ns_prefixes = (
+ set([_utf8(prefix) for prefix in keep_ns_prefixes])
+ if keep_ns_prefixes else None)
+
+ _removeUnusedNamespaceDeclarations(c_element, keep_ns_prefixes)
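+
+# A short usage sketch (assuming this function is exposed as
+# ``etree.cleanup_namespaces()`` in the public API):
+#
+#     from lxml import etree
+#
+#     root = etree.fromstring(
+#         '<root xmlns:a="http://a/" xmlns:b="http://b/"><a:child/></root>')
+#     etree.cleanup_namespaces(root)
+#     etree.tostring(root)
+#     # -> b'<root xmlns:a="http://a/"><a:child/></root>'  ('b' was unused)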
+
+
+def strip_attributes(tree_or_element, *attribute_names):
+ u"""strip_attributes(tree_or_element, *attribute_names)
+
+ Delete all attributes with the provided attribute names from an
+ Element (or ElementTree) and its descendants.
+
+ Attribute names can contain wildcards as in `_Element.iter`.
+
+ Example usage::
+
+ strip_attributes(root_element,
+ 'simpleattr',
+ '{http://some/ns}attrname',
+ '{http://other/ns}*')
+ """
+ cdef _MultiTagMatcher matcher
+ element = _rootNodeOrRaise(tree_or_element)
+ if not attribute_names:
+ return
+
+ matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, attribute_names)
+ matcher.cacheTags(element._doc)
+ if matcher.rejectsAllAttributes():
+ return
+ _strip_attributes(element._c_node, matcher)
+
+
+cdef _strip_attributes(xmlNode* c_node, _MultiTagMatcher matcher):
+ cdef xmlAttr* c_attr
+ cdef xmlAttr* c_next_attr
+ tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
+ if c_node.type == tree.XML_ELEMENT_NODE:
+ c_attr = c_node.properties
+ while c_attr is not NULL:
+ c_next_attr = c_attr.next
+ if matcher.matchesAttribute(c_attr):
+ tree.xmlRemoveProp(c_attr)
+ c_attr = c_next_attr
+ tree.END_FOR_EACH_ELEMENT_FROM(c_node)
+
+
+def strip_elements(tree_or_element, *tag_names, bint with_tail=True):
+ u"""strip_elements(tree_or_element, *tag_names, with_tail=True)
+
+ Delete all elements with the provided tag names from a tree or
+ subtree. This will remove the elements and their entire subtree,
+ including all their attributes, text content and descendants. It
+ will also remove the tail text of the element unless you
+    explicitly set the ``with_tail`` keyword argument to False.
+
+ Tag names can contain wildcards as in `_Element.iter`.
+
+ Note that this will not delete the element (or ElementTree root
+ element) that you passed even if it matches. It will only treat
+ its descendants. If you want to include the root element, check
+ its tag name directly before even calling this function.
+
+ Example usage::
+
+        strip_elements(some_element,
+            'simpletagname',              # non-namespaced tag
+            '{http://some/ns}tagname',    # namespaced tag
+            '{http://some/other/ns}*',    # any tag from a namespace
+            lxml.etree.Comment            # comments
+            )
+ """
+ cdef _MultiTagMatcher matcher
+ doc = _documentOrRaise(tree_or_element)
+ element = _rootNodeOrRaise(tree_or_element)
+ if not tag_names:
+ return
+
+ matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tag_names)
+ matcher.cacheTags(doc)
+ if matcher.rejectsAll():
+ return
+
+ if isinstance(tree_or_element, _ElementTree):
+ # include PIs and comments next to the root node
+ if matcher.matchesType(tree.XML_COMMENT_NODE):
+ _removeSiblings(element._c_node, tree.XML_COMMENT_NODE, with_tail)
+ if matcher.matchesType(tree.XML_PI_NODE):
+ _removeSiblings(element._c_node, tree.XML_PI_NODE, with_tail)
+ _strip_elements(doc, element._c_node, matcher, with_tail)
+
+cdef _strip_elements(_Document doc, xmlNode* c_node, _MultiTagMatcher matcher,
+ bint with_tail):
+ cdef xmlNode* c_child
+ cdef xmlNode* c_next
+
+ tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
+ if c_node.type == tree.XML_ELEMENT_NODE:
+ # we run through the children here to prevent any problems
+ # with the tree iteration which would occur if we unlinked the
+ # c_node itself
+ c_child = _findChildForwards(c_node, 0)
+ while c_child is not NULL:
+ c_next = _nextElement(c_child)
+ if matcher.matches(c_child):
+ if c_child.type == tree.XML_ELEMENT_NODE:
+ if not with_tail:
+ tree.xmlUnlinkNode(c_child)
+ _removeNode(doc, c_child)
+ else:
+ if with_tail:
+ _removeText(c_child.next)
+ tree.xmlUnlinkNode(c_child)
+ attemptDeallocation(c_child)
+ c_child = c_next
+ tree.END_FOR_EACH_ELEMENT_FROM(c_node)
+
+
+def strip_tags(tree_or_element, *tag_names):
+ u"""strip_tags(tree_or_element, *tag_names)
+
+ Delete all elements with the provided tag names from a tree or
+ subtree. This will remove the elements and their attributes, but
+ *not* their text/tail content or descendants. Instead, it will
+ merge the text content and children of the element into its
+ parent.
+
+ Tag names can contain wildcards as in `_Element.iter`.
+
+ Note that this will not delete the element (or ElementTree root
+ element) that you passed even if it matches. It will only treat
+ its descendants.
+
+ Example usage::
+
+        strip_tags(some_element,
+            'simpletagname',              # non-namespaced tag
+            '{http://some/ns}tagname',    # namespaced tag
+            '{http://some/other/ns}*',    # any tag from a namespace
+            Comment                       # comments (including their text!)
+            )
+ """
+ cdef _MultiTagMatcher matcher
+ doc = _documentOrRaise(tree_or_element)
+ element = _rootNodeOrRaise(tree_or_element)
+ if not tag_names:
+ return
+
+ matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tag_names)
+ matcher.cacheTags(doc)
+ if matcher.rejectsAll():
+ return
+
+ if isinstance(tree_or_element, _ElementTree):
+ # include PIs and comments next to the root node
+ if matcher.matchesType(tree.XML_COMMENT_NODE):
+ _removeSiblings(element._c_node, tree.XML_COMMENT_NODE, 0)
+ if matcher.matchesType(tree.XML_PI_NODE):
+ _removeSiblings(element._c_node, tree.XML_PI_NODE, 0)
+ _strip_tags(doc, element._c_node, matcher)
+
+cdef _strip_tags(_Document doc, xmlNode* c_node, _MultiTagMatcher matcher):
+ cdef xmlNode* c_child
+ cdef xmlNode* c_next
+
+ tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
+ if c_node.type == tree.XML_ELEMENT_NODE:
+ # we run through the children here to prevent any problems
+ # with the tree iteration which would occur if we unlinked the
+ # c_node itself
+ c_child = _findChildForwards(c_node, 0)
+ while c_child is not NULL:
+ if not matcher.matches(c_child):
+ c_child = _nextElement(c_child)
+ continue
+ if c_child.type == tree.XML_ELEMENT_NODE:
+ c_next = _findChildForwards(c_child, 0) or _nextElement(c_child)
+ _replaceNodeByChildren(doc, c_child)
+ if not attemptDeallocation(c_child):
+ if c_child.nsDef is not NULL:
+ # make namespaces absolute
+ moveNodeToDocument(doc, doc._c_doc, c_child)
+ c_child = c_next
+ else:
+ c_next = _nextElement(c_child)
+ tree.xmlUnlinkNode(c_child)
+ attemptDeallocation(c_child)
+ c_child = c_next
+ tree.END_FOR_EACH_ELEMENT_FROM(c_node)
diff --git a/src/lxml/cssselect.py b/src/lxml/cssselect.py
new file mode 100644
index 0000000..586a142
--- /dev/null
+++ b/src/lxml/cssselect.py
@@ -0,0 +1,102 @@
+"""CSS Selectors based on XPath.
+
+This module supports selecting XML/HTML tags based on CSS selectors.
+See the `CSSSelector` class for details.
+
+This is a thin wrapper around cssselect 0.7 or later.
+"""
+
+from __future__ import absolute_import
+
+from . import etree
+try:
+ import cssselect as external_cssselect
+except ImportError:
+ raise ImportError(
+ 'cssselect does not seem to be installed. '
+ 'See http://packages.python.org/cssselect/')
+
+
+SelectorSyntaxError = external_cssselect.SelectorSyntaxError
+ExpressionError = external_cssselect.ExpressionError
+SelectorError = external_cssselect.SelectorError
+
+
+__all__ = ['SelectorSyntaxError', 'ExpressionError', 'SelectorError',
+ 'CSSSelector']
+
+
+class LxmlTranslator(external_cssselect.GenericTranslator):
+ """
+ A custom CSS selector to XPath translator with lxml-specific extensions.
+ """
+ def xpath_contains_function(self, xpath, function):
+ # Defined there, removed in later drafts:
+ # http://www.w3.org/TR/2001/CR-css3-selectors-20011113/#content-selectors
+ if function.argument_types() not in (['STRING'], ['IDENT']):
+ raise ExpressionError(
+ "Expected a single string or ident for :contains(), got %r"
+ % function.arguments)
+ value = function.arguments[0].value
+ return xpath.add_condition(
+ 'contains(__lxml_internal_css:lower-case(string(.)), %s)'
+ % self.xpath_literal(value.lower()))
+
+
+class LxmlHTMLTranslator(LxmlTranslator, external_cssselect.HTMLTranslator):
+ """
+ lxml extensions + HTML support.
+ """
+
+
+def _make_lower_case(context, s):
+ return s.lower()
+
+ns = etree.FunctionNamespace('http://codespeak.net/lxml/css/')
+ns.prefix = '__lxml_internal_css'
+ns['lower-case'] = _make_lower_case
+
+
+class CSSSelector(etree.XPath):
+ """A CSS selector.
+
+ Usage::
+
+ >>> from lxml import etree, cssselect
+ >>> select = cssselect.CSSSelector("a tag > child")
+
+ >>> root = etree.XML("<a><b><c/><tag><child>TEXT</child></tag></b></a>")
+ >>> [ el.tag for el in select(root) ]
+ ['child']
+
+ To use CSS namespaces, you need to pass a prefix-to-namespace
+ mapping as ``namespaces`` keyword argument::
+
+ >>> rdfns = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'
+ >>> select_ns = cssselect.CSSSelector('root > rdf|Description',
+ ... namespaces={'rdf': rdfns})
+
+ >>> rdf = etree.XML((
+ ... '<root xmlns:rdf="%s">'
+ ... '<rdf:Description>blah</rdf:Description>'
+ ... '</root>') % rdfns)
+ >>> [(el.tag, el.text) for el in select_ns(rdf)]
+ [('{http://www.w3.org/1999/02/22-rdf-syntax-ns#}Description', 'blah')]
+
+ """
+ def __init__(self, css, namespaces=None, translator='xml'):
+ if translator == 'xml':
+ translator = LxmlTranslator()
+ elif translator == 'html':
+ translator = LxmlHTMLTranslator()
+ elif translator == 'xhtml':
+ translator = LxmlHTMLTranslator(xhtml=True)
+ path = translator.css_to_xpath(css)
+ etree.XPath.__init__(self, path, namespaces=namespaces)
+ self.css = css
+
+ def __repr__(self):
+ return '<%s %s for %r>' % (
+ self.__class__.__name__,
+ hex(abs(id(self)))[2:],
+ self.css)
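+
+# A short usage sketch for the ``translator`` argument (assumes the external
+# ``cssselect`` package is installed; 'html' enables the HTML-specific rules
+# of cssselect's HTMLTranslator, e.g. pseudo-classes like ``:checked``):
+#
+#     from lxml import etree
+#     from lxml.cssselect import CSSSelector
+#
+#     select = CSSSelector('div.content a', translator='html')
+#     doc = etree.HTML('<div class="content"><a href="#x">link</a></div>')
+#     [el.get('href') for el in select(doc)]
+#     # -> ['#x']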
diff --git a/src/lxml/cvarargs.pxd b/src/lxml/cvarargs.pxd
new file mode 100644
index 0000000..5fe9b89
--- /dev/null
+++ b/src/lxml/cvarargs.pxd
@@ -0,0 +1,8 @@
+cdef extern from "stdarg.h":
+ ctypedef void *va_list
+ void va_start(va_list ap, void *last) nogil
+ void va_end(va_list ap) nogil
+
+cdef extern from "includes/etree_defs.h":
+ cdef int va_int(va_list ap) nogil
+ cdef char *va_charptr(va_list ap) nogil
diff --git a/src/lxml/debug.pxi b/src/lxml/debug.pxi
new file mode 100644
index 0000000..a0dc62e
--- /dev/null
+++ b/src/lxml/debug.pxi
@@ -0,0 +1,91 @@
+
+@cython.final
+@cython.internal
+cdef class _MemDebug:
+ """Debugging support for the memory allocation in libxml2.
+ """
+ def bytes_used(self):
+ """bytes_used(self)
+
+ Returns the total amount of memory (in bytes) currently used by libxml2.
+ Note that libxml2 constrains this value to a C int, which limits
+ the accuracy on 64 bit systems.
+ """
+ return tree.xmlMemUsed()
+
+ def blocks_used(self):
+ """blocks_used(self)
+
+ Returns the total number of memory blocks currently allocated by libxml2.
+ Note that libxml2 constrains this value to a C int, which limits
+ the accuracy on 64 bit systems.
+ """
+ return tree.xmlMemBlocks()
+
+ def dict_size(self):
+ """dict_size(self)
+
+ Returns the current size of the global name dictionary used by libxml2
+ for the current thread. Each thread has its own dictionary.
+ """
+ c_dict = __GLOBAL_PARSER_CONTEXT._getThreadDict(NULL)
+ if c_dict is NULL:
+ raise MemoryError()
+ return tree.xmlDictSize(c_dict)
+
+ def dump(self, output_file=None, byte_count=None):
+ """dump(self, output_file=None, byte_count=None)
+
+ Dumps the current memory blocks allocated by libxml2 to a file.
+
+ The optional parameter 'output_file' specifies the file path. It defaults
+ to the file ".memorylist" in the current directory.
+
+ The optional parameter 'byte_count' limits the number of bytes in the dump.
+ Note that this parameter is ignored when lxml is compiled against a libxml2
+ version before 2.7.0.
+ """
+ cdef Py_ssize_t c_count
+ if output_file is None:
+ output_file = b'.memorylist'
+ elif isinstance(output_file, unicode):
+            output_file = output_file.encode(sys.getfilesystemencoding())
+
+ f = stdio.fopen(output_file, "w")
+ if f is NULL:
+ raise IOError(f"Failed to create file {output_file.decode(sys.getfilesystemencoding())}")
+ try:
+ if byte_count is None:
+ tree.xmlMemDisplay(f)
+ else:
+ c_count = byte_count
+ tree.xmlMemDisplayLast(f, c_count)
+ finally:
+ stdio.fclose(f)
+
+ def show(self, output_file=None, block_count=None):
+ """show(self, output_file=None, block_count=None)
+
+ Dumps the current memory blocks allocated by libxml2 to a file.
+ The output file format is suitable for line diffing.
+
+ The optional parameter 'output_file' specifies the file path. It defaults
+ to the file ".memorydump" in the current directory.
+
+ The optional parameter 'block_count' limits the number of blocks
+ in the dump.
+ """
+ if output_file is None:
+ output_file = b'.memorydump'
+ elif isinstance(output_file, unicode):
+            output_file = output_file.encode(sys.getfilesystemencoding())
+
+ f = stdio.fopen(output_file, "w")
+ if f is NULL:
+ raise IOError(f"Failed to create file {output_file.decode(sys.getfilesystemencoding())}")
+ try:
+ tree.xmlMemShow(f, block_count if block_count is not None else tree.xmlMemBlocks())
+ finally:
+ stdio.fclose(f)
+
+memory_debugger = _MemDebug()
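+
+# A short usage sketch (assuming this object is exposed as
+# ``etree.memory_debugger``; the numbers are only meaningful when libxml2's
+# debug memory allocator is active):
+#
+#     from lxml import etree
+#
+#     print(etree.memory_debugger.bytes_used())    # bytes held by libxml2
+#     print(etree.memory_debugger.blocks_used())   # allocated block count
+#     etree.memory_debugger.dump('libxml2-allocations.txt')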
diff --git a/src/lxml/docloader.pxi b/src/lxml/docloader.pxi
new file mode 100644
index 0000000..83ad612
--- /dev/null
+++ b/src/lxml/docloader.pxi
@@ -0,0 +1,178 @@
+# Custom resolver API
+
+ctypedef enum _InputDocumentDataType:
+ PARSER_DATA_INVALID
+ PARSER_DATA_EMPTY
+ PARSER_DATA_STRING
+ PARSER_DATA_FILENAME
+ PARSER_DATA_FILE
+
+@cython.final
+@cython.internal
+cdef class _InputDocument:
+ cdef _InputDocumentDataType _type
+ cdef bytes _data_bytes
+ cdef object _filename
+ cdef object _file
+ cdef bint _close_file
+
+ def __cinit__(self):
+ self._type = PARSER_DATA_INVALID
+
+
+cdef class Resolver:
+ u"This is the base class of all resolvers."
+ def resolve(self, system_url, public_id, context):
+ u"""resolve(self, system_url, public_id, context)
+
+ Override this method to resolve an external source by
+ ``system_url`` and ``public_id``. The third argument is an
+ opaque context object.
+
+ Return the result of one of the ``resolve_*()`` methods.
+ """
+ return None
+
+ def resolve_empty(self, context):
+ u"""resolve_empty(self, context)
+
+ Return an empty input document.
+
+ Pass context as parameter.
+ """
+ cdef _InputDocument doc_ref
+ doc_ref = _InputDocument()
+ doc_ref._type = PARSER_DATA_EMPTY
+ return doc_ref
+
+ def resolve_string(self, string, context, *, base_url=None):
+ u"""resolve_string(self, string, context, base_url=None)
+
+ Return a parsable string as input document.
+
+ Pass data string and context as parameters. You can pass the
+ source URL or filename through the ``base_url`` keyword
+ argument.
+ """
+ cdef _InputDocument doc_ref
+ if isinstance(string, unicode):
+ string = (<unicode>string).encode('utf8')
+ elif not isinstance(string, bytes):
+ raise TypeError, "argument must be a byte string or unicode string"
+ doc_ref = _InputDocument()
+ doc_ref._type = PARSER_DATA_STRING
+ doc_ref._data_bytes = string
+ if base_url is not None:
+ doc_ref._filename = _encodeFilename(base_url)
+ return doc_ref
+
+ def resolve_filename(self, filename, context):
+ u"""resolve_filename(self, filename, context)
+
+ Return the name of a parsable file as input document.
+
+ Pass filename and context as parameters. You can also pass a
+ URL with an HTTP, FTP or file target.
+ """
+ cdef _InputDocument doc_ref
+ doc_ref = _InputDocument()
+ doc_ref._type = PARSER_DATA_FILENAME
+ doc_ref._filename = _encodeFilename(filename)
+ return doc_ref
+
+ def resolve_file(self, f, context, *, base_url=None, bint close=True):
+ u"""resolve_file(self, f, context, base_url=None, close=True)
+
+ Return an open file-like object as input document.
+
+ Pass open file and context as parameters. You can pass the
+ base URL or filename of the file through the ``base_url``
+ keyword argument. If the ``close`` flag is True (the
+ default), the file will be closed after reading.
+
+ Note that using ``.resolve_filename()`` is more efficient,
+ especially in threaded environments.
+ """
+ cdef _InputDocument doc_ref
+ try:
+ f.read
+ except AttributeError:
+ raise TypeError, u"Argument is not a file-like object"
+ doc_ref = _InputDocument()
+ doc_ref._type = PARSER_DATA_FILE
+ if base_url is not None:
+ doc_ref._filename = _encodeFilename(base_url)
+ else:
+ doc_ref._filename = _getFilenameForFile(f)
+ doc_ref._close_file = close
+ doc_ref._file = f
+ return doc_ref
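+
+# A short usage sketch for a custom resolver (assumes the public API where
+# parsers expose a ``resolvers`` registry; the URL and file path are made up):
+#
+#     from lxml import etree
+#
+#     class LocalDTDResolver(etree.Resolver):
+#         def resolve(self, system_url, public_id, context):
+#             if system_url == 'http://example.org/some.dtd':
+#                 return self.resolve_filename('local/some.dtd', context)
+#             return None    # let other resolvers or the default handle it
+#
+#     parser = etree.XMLParser(load_dtd=True)
+#     parser.resolvers.add(LocalDTDResolver())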
+
+@cython.final
+@cython.internal
+cdef class _ResolverRegistry:
+ cdef object _resolvers
+ cdef Resolver _default_resolver
+ def __cinit__(self, Resolver default_resolver=None):
+ self._resolvers = set()
+ self._default_resolver = default_resolver
+
+ def add(self, Resolver resolver not None):
+ u"""add(self, resolver)
+
+ Register a resolver.
+
+ For each requested entity, the 'resolve' method of the resolver will
+ be called and the result will be passed to the parser. If this method
+ returns None, the request will be delegated to other resolvers or the
+ default resolver. The resolvers will be tested in an arbitrary order
+ until the first match is found.
+ """
+ self._resolvers.add(resolver)
+
+ def remove(self, resolver):
+ u"remove(self, resolver)"
+ self._resolvers.discard(resolver)
+
+ cdef _ResolverRegistry _copy(self):
+ cdef _ResolverRegistry registry
+ registry = _ResolverRegistry(self._default_resolver)
+ registry._resolvers = self._resolvers.copy()
+ return registry
+
+ def copy(self):
+ u"copy(self)"
+ return self._copy()
+
+ def resolve(self, system_url, public_id, context):
+ u"resolve(self, system_url, public_id, context)"
+ for resolver in self._resolvers:
+ result = resolver.resolve(system_url, public_id, context)
+ if result is not None:
+ return result
+ if self._default_resolver is None:
+ return None
+ return self._default_resolver.resolve(system_url, public_id, context)
+
+ def __repr__(self):
+ return repr(self._resolvers)
+
+
+@cython.internal
+cdef class _ResolverContext(_ExceptionContext):
+ cdef _ResolverRegistry _resolvers
+ cdef _TempStore _storage
+
+ cdef int clear(self) except -1:
+ _ExceptionContext.clear(self)
+ self._storage.clear()
+ return 0
+
+
+cdef _initResolverContext(_ResolverContext context,
+ _ResolverRegistry resolvers):
+ if resolvers is None:
+ context._resolvers = _ResolverRegistry()
+ else:
+ context._resolvers = resolvers
+ context._storage = _TempStore()
diff --git a/src/lxml/doctestcompare.py b/src/lxml/doctestcompare.py
new file mode 100644
index 0000000..1b0daa4
--- /dev/null
+++ b/src/lxml/doctestcompare.py
@@ -0,0 +1,507 @@
+"""
+lxml-based doctest output comparison.
+
+Note: normally, you should just import the `lxml.usedoctest` and
+`lxml.html.usedoctest` modules from within a doctest, instead of this
+one::
+
+ >>> import lxml.usedoctest # for XML output
+
+ >>> import lxml.html.usedoctest # for HTML output
+
+To use this module directly, you must call ``lxml.doctestcompare.install()``,
+which will cause doctest to use this in all subsequent calls.
+
+This changes the way output is checked and comparisons are made for
+XML or HTML-like content.
+
+XML or HTML content is noticed because the example starts with ``<``
+(it's HTML if it starts with ``<html``). You can also use the
+``PARSE_HTML`` and ``PARSE_XML`` flags to force parsing.
+
+Some rough wildcard-like things are allowed. Whitespace is generally
+ignored (except in attributes). In text (attributes and text in the
+body) you can use ``...`` as a wildcard. In an example it also
+matches any trailing tags in the element, though it does not match
+leading tags. You may create a tag ``<any>`` or include an ``any``
+attribute in the tag. An ``any`` tag matches any tag, while the
+attribute matches any and all attributes.
+
+When a match fails, the reformatted example and gotten text is
+displayed (indented), and a rough diff-like output is given. Anything
+marked with ``+`` is in the output but wasn't supposed to be, and
+similarly ``-`` means its in the example but wasn't in the output.
+
+You can disable parsing on one line with ``# doctest:+NOPARSE_MARKUP``
+"""
+
+from lxml import etree
+import sys
+import re
+import doctest
+try:
+ from html import escape as html_escape
+except ImportError:
+ from cgi import escape as html_escape
+
+__all__ = ['PARSE_HTML', 'PARSE_XML', 'NOPARSE_MARKUP', 'LXMLOutputChecker',
+ 'LHTMLOutputChecker', 'install', 'temp_install']
+
+try:
+ _basestring = basestring
+except NameError:
+ _basestring = (str, bytes)
+
+_IS_PYTHON_3 = sys.version_info[0] >= 3
+
+PARSE_HTML = doctest.register_optionflag('PARSE_HTML')
+PARSE_XML = doctest.register_optionflag('PARSE_XML')
+NOPARSE_MARKUP = doctest.register_optionflag('NOPARSE_MARKUP')
+
+OutputChecker = doctest.OutputChecker
+
+def strip(v):
+ if v is None:
+ return None
+ else:
+ return v.strip()
+
+def norm_whitespace(v):
+ return _norm_whitespace_re.sub(' ', v)
+
+_html_parser = etree.HTMLParser(recover=False, remove_blank_text=True)
+
+def html_fromstring(html):
+ return etree.fromstring(html, _html_parser)
+
+# We use this to distinguish repr()s from elements:
+_repr_re = re.compile(r'^<[^>]+ (at|object) ')
+_norm_whitespace_re = re.compile(r'[ \t\n][ \t\n]+')
+
+class LXMLOutputChecker(OutputChecker):
+
+ empty_tags = (
+ 'param', 'img', 'area', 'br', 'basefont', 'input',
+ 'base', 'meta', 'link', 'col')
+
+ def get_default_parser(self):
+ return etree.XML
+
+ def check_output(self, want, got, optionflags):
+ alt_self = getattr(self, '_temp_override_self', None)
+ if alt_self is not None:
+ super_method = self._temp_call_super_check_output
+ self = alt_self
+ else:
+ super_method = OutputChecker.check_output
+ parser = self.get_parser(want, got, optionflags)
+ if not parser:
+ return super_method(
+ self, want, got, optionflags)
+ try:
+ want_doc = parser(want)
+ except etree.XMLSyntaxError:
+ return False
+ try:
+ got_doc = parser(got)
+ except etree.XMLSyntaxError:
+ return False
+ return self.compare_docs(want_doc, got_doc)
+
+ def get_parser(self, want, got, optionflags):
+ parser = None
+ if NOPARSE_MARKUP & optionflags:
+ return None
+ if PARSE_HTML & optionflags:
+ parser = html_fromstring
+ elif PARSE_XML & optionflags:
+ parser = etree.XML
+ elif (want.strip().lower().startswith('<html')
+              and got.strip().lower().startswith('<html')):
+ parser = html_fromstring
+ elif (self._looks_like_markup(want)
+ and self._looks_like_markup(got)):
+ parser = self.get_default_parser()
+ return parser
+
+ def _looks_like_markup(self, s):
+ s = s.strip()
+ return (s.startswith('<')
+ and not _repr_re.search(s))
+
+ def compare_docs(self, want, got):
+ if not self.tag_compare(want.tag, got.tag):
+ return False
+ if not self.text_compare(want.text, got.text, True):
+ return False
+ if not self.text_compare(want.tail, got.tail, True):
+ return False
+ if 'any' not in want.attrib:
+ want_keys = sorted(want.attrib.keys())
+ got_keys = sorted(got.attrib.keys())
+ if want_keys != got_keys:
+ return False
+ for key in want_keys:
+ if not self.text_compare(want.attrib[key], got.attrib[key], False):
+ return False
+ if want.text != '...' or len(want):
+ want_children = list(want)
+ got_children = list(got)
+ while want_children or got_children:
+ if not want_children or not got_children:
+ return False
+ want_first = want_children.pop(0)
+ got_first = got_children.pop(0)
+ if not self.compare_docs(want_first, got_first):
+ return False
+ if not got_children and want_first.tail == '...':
+ break
+ return True
+
+ def text_compare(self, want, got, strip):
+ want = want or ''
+ got = got or ''
+ if strip:
+ want = norm_whitespace(want).strip()
+ got = norm_whitespace(got).strip()
+ want = '^%s$' % re.escape(want)
+ want = want.replace(r'\.\.\.', '.*')
+ if re.search(want, got):
+ return True
+ else:
+ return False
+
+ def tag_compare(self, want, got):
+ if want == 'any':
+ return True
+ if (not isinstance(want, _basestring)
+ or not isinstance(got, _basestring)):
+ return want == got
+ want = want or ''
+ got = got or ''
+ if want.startswith('{...}'):
+ # Ellipsis on the namespace
+ return want.split('}')[-1] == got.split('}')[-1]
+ else:
+ return want == got
+
+ def output_difference(self, example, got, optionflags):
+ want = example.want
+ parser = self.get_parser(want, got, optionflags)
+ errors = []
+ if parser is not None:
+ try:
+ want_doc = parser(want)
+ except etree.XMLSyntaxError:
+ e = sys.exc_info()[1]
+ errors.append('In example: %s' % e)
+ try:
+ got_doc = parser(got)
+ except etree.XMLSyntaxError:
+ e = sys.exc_info()[1]
+ errors.append('In actual output: %s' % e)
+ if parser is None or errors:
+ value = OutputChecker.output_difference(
+ self, example, got, optionflags)
+ if errors:
+ errors.append(value)
+ return '\n'.join(errors)
+ else:
+ return value
+ html = parser is html_fromstring
+ diff_parts = ['Expected:',
+ self.format_doc(want_doc, html, 2),
+ 'Got:',
+ self.format_doc(got_doc, html, 2),
+ 'Diff:',
+ self.collect_diff(want_doc, got_doc, html, 2)]
+ return '\n'.join(diff_parts)
+
+ def html_empty_tag(self, el, html=True):
+ if not html:
+ return False
+ if el.tag not in self.empty_tags:
+ return False
+ if el.text or len(el):
+ # This shouldn't happen (contents in an empty tag)
+ return False
+ return True
+
+ def format_doc(self, doc, html, indent, prefix=''):
+ parts = []
+ if not len(doc):
+ # No children...
+ parts.append(' '*indent)
+ parts.append(prefix)
+ parts.append(self.format_tag(doc))
+ if not self.html_empty_tag(doc, html):
+ if strip(doc.text):
+ parts.append(self.format_text(doc.text))
+ parts.append(self.format_end_tag(doc))
+ if strip(doc.tail):
+ parts.append(self.format_text(doc.tail))
+ parts.append('\n')
+ return ''.join(parts)
+ parts.append(' '*indent)
+ parts.append(prefix)
+ parts.append(self.format_tag(doc))
+ if not self.html_empty_tag(doc, html):
+ parts.append('\n')
+ if strip(doc.text):
+ parts.append(' '*indent)
+ parts.append(self.format_text(doc.text))
+ parts.append('\n')
+ for el in doc:
+ parts.append(self.format_doc(el, html, indent+2))
+ parts.append(' '*indent)
+ parts.append(self.format_end_tag(doc))
+ parts.append('\n')
+ if strip(doc.tail):
+ parts.append(' '*indent)
+ parts.append(self.format_text(doc.tail))
+ parts.append('\n')
+ return ''.join(parts)
+
+ def format_text(self, text, strip=True):
+ if text is None:
+ return ''
+ if strip:
+ text = text.strip()
+ return html_escape(text, 1)
+
+ def format_tag(self, el):
+ attrs = []
+ if isinstance(el, etree.CommentBase):
+ # FIXME: probably PIs should be handled specially too?
+ return '<!--'
+ for name, value in sorted(el.attrib.items()):
+ attrs.append('%s="%s"' % (name, self.format_text(value, False)))
+ if not attrs:
+ return '<%s>' % el.tag
+ return '<%s %s>' % (el.tag, ' '.join(attrs))
+
+ def format_end_tag(self, el):
+ if isinstance(el, etree.CommentBase):
+ # FIXME: probably PIs should be handled specially too?
+ return '-->'
+ return '</%s>' % el.tag
+
+ def collect_diff(self, want, got, html, indent):
+ parts = []
+ if not len(want) and not len(got):
+ parts.append(' '*indent)
+ parts.append(self.collect_diff_tag(want, got))
+ if not self.html_empty_tag(got, html):
+ parts.append(self.collect_diff_text(want.text, got.text))
+ parts.append(self.collect_diff_end_tag(want, got))
+ parts.append(self.collect_diff_text(want.tail, got.tail))
+ parts.append('\n')
+ return ''.join(parts)
+ parts.append(' '*indent)
+ parts.append(self.collect_diff_tag(want, got))
+ parts.append('\n')
+ if strip(want.text) or strip(got.text):
+ parts.append(' '*indent)
+ parts.append(self.collect_diff_text(want.text, got.text))
+ parts.append('\n')
+ want_children = list(want)
+ got_children = list(got)
+ while want_children or got_children:
+ if not want_children:
+ parts.append(self.format_doc(got_children.pop(0), html, indent+2, '+'))
+ continue
+ if not got_children:
+ parts.append(self.format_doc(want_children.pop(0), html, indent+2, '-'))
+ continue
+ parts.append(self.collect_diff(
+ want_children.pop(0), got_children.pop(0), html, indent+2))
+ parts.append(' '*indent)
+ parts.append(self.collect_diff_end_tag(want, got))
+ parts.append('\n')
+ if strip(want.tail) or strip(got.tail):
+ parts.append(' '*indent)
+ parts.append(self.collect_diff_text(want.tail, got.tail))
+ parts.append('\n')
+ return ''.join(parts)
+
+ def collect_diff_tag(self, want, got):
+ if not self.tag_compare(want.tag, got.tag):
+ tag = '%s (got: %s)' % (want.tag, got.tag)
+ else:
+ tag = got.tag
+ attrs = []
+ any = want.tag == 'any' or 'any' in want.attrib
+ for name, value in sorted(got.attrib.items()):
+ if name not in want.attrib and not any:
+ attrs.append('+%s="%s"' % (name, self.format_text(value, False)))
+ else:
+ if name in want.attrib:
+ text = self.collect_diff_text(want.attrib[name], value, False)
+ else:
+ text = self.format_text(value, False)
+ attrs.append('%s="%s"' % (name, text))
+ if not any:
+ for name, value in sorted(want.attrib.items()):
+ if name in got.attrib:
+ continue
+ attrs.append('-%s="%s"' % (name, self.format_text(value, False)))
+ if attrs:
+ tag = '<%s %s>' % (tag, ' '.join(attrs))
+ else:
+ tag = '<%s>' % tag
+ return tag
+
+ def collect_diff_end_tag(self, want, got):
+ if want.tag != got.tag:
+ tag = '%s (got: %s)' % (want.tag, got.tag)
+ else:
+ tag = got.tag
+ return '</%s>' % tag
+
+ def collect_diff_text(self, want, got, strip=True):
+ if self.text_compare(want, got, strip):
+ if not got:
+ return ''
+ return self.format_text(got, strip)
+ text = '%s (got: %s)' % (want, got)
+ return self.format_text(text, strip)
+
+class LHTMLOutputChecker(LXMLOutputChecker):
+ def get_default_parser(self):
+ return html_fromstring
+
+def install(html=False):
+ """
+ Install doctestcompare for all future doctests.
+
+ If html is true, then by default the HTML parser will be used;
+ otherwise the XML parser is used.
+ """
+ if html:
+ doctest.OutputChecker = LHTMLOutputChecker
+ else:
+ doctest.OutputChecker = LXMLOutputChecker
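+
+# A short usage sketch (``make_xml()`` is a made-up helper; the ``...``
+# wildcards are the ones described in the module docstring above):
+#
+#     import lxml.doctestcompare
+#     lxml.doctestcompare.install()
+#
+#     # In any doctest run afterwards, output is compared structurally:
+#     #   >>> print(make_xml())
+#     #   <root attr="...">
+#     #     <child>...</child>
+#     #   </root>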
+
+def temp_install(html=False, del_module=None):
+ """
+ Use this *inside* a doctest to enable this checker for this
+ doctest only.
+
+ If html is true, then by default the HTML parser will be used;
+ otherwise the XML parser is used.
+ """
+ if html:
+ Checker = LHTMLOutputChecker
+ else:
+ Checker = LXMLOutputChecker
+ frame = _find_doctest_frame()
+ dt_self = frame.f_locals['self']
+ checker = Checker()
+ old_checker = dt_self._checker
+ dt_self._checker = checker
+ # The unfortunate thing is that there is a local variable 'check'
+ # in the function that runs the doctests, that is a bound method
+ # into the output checker. We have to update that. We can't
+ # modify the frame, so we have to modify the object in place. The
+ # only way to do this is to actually change the func_code
+ # attribute of the method. We change it, and then wait for
+ # __record_outcome to be run, which signals the end of the __run
+ # method, at which point we restore the previous check_output
+ # implementation.
+ if _IS_PYTHON_3:
+ check_func = frame.f_locals['check'].__func__
+ checker_check_func = checker.check_output.__func__
+ else:
+ check_func = frame.f_locals['check'].im_func
+ checker_check_func = checker.check_output.im_func
+ # Because we can't patch up func_globals, this is the only global
+ # in check_output that we care about:
+ doctest.etree = etree
+ _RestoreChecker(dt_self, old_checker, checker,
+ check_func, checker_check_func,
+ del_module)
+
+class _RestoreChecker(object):
+ def __init__(self, dt_self, old_checker, new_checker, check_func, clone_func,
+ del_module):
+ self.dt_self = dt_self
+ self.checker = old_checker
+ self.checker._temp_call_super_check_output = self.call_super
+ self.checker._temp_override_self = new_checker
+ self.check_func = check_func
+ self.clone_func = clone_func
+ self.del_module = del_module
+ self.install_clone()
+ self.install_dt_self()
+ def install_clone(self):
+ if _IS_PYTHON_3:
+ self.func_code = self.check_func.__code__
+ self.func_globals = self.check_func.__globals__
+ self.check_func.__code__ = self.clone_func.__code__
+ else:
+ self.func_code = self.check_func.func_code
+ self.func_globals = self.check_func.func_globals
+ self.check_func.func_code = self.clone_func.func_code
+ def uninstall_clone(self):
+ if _IS_PYTHON_3:
+ self.check_func.__code__ = self.func_code
+ else:
+ self.check_func.func_code = self.func_code
+ def install_dt_self(self):
+ self.prev_func = self.dt_self._DocTestRunner__record_outcome
+ self.dt_self._DocTestRunner__record_outcome = self
+ def uninstall_dt_self(self):
+ self.dt_self._DocTestRunner__record_outcome = self.prev_func
+ def uninstall_module(self):
+ if self.del_module:
+ import sys
+ del sys.modules[self.del_module]
+ if '.' in self.del_module:
+ package, module = self.del_module.rsplit('.', 1)
+ package_mod = sys.modules[package]
+ delattr(package_mod, module)
+ def __call__(self, *args, **kw):
+ self.uninstall_clone()
+ self.uninstall_dt_self()
+ del self.checker._temp_override_self
+ del self.checker._temp_call_super_check_output
+ result = self.prev_func(*args, **kw)
+ self.uninstall_module()
+ return result
+ def call_super(self, *args, **kw):
+ self.uninstall_clone()
+ try:
+ return self.check_func(*args, **kw)
+ finally:
+ self.install_clone()
+
+def _find_doctest_frame():
+ import sys
+ frame = sys._getframe(1)
+ while frame:
+ l = frame.f_locals
+ if 'BOOM' in l:
+ # Sign of doctest
+ return frame
+ frame = frame.f_back
+ raise LookupError(
+ "Could not find doctest (only use this function *inside* a doctest)")
+
+__test__ = {
+ 'basic': '''
+ >>> temp_install()
+ >>> print """<xml a="1" b="2">stuff</xml>"""
+ <xml b="2" a="1">...</xml>
+ >>> print """<xml xmlns="http://example.com"><tag attr="bar" /></xml>"""
+ <xml xmlns="...">
+ <tag attr="..." />
+ </xml>
+ >>> print """<xml>blahblahblah<foo /></xml>""" # doctest: +NOPARSE_MARKUP, +ELLIPSIS
+ <xml>...foo /></xml>
+ '''}
+
+if __name__ == '__main__':
+ import doctest
+ doctest.testmod()
+
+
diff --git a/src/lxml/dtd.pxi b/src/lxml/dtd.pxi
new file mode 100644
index 0000000..5dcb80c
--- /dev/null
+++ b/src/lxml/dtd.pxi
@@ -0,0 +1,472 @@
+# support for DTD validation
+from lxml.includes cimport dtdvalid
+
+cdef class DTDError(LxmlError):
+ """Base class for DTD errors.
+ """
+
+cdef class DTDParseError(DTDError):
+ """Error while parsing a DTD.
+ """
+
+cdef class DTDValidateError(DTDError):
+ """Error while validating an XML document with a DTD.
+ """
+
+
+cdef inline int _assertValidDTDNode(node, void *c_node) except -1:
+ assert c_node is not NULL, u"invalid DTD proxy at %s" % id(node)
+
+
+@cython.final
+@cython.internal
+@cython.freelist(8)
+cdef class _DTDElementContentDecl:
+ cdef DTD _dtd
+ cdef tree.xmlElementContent* _c_node
+
+ def __repr__(self):
+ return "<%s.%s object name=%r type=%r occur=%r at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, self.type, self.occur, id(self))
+
+ @property
+ def name(self):
+ _assertValidDTDNode(self, self._c_node)
+ return funicodeOrNone(self._c_node.name)
+
+ @property
+ def type(self):
+ _assertValidDTDNode(self, self._c_node)
+ cdef int type = self._c_node.type
+ if type == tree.XML_ELEMENT_CONTENT_PCDATA:
+ return "pcdata"
+ elif type == tree.XML_ELEMENT_CONTENT_ELEMENT:
+ return "element"
+ elif type == tree.XML_ELEMENT_CONTENT_SEQ:
+ return "seq"
+ elif type == tree.XML_ELEMENT_CONTENT_OR:
+ return "or"
+ else:
+ return None
+
+ @property
+ def occur(self):
+ _assertValidDTDNode(self, self._c_node)
+ cdef int occur = self._c_node.ocur
+ if occur == tree.XML_ELEMENT_CONTENT_ONCE:
+ return "once"
+ elif occur == tree.XML_ELEMENT_CONTENT_OPT:
+ return "opt"
+ elif occur == tree.XML_ELEMENT_CONTENT_MULT:
+ return "mult"
+ elif occur == tree.XML_ELEMENT_CONTENT_PLUS:
+ return "plus"
+ else:
+ return None
+
+ @property
+ def left(self):
+ _assertValidDTDNode(self, self._c_node)
+ c1 = self._c_node.c1
+ if c1:
+ node = <_DTDElementContentDecl>_DTDElementContentDecl.__new__(_DTDElementContentDecl)
+ node._dtd = self._dtd
+ node._c_node = <tree.xmlElementContent*>c1
+ return node
+ else:
+ return None
+
+ @property
+ def right(self):
+ _assertValidDTDNode(self, self._c_node)
+ c2 = self._c_node.c2
+ if c2:
+ node = <_DTDElementContentDecl>_DTDElementContentDecl.__new__(_DTDElementContentDecl)
+ node._dtd = self._dtd
+ node._c_node = <tree.xmlElementContent*>c2
+ return node
+ else:
+ return None
+
+
+@cython.final
+@cython.internal
+@cython.freelist(8)
+cdef class _DTDAttributeDecl:
+ cdef DTD _dtd
+ cdef tree.xmlAttribute* _c_node
+
+ def __repr__(self):
+ return "<%s.%s object name=%r elemname=%r prefix=%r type=%r default=%r default_value=%r at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, self.elemname, self.prefix, self.type, self.default, self.default_value, id(self))
+
+ @property
+ def name(self):
+ _assertValidDTDNode(self, self._c_node)
+ return funicodeOrNone(self._c_node.name)
+
+ @property
+ def elemname(self):
+ _assertValidDTDNode(self, self._c_node)
+ return funicodeOrNone(self._c_node.elem)
+
+ @property
+ def prefix(self):
+ _assertValidDTDNode(self, self._c_node)
+ return funicodeOrNone(self._c_node.prefix)
+
+ @property
+ def type(self):
+ _assertValidDTDNode(self, self._c_node)
+ cdef int type = self._c_node.atype
+ if type == tree.XML_ATTRIBUTE_CDATA:
+ return "cdata"
+ elif type == tree.XML_ATTRIBUTE_ID:
+ return "id"
+ elif type == tree.XML_ATTRIBUTE_IDREF:
+ return "idref"
+ elif type == tree.XML_ATTRIBUTE_IDREFS:
+ return "idrefs"
+ elif type == tree.XML_ATTRIBUTE_ENTITY:
+ return "entity"
+ elif type == tree.XML_ATTRIBUTE_ENTITIES:
+ return "entities"
+ elif type == tree.XML_ATTRIBUTE_NMTOKEN:
+ return "nmtoken"
+ elif type == tree.XML_ATTRIBUTE_NMTOKENS:
+ return "nmtokens"
+ elif type == tree.XML_ATTRIBUTE_ENUMERATION:
+ return "enumeration"
+ elif type == tree.XML_ATTRIBUTE_NOTATION:
+ return "notation"
+ else:
+ return None
+
+ @property
+ def default(self):
+ _assertValidDTDNode(self, self._c_node)
+ cdef int default = self._c_node.def_
+ if default == tree.XML_ATTRIBUTE_NONE:
+ return "none"
+ elif default == tree.XML_ATTRIBUTE_REQUIRED:
+ return "required"
+ elif default == tree.XML_ATTRIBUTE_IMPLIED:
+ return "implied"
+ elif default == tree.XML_ATTRIBUTE_FIXED:
+ return "fixed"
+ else:
+ return None
+
+ @property
+ def default_value(self):
+ _assertValidDTDNode(self, self._c_node)
+ return funicodeOrNone(self._c_node.defaultValue)
+
+ def itervalues(self):
+ _assertValidDTDNode(self, self._c_node)
+ cdef tree.xmlEnumeration *c_node = self._c_node.tree
+ while c_node is not NULL:
+ yield funicode(c_node.name)
+ c_node = c_node.next
+
+ def values(self):
+ return list(self.itervalues())
+
+
+@cython.final
+@cython.internal
+@cython.freelist(8)
+cdef class _DTDElementDecl:
+ cdef DTD _dtd
+ cdef tree.xmlElement* _c_node
+
+ def __repr__(self):
+ return "<%s.%s object name=%r prefix=%r type=%r at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, self.prefix, self.type, id(self))
+
+ @property
+ def name(self):
+ _assertValidDTDNode(self, self._c_node)
+ return funicodeOrNone(self._c_node.name)
+
+ @property
+ def prefix(self):
+ _assertValidDTDNode(self, self._c_node)
+ return funicodeOrNone(self._c_node.prefix)
+
+ @property
+ def type(self):
+ _assertValidDTDNode(self, self._c_node)
+ cdef int type = self._c_node.etype
+ if type == tree.XML_ELEMENT_TYPE_UNDEFINED:
+ return "undefined"
+ elif type == tree.XML_ELEMENT_TYPE_EMPTY:
+ return "empty"
+ elif type == tree.XML_ELEMENT_TYPE_ANY:
+ return "any"
+ elif type == tree.XML_ELEMENT_TYPE_MIXED:
+ return "mixed"
+ elif type == tree.XML_ELEMENT_TYPE_ELEMENT:
+ return "element"
+ else:
+ return None
+
+ @property
+ def content(self):
+ _assertValidDTDNode(self, self._c_node)
+ cdef tree.xmlElementContent *content = self._c_node.content
+ if content:
+ node = <_DTDElementContentDecl>_DTDElementContentDecl.__new__(_DTDElementContentDecl)
+ node._dtd = self._dtd
+ node._c_node = content
+ return node
+ else:
+ return None
+
+ def iterattributes(self):
+ _assertValidDTDNode(self, self._c_node)
+ cdef tree.xmlAttribute *c_node = self._c_node.attributes
+ while c_node:
+ node = <_DTDAttributeDecl>_DTDAttributeDecl.__new__(_DTDAttributeDecl)
+ node._dtd = self._dtd
+ node._c_node = c_node
+ yield node
+ c_node = c_node.nexth
+
+ def attributes(self):
+ return list(self.iterattributes())
+
+
+@cython.final
+@cython.internal
+@cython.freelist(8)
+cdef class _DTDEntityDecl:
+ cdef DTD _dtd
+ cdef tree.xmlEntity* _c_node
+ def __repr__(self):
+ return "<%s.%s object name=%r at 0x%x>" % (self.__class__.__module__, self.__class__.__name__, self.name, id(self))
+
+ @property
+ def name(self):
+ _assertValidDTDNode(self, self._c_node)
+ return funicodeOrNone(self._c_node.name)
+
+ @property
+ def orig(self):
+ _assertValidDTDNode(self, self._c_node)
+ return funicodeOrNone(self._c_node.orig)
+
+ @property
+ def content(self):
+ _assertValidDTDNode(self, self._c_node)
+ return funicodeOrNone(self._c_node.content)
+
+
+################################################################################
+# DTD
+
+cdef class DTD(_Validator):
+ u"""DTD(self, file=None, external_id=None)
+ A DTD validator.
+
+ Can be loaded directly from the filesystem when given a filename or
+ file-like object. Alternatively, pass the keyword parameter
+ ``external_id`` to load the DTD from a catalog.
+ """
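+ # Editor's sketch of typical usage (the file names are hypothetical):
+ #
+ #     from lxml import etree
+ #     dtd = etree.DTD('example.dtd')          # or a file-like object
+ #     tree = etree.parse('example.xml')
+ #     dtd.validate(tree)      # -> True/False, equivalent to dtd(tree)
+ #     dtd.assertValid(tree)   # raises etree.DocumentInvalid on failure
+ #     print(dtd.error_log)    # collected validation errors
+ #
+ #     # introspection of the parsed DTD:
+ #     for decl in dtd.iterelements():
+ #         print(decl.name, decl.type, [a.name for a in decl.iterattributes()])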
+ cdef tree.xmlDtd* _c_dtd
+ def __init__(self, file=None, *, external_id=None):
+ _Validator.__init__(self)
+ if file is not None:
+ if _isString(file):
+ file = _encodeFilename(file)
+ with self._error_log:
+ orig_loader = _register_document_loader()
+ self._c_dtd = xmlparser.xmlParseDTD(NULL, _xcstr(file))
+ _reset_document_loader(orig_loader)
+ elif hasattr(file, 'read'):
+ orig_loader = _register_document_loader()
+ self._c_dtd = _parseDtdFromFilelike(file)
+ _reset_document_loader(orig_loader)
+ else:
+ raise DTDParseError, u"file must be a filename or file-like object"
+ elif external_id is not None:
+ with self._error_log:
+ orig_loader = _register_document_loader()
+ self._c_dtd = xmlparser.xmlParseDTD(<const_xmlChar*>external_id, NULL)
+ _reset_document_loader(orig_loader)
+ else:
+ raise DTDParseError, u"either filename or external ID required"
+
+ if self._c_dtd is NULL:
+ raise DTDParseError(
+ self._error_log._buildExceptionMessage(u"error parsing DTD"),
+ self._error_log)
+
+ @property
+ def name(self):
+ if self._c_dtd is NULL:
+ return None
+ return funicodeOrNone(self._c_dtd.name)
+
+ @property
+ def external_id(self):
+ if self._c_dtd is NULL:
+ return None
+ return funicodeOrNone(self._c_dtd.ExternalID)
+
+ @property
+ def system_url(self):
+ if self._c_dtd is NULL:
+ return None
+ return funicodeOrNone(self._c_dtd.SystemID)
+
+ def iterelements(self):
+ cdef tree.xmlNode *c_node = self._c_dtd.children if self._c_dtd is not NULL else NULL
+ while c_node is not NULL:
+ if c_node.type == tree.XML_ELEMENT_DECL:
+ node = _DTDElementDecl()
+ node._dtd = self
+ node._c_node = <tree.xmlElement*>c_node
+ yield node
+ c_node = c_node.next
+
+ def elements(self):
+ return list(self.iterelements())
+
+ def iterentities(self):
+ cdef tree.xmlNode *c_node = self._c_dtd.children if self._c_dtd is not NULL else NULL
+ while c_node is not NULL:
+ if c_node.type == tree.XML_ENTITY_DECL:
+ node = _DTDEntityDecl()
+ node._dtd = self
+ node._c_node = <tree.xmlEntity*>c_node
+ yield node
+ c_node = c_node.next
+
+ def entities(self):
+ return list(self.iterentities())
+
+ def __dealloc__(self):
+ tree.xmlFreeDtd(self._c_dtd)
+
+ def __call__(self, etree):
+ u"""__call__(self, etree)
+
+ Validate doc using the DTD.
+
+ Returns true if the document is valid, false if not.
+ """
+ cdef _Document doc
+ cdef _Element root_node
+ cdef xmlDoc* c_doc
+ cdef dtdvalid.xmlValidCtxt* valid_ctxt
+ cdef int ret = -1
+
+ assert self._c_dtd is not NULL, "DTD not initialised"
+ doc = _documentOrRaise(etree)
+ root_node = _rootNodeOrRaise(etree)
+
+ valid_ctxt = dtdvalid.xmlNewValidCtxt()
+ if valid_ctxt is NULL:
+ raise DTDError(u"Failed to create validation context")
+
+ # work around error reporting bug in libxml2 <= 2.9.1 (and later?)
+ # https://bugzilla.gnome.org/show_bug.cgi?id=724903
+ valid_ctxt.error = <dtdvalid.xmlValidityErrorFunc>_nullGenericErrorFunc
+ valid_ctxt.userData = NULL
+
+ try:
+ with self._error_log:
+ c_doc = _fakeRootDoc(doc._c_doc, root_node._c_node)
+ ret = dtdvalid.xmlValidateDtd(valid_ctxt, c_doc, self._c_dtd)
+ _destroyFakeDoc(doc._c_doc, c_doc)
+ finally:
+ dtdvalid.xmlFreeValidCtxt(valid_ctxt)
+
+ if ret == -1:
+ raise DTDValidateError(u"Internal error in DTD validation",
+ self._error_log)
+ return ret == 1
+
+
+cdef tree.xmlDtd* _parseDtdFromFilelike(file) except NULL:
+ cdef _ExceptionContext exc_context
+ cdef _FileReaderContext dtd_parser
+ cdef _ErrorLog error_log
+ cdef tree.xmlDtd* c_dtd = NULL
+ exc_context = _ExceptionContext()
+ dtd_parser = _FileReaderContext(file, exc_context, None)
+ error_log = _ErrorLog()
+
+ with error_log:
+ c_dtd = dtd_parser._readDtd()
+
+ exc_context._raise_if_stored()
+ if c_dtd is NULL:
+ raise DTDParseError(u"error parsing DTD", error_log)
+ return c_dtd
+
+cdef DTD _dtdFactory(tree.xmlDtd* c_dtd):
+ # do not run through DTD.__init__()!
+ cdef DTD dtd
+ if c_dtd is NULL:
+ return None
+ dtd = DTD.__new__(DTD)
+ dtd._c_dtd = _copyDtd(c_dtd)
+ _Validator.__init__(dtd)
+ return dtd
+
+
+cdef tree.xmlDtd* _copyDtd(tree.xmlDtd* c_orig_dtd) except NULL:
+ """
+ Copy a DTD. libxml2 (currently) fails to set up the element->attributes
+ links when copying DTDs, so we have to rebuild them here.
+ """
+ c_dtd = tree.xmlCopyDtd(c_orig_dtd)
+ if not c_dtd:
+ raise MemoryError
+ cdef tree.xmlNode* c_node = c_dtd.children
+ while c_node:
+ if c_node.type == tree.XML_ATTRIBUTE_DECL:
+ _linkDtdAttribute(c_dtd, <tree.xmlAttribute*>c_node)
+ c_node = c_node.next
+ return c_dtd
+
+
+cdef void _linkDtdAttribute(tree.xmlDtd* c_dtd, tree.xmlAttribute* c_attr):
+ """
+ Create the link to the DTD attribute declaration from the corresponding
+ element declaration.
+ """
+ c_elem = dtdvalid.xmlGetDtdElementDesc(c_dtd, c_attr.elem)
+ if not c_elem:
+ # no such element? something is wrong with the DTD ...
+ return
+ c_pos = c_elem.attributes
+ if not c_pos:
+ c_elem.attributes = c_attr
+ c_attr.nexth = NULL
+ return
+ # libxml2 keeps namespace declarations first, and we need to make
+ # sure we don't re-insert attributes that are already there
+ if _isDtdNsDecl(c_attr):
+ if not _isDtdNsDecl(c_pos):
+ c_elem.attributes = c_attr
+ c_attr.nexth = c_pos
+ return
+ while c_pos != c_attr and c_pos.nexth and _isDtdNsDecl(c_pos.nexth):
+ c_pos = c_pos.nexth
+ else:
+ # append at end
+ while c_pos != c_attr and c_pos.nexth:
+ c_pos = c_pos.nexth
+ if c_pos == c_attr:
+ return
+ c_attr.nexth = c_pos.nexth
+ c_pos.nexth = c_attr
+
+
+cdef bint _isDtdNsDecl(tree.xmlAttribute* c_attr):
+ if cstring_h.strcmp(<const_char*>c_attr.name, "xmlns") == 0:
+ return True
+ if (c_attr.prefix is not NULL and
+ cstring_h.strcmp(<const_char*>c_attr.prefix, "xmlns") == 0):
+ return True
+ return False
diff --git a/src/lxml/etree.pyx b/src/lxml/etree.pyx
new file mode 100644
index 0000000..b446754
--- /dev/null
+++ b/src/lxml/etree.pyx
@@ -0,0 +1,3663 @@
+# cython: binding=True
+# cython: auto_pickle=False
+# cython: language_level=2
+
+"""
+The ``lxml.etree`` module implements the extended ElementTree API for XML.
+"""
+
+from __future__ import absolute_import
+
+__docformat__ = u"restructuredtext en"
+
+__all__ = [
+ 'AttributeBasedElementClassLookup', 'C14NError', 'C14NWriterTarget', 'CDATA',
+ 'Comment', 'CommentBase', 'CustomElementClassLookup', 'DEBUG',
+ 'DTD', 'DTDError', 'DTDParseError', 'DTDValidateError',
+ 'DocumentInvalid', 'ETCompatXMLParser', 'ETXPath', 'Element',
+ 'ElementBase', 'ElementClassLookup', 'ElementDefaultClassLookup',
+ 'ElementNamespaceClassLookup', 'ElementTree', 'Entity', 'EntityBase',
+ 'Error', 'ErrorDomains', 'ErrorLevels', 'ErrorTypes', 'Extension',
+ 'FallbackElementClassLookup', 'FunctionNamespace', 'HTML',
+ 'HTMLParser', 'LIBXML_COMPILED_VERSION', 'LIBXML_VERSION',
+ 'LIBXSLT_COMPILED_VERSION', 'LIBXSLT_VERSION', 'LXML_VERSION',
+ 'LxmlError', 'LxmlRegistryError', 'LxmlSyntaxError',
+ 'NamespaceRegistryError', 'PI', 'PIBase', 'ParseError',
+ 'ParserBasedElementClassLookup', 'ParserError', 'ProcessingInstruction',
+ 'PyErrorLog', 'PythonElementClassLookup', 'QName', 'RelaxNG',
+ 'RelaxNGError', 'RelaxNGErrorTypes', 'RelaxNGParseError',
+ 'RelaxNGValidateError', 'Resolver', 'Schematron', 'SchematronError',
+ 'SchematronParseError', 'SchematronValidateError', 'SerialisationError',
+ 'SubElement', 'TreeBuilder', 'XInclude', 'XIncludeError', 'XML',
+ 'XMLDTDID', 'XMLID', 'XMLParser', 'XMLSchema', 'XMLSchemaError',
+ 'XMLSchemaParseError', 'XMLSchemaValidateError', 'XMLSyntaxError',
+ 'XMLTreeBuilder', 'XPath', 'XPathDocumentEvaluator', 'XPathError',
+ 'XPathEvalError', 'XPathEvaluator', 'XPathFunctionError', 'XPathResultError',
+ 'XPathSyntaxError', 'XSLT', 'XSLTAccessControl', 'XSLTApplyError',
+ 'XSLTError', 'XSLTExtension', 'XSLTExtensionError', 'XSLTParseError',
+ 'XSLTSaveError', 'canonicalize',
+ 'cleanup_namespaces', 'clear_error_log', 'dump',
+ 'fromstring', 'fromstringlist', 'get_default_parser', 'iselement',
+ 'iterparse', 'iterwalk', 'parse', 'parseid', 'register_namespace',
+ 'set_default_parser', 'set_element_class_lookup', 'strip_attributes',
+ 'strip_elements', 'strip_tags', 'tostring', 'tostringlist', 'tounicode',
+ 'use_global_python_log'
+ ]
+
+cimport cython
+
+from lxml cimport python
+from lxml.includes cimport tree, config
+from lxml.includes.tree cimport xmlDoc, xmlNode, xmlAttr, xmlNs, _isElement, _getNs
+from lxml.includes.tree cimport const_xmlChar, xmlChar, _xcstr
+from lxml.python cimport _cstr, _isString
+from lxml.includes cimport xpath
+from lxml.includes cimport c14n
+
+# Cython's standard declarations
+cimport cpython.mem
+cimport cpython.ref
+from libc cimport limits, stdio, stdlib
+from libc cimport string as cstring_h # not to be confused with stdlib 'string'
+from libc.string cimport const_char
+
+cdef object os_path_abspath
+from os.path import abspath as os_path_abspath
+
+cdef object BytesIO, StringIO
+from io import BytesIO, StringIO
+
+cdef object OrderedDict
+from collections import OrderedDict
+
+cdef object _elementpath
+from lxml import _elementpath
+
+cdef object sys
+import sys
+
+cdef object re
+import re
+
+cdef object partial
+from functools import partial
+
+cdef object islice
+from itertools import islice
+
+cdef object ITER_EMPTY = iter(())
+
+try:
+ from collections.abc import MutableMapping # Py3.3+
+except ImportError:
+ from collections import MutableMapping # Py2.7
+
+class _ImmutableMapping(MutableMapping):
+ def __getitem__(self, key):
+ raise KeyError, key
+
+ def __setitem__(self, key, value):
+ raise KeyError, key
+
+ def __delitem__(self, key):
+ raise KeyError, key
+
+ def __contains__(self, key):
+ return False
+
+ def __len__(self):
+ return 0
+
+ def __iter__(self):
+ return ITER_EMPTY
+ iterkeys = itervalues = iteritems = __iter__
+
+cdef object IMMUTABLE_EMPTY_MAPPING = _ImmutableMapping()
+del MutableMapping, _ImmutableMapping
+
+
+# the rules
+# ---------
+# any libxml C argument/variable is prefixed with c_
+# any non-public function/class is prefixed with an underscore
+# instance creation is always through factories
+
+# what to do with libxml2/libxslt error messages?
+# 0 : drop
+# 1 : use log
+DEF __DEBUG = 1
+
+# maximum number of lines in the libxml2/xslt log if __DEBUG == 1
+DEF __MAX_LOG_SIZE = 100
+
+# make the compiled-in debug state publicly available
+DEBUG = __DEBUG
+
+# A struct to store a cached qualified tag name+href pair.
+# While we can borrow the c_name from the document dict,
+# PyPy requires us to store a Python reference for the
+# namespace in order to keep the byte buffer alive.
+cdef struct qname:
+ const_xmlChar* c_name
+ python.PyObject* href
+
+# global per-thread setup
+tree.xmlThrDefIndentTreeOutput(1)
+tree.xmlThrDefLineNumbersDefaultValue(1)
+
+_initThreadLogging()
+
+# initialize parser (and threading)
+xmlparser.xmlInitParser()
+
+# filename encoding
+cdef bytes _FILENAME_ENCODING = (sys.getfilesystemencoding() or sys.getdefaultencoding() or 'ascii').encode("UTF-8")
+cdef char* _C_FILENAME_ENCODING = _cstr(_FILENAME_ENCODING)
+
+# set up some default namespace prefixes
+cdef dict _DEFAULT_NAMESPACE_PREFIXES = {
+ b"http://www.w3.org/XML/1998/namespace": b'xml',
+ b"http://www.w3.org/1999/xhtml": b"html",
+ b"http://www.w3.org/1999/XSL/Transform": b"xsl",
+ b"http://www.w3.org/1999/02/22-rdf-syntax-ns#": b"rdf",
+ b"http://schemas.xmlsoap.org/wsdl/": b"wsdl",
+ # xml schema
+ b"http://www.w3.org/2001/XMLSchema": b"xs",
+ b"http://www.w3.org/2001/XMLSchema-instance": b"xsi",
+ # dublin core
+ b"http://purl.org/dc/elements/1.1/": b"dc",
+ # objectify
+ b"http://codespeak.net/lxml/objectify/pytype" : b"py",
+}
+
+cdef object _check_internal_prefix = re.compile(br"ns\d+$").match  # raw bytes avoid the invalid '\d' escape warning
+
+def register_namespace(prefix, uri):
+ u"""Registers a namespace prefix that newly created Elements in that
+ namespace will use. The registry is global, and any existing
+ mapping for either the given prefix or the namespace URI will be
+ removed.
+ """
+ prefix_utf, uri_utf = _utf8(prefix), _utf8(uri)
+ if _check_internal_prefix(prefix_utf):
+ raise ValueError("Prefix format reserved for internal use")
+ _tagValidOrRaise(prefix_utf)
+ _uriValidOrRaise(uri_utf)
+ if (uri_utf == b"http://www.w3.org/XML/1998/namespace" and prefix_utf != b'xml'
+ or prefix_utf == b'xml' and uri_utf != b"http://www.w3.org/XML/1998/namespace"):
+ raise ValueError("Cannot change the 'xml' prefix of the XML namespace")
+ for k, v in list(_DEFAULT_NAMESPACE_PREFIXES.items()):
+ if k == uri_utf or v == prefix_utf:
+ del _DEFAULT_NAMESPACE_PREFIXES[k]
+ _DEFAULT_NAMESPACE_PREFIXES[uri_utf] = prefix_utf
+
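+# Editor's illustration of register_namespace() above (the prefix is
+# hypothetical); output is roughly:
+#
+#     etree.register_namespace('dcx', 'http://purl.org/dc/elements/1.1/')
+#     el = etree.Element('{http://purl.org/dc/elements/1.1/}title')
+#     etree.tostring(el)
+#     # -> b'<dcx:title xmlns:dcx="http://purl.org/dc/elements/1.1/"/>'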
+
+# Error superclass for ElementTree compatibility
+cdef class Error(Exception):
+ pass
+
+# module level superclass for all exceptions
+cdef class LxmlError(Error):
+ """Main exception base class for lxml. All other exceptions inherit from
+ this one.
+ """
+ def __init__(self, message, error_log=None):
+ super(_Error, self).__init__(message)
+ if error_log is None:
+ self.error_log = __copyGlobalErrorLog()
+ else:
+ self.error_log = error_log.copy()
+
+cdef object _Error = Error
+
+
+# superclass for all syntax errors
+class LxmlSyntaxError(LxmlError, SyntaxError):
+ """Base class for all syntax errors.
+ """
+
+cdef class C14NError(LxmlError):
+ """Error during C14N serialisation.
+ """
+
+# version information
+cdef __unpackDottedVersion(version):
+ version_list = []
+ l = (version.decode("ascii").replace(u'-', u'.').split(u'.') + [0]*4)[:4]
+ for item in l:
+ try:
+ item = int(item)
+ except ValueError:
+ if item.startswith(u'dev'):
+ count = item[3:]
+ item = -300
+ elif item.startswith(u'alpha'):
+ count = item[5:]
+ item = -200
+ elif item.startswith(u'beta'):
+ count = item[4:]
+ item = -100
+ else:
+ count = 0
+ if count:
+ item += int(count)
+ version_list.append(item)
+ return tuple(version_list)
+
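+# Editor's illustration: __unpackDottedVersion() above turns lxml version
+# strings into sortable tuples, roughly:
+#     "4.9.2"     -> (4, 9, 2, 0)
+#     "4.9.2-1"   -> (4, 9, 2, 1)      # '-' is treated like '.'
+#     "2.3.beta1" -> (2, 3, -99, 0)    # pre-releases sort before final releases
+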
+cdef __unpackIntVersion(int c_version):
+ return (
+ ((c_version / (100*100)) % 100),
+ ((c_version / 100) % 100),
+ (c_version % 100)
+ )
+
+cdef int _LIBXML_VERSION_INT
+try:
+ _LIBXML_VERSION_INT = int(
+ re.match(u'[0-9]+', (<unsigned char*>tree.xmlParserVersion).decode("ascii")).group(0))
+except Exception:
+ print u"Unknown libxml2 version: %s" % (<unsigned char*>tree.xmlParserVersion).decode("latin1")
+ _LIBXML_VERSION_INT = 0
+
+LIBXML_VERSION = __unpackIntVersion(_LIBXML_VERSION_INT)
+LIBXML_COMPILED_VERSION = __unpackIntVersion(tree.LIBXML_VERSION)
+LXML_VERSION = __unpackDottedVersion(tree.LXML_VERSION_STRING)
+
+__version__ = tree.LXML_VERSION_STRING.decode("ascii")
+
+
+# class for temporary storage of Python references,
+# used e.g. for XPath results
+@cython.final
+@cython.internal
+cdef class _TempStore:
+ cdef list _storage
+ def __init__(self):
+ self._storage = []
+
+ cdef int add(self, obj) except -1:
+ self._storage.append(obj)
+ return 0
+
+ cdef int clear(self) except -1:
+ del self._storage[:]
+ return 0
+
+
+# class for temporarily storing exceptions raised in extensions
+@cython.internal
+cdef class _ExceptionContext:
+ cdef object _exc_info
+ cdef int clear(self) except -1:
+ self._exc_info = None
+ return 0
+
+ cdef void _store_raised(self):
+ try:
+ self._exc_info = sys.exc_info()
+ except BaseException as e:
+ self._store_exception(e)
+ finally:
+ return # and swallow any further exceptions
+
+ cdef int _store_exception(self, exception) except -1:
+ self._exc_info = (exception, None, None)
+ return 0
+
+ cdef bint _has_raised(self) except -1:
+ return self._exc_info is not None
+
+ cdef int _raise_if_stored(self) except -1:
+ if self._exc_info is None:
+ return 0
+ type, value, traceback = self._exc_info
+ self._exc_info = None
+ if value is None and traceback is None:
+ raise type
+ else:
+ raise type, value, traceback
+
+
+# type of a function that steps from node to node
+ctypedef public xmlNode* (*_node_to_node_function)(xmlNode*)
+
+
+################################################################################
+# Include submodules
+
+include "proxy.pxi" # Proxy handling (element backpointers/memory/etc.)
+include "apihelpers.pxi" # Private helper functions
+include "xmlerror.pxi" # Error and log handling
+
+
+################################################################################
+# Public Python API
+
+@cython.final
+@cython.freelist(8)
+cdef public class _Document [ type LxmlDocumentType, object LxmlDocument ]:
+ u"""Internal base class to reference a libxml document.
+
+ When instances of this class are garbage collected, the libxml
+ document is cleaned up.
+ """
+ cdef int _ns_counter
+ cdef bytes _prefix_tail
+ cdef xmlDoc* _c_doc
+ cdef _BaseParser _parser
+
+ def __dealloc__(self):
+ # if there are no more references to the document, it is safe
+ # to clean the whole thing up, as all nodes have a reference to
+ # the document
+ tree.xmlFreeDoc(self._c_doc)
+
+ @cython.final
+ cdef getroot(self):
+ # return an element proxy for the document root
+ cdef xmlNode* c_node
+ c_node = tree.xmlDocGetRootElement(self._c_doc)
+ if c_node is NULL:
+ return None
+ return _elementFactory(self, c_node)
+
+ @cython.final
+ cdef bint hasdoctype(self):
+ # DOCTYPE gets parsed into internal subset (xmlDTD*)
+ return self._c_doc is not NULL and self._c_doc.intSubset is not NULL
+
+ @cython.final
+ cdef getdoctype(self):
+ # get doctype info: root tag, public/system ID (or None if not known)
+ cdef tree.xmlDtd* c_dtd
+ cdef xmlNode* c_root_node
+ public_id = None
+ sys_url = None
+ c_dtd = self._c_doc.intSubset
+ if c_dtd is not NULL:
+ if c_dtd.ExternalID is not NULL:
+ public_id = funicode(c_dtd.ExternalID)
+ if c_dtd.SystemID is not NULL:
+ sys_url = funicode(c_dtd.SystemID)
+ c_dtd = self._c_doc.extSubset
+ if c_dtd is not NULL:
+ if not public_id and c_dtd.ExternalID is not NULL:
+ public_id = funicode(c_dtd.ExternalID)
+ if not sys_url and c_dtd.SystemID is not NULL:
+ sys_url = funicode(c_dtd.SystemID)
+ c_root_node = tree.xmlDocGetRootElement(self._c_doc)
+ if c_root_node is NULL:
+ root_name = None
+ else:
+ root_name = funicode(c_root_node.name)
+ return root_name, public_id, sys_url
+
+ @cython.final
+ cdef getxmlinfo(self):
+ # return XML version and encoding (or None if not known)
+ cdef xmlDoc* c_doc = self._c_doc
+ if c_doc.version is NULL:
+ version = None
+ else:
+ version = funicode(c_doc.version)
+ if c_doc.encoding is NULL:
+ encoding = None
+ else:
+ encoding = funicode(c_doc.encoding)
+ return version, encoding
+
+ @cython.final
+ cdef isstandalone(self):
+ # returns True for "standalone=true",
+ # False for "standalone=false", None if not provided
+ if self._c_doc.standalone == -1:
+ return None
+ else:
+ return <bint>(self._c_doc.standalone == 1)
+
+ @cython.final
+ cdef bytes buildNewPrefix(self):
+ # get a new unique prefix ("nsX") for this document
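+ # editor's note: the sequence is "ns0", "ns1", ... (the first 30 come
+ # from the module-level _PREFIX_CACHE below); if the counter ever
+ # overflows, a letter tail such as "A" is appended ("ns0A", ...) to
+ # keep the generated prefixes unique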
+ cdef bytes ns
+ if self._ns_counter < len(_PREFIX_CACHE):
+ ns = _PREFIX_CACHE[self._ns_counter]
+ else:
+ ns = python.PyBytes_FromFormat("ns%d", self._ns_counter)
+ if self._prefix_tail is not None:
+ ns += self._prefix_tail
+ self._ns_counter += 1
+ if self._ns_counter < 0:
+ # overflow!
+ self._ns_counter = 0
+ if self._prefix_tail is None:
+ self._prefix_tail = b"A"
+ else:
+ self._prefix_tail += b"A"
+ return ns
+
+ @cython.final
+ cdef xmlNs* _findOrBuildNodeNs(self, xmlNode* c_node,
+ const_xmlChar* c_href, const_xmlChar* c_prefix,
+ bint is_attribute) except NULL:
+ u"""Get or create namespace structure for a node. Reuses the prefix if
+ possible.
+ """
+ cdef xmlNs* c_ns
+ cdef xmlNs* c_doc_ns
+ cdef python.PyObject* dict_result
+ if c_node.type != tree.XML_ELEMENT_NODE:
+ assert c_node.type == tree.XML_ELEMENT_NODE, \
+ u"invalid node type %d, expected %d" % (
+ c_node.type, tree.XML_ELEMENT_NODE)
+ # look for existing ns declaration
+ c_ns = _searchNsByHref(c_node, c_href, is_attribute)
+ if c_ns is not NULL:
+ if is_attribute and c_ns.prefix is NULL:
+ # do not put namespaced attributes into the default
+ # namespace as this would break serialisation
+ pass
+ else:
+ return c_ns
+
+ # none found => determine a suitable new prefix
+ if c_prefix is NULL:
+ dict_result = python.PyDict_GetItem(
+ _DEFAULT_NAMESPACE_PREFIXES, <unsigned char*>c_href)
+ if dict_result is not NULL:
+ prefix = <object>dict_result
+ else:
+ prefix = self.buildNewPrefix()
+ c_prefix = _xcstr(prefix)
+
+ # make sure the prefix is not in use already
+ while tree.xmlSearchNs(self._c_doc, c_node, c_prefix) is not NULL:
+ prefix = self.buildNewPrefix()
+ c_prefix = _xcstr(prefix)
+
+ # declare the namespace and return it
+ c_ns = tree.xmlNewNs(c_node, c_href, c_prefix)
+ if c_ns is NULL:
+ raise MemoryError()
+ return c_ns
+
+ @cython.final
+ cdef int _setNodeNs(self, xmlNode* c_node, const_xmlChar* c_href) except -1:
+ u"Look up the namespace structure and set it for the node."
+ c_ns = self._findOrBuildNodeNs(c_node, c_href, NULL, 0)
+ tree.xmlSetNs(c_node, c_ns)
+
+cdef tuple __initPrefixCache():
+ cdef int i
+ return tuple([ python.PyBytes_FromFormat("ns%d", i)
+ for i in range(30) ])
+
+cdef tuple _PREFIX_CACHE = __initPrefixCache()
+
+cdef _Document _documentFactory(xmlDoc* c_doc, _BaseParser parser):
+ cdef _Document result
+ result = _Document.__new__(_Document)
+ result._c_doc = c_doc
+ result._ns_counter = 0
+ result._prefix_tail = None
+ if parser is None:
+ parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
+ result._parser = parser
+ return result
+
+
+cdef object _find_invalid_public_id_characters = re.compile(
+ ur"[^\x20\x0D\x0Aa-zA-Z0-9'()+,./:=?;!*#@$_%-]+").search
+
+
+cdef class DocInfo:
+ u"Document information provided by parser and DTD."
+ cdef _Document _doc
+ def __cinit__(self, tree):
+ u"Create a DocInfo object for an ElementTree object or root Element."
+ self._doc = _documentOrRaise(tree)
+ root_name, public_id, system_url = self._doc.getdoctype()
+ if not root_name and (public_id or system_url):
+ raise ValueError, u"Could not find root node"
+
+ @property
+ def root_name(self):
+ """Returns the name of the root node as defined by the DOCTYPE."""
+ root_name, public_id, system_url = self._doc.getdoctype()
+ return root_name
+
+ @cython.final
+ cdef tree.xmlDtd* _get_c_dtd(self):
+ """Return the DTD. Create it if it does not yet exist."""
+ cdef xmlDoc* c_doc = self._doc._c_doc
+ cdef xmlNode* c_root_node
+ cdef const_xmlChar* c_name
+
+ if c_doc.intSubset:
+ return c_doc.intSubset
+
+ c_root_node = tree.xmlDocGetRootElement(c_doc)
+ c_name = c_root_node.name if c_root_node else NULL
+ return tree.xmlCreateIntSubset(c_doc, c_name, NULL, NULL)
+
+ def clear(self):
+ u"""Removes DOCTYPE and internal subset from the document."""
+ cdef xmlDoc* c_doc = self._doc._c_doc
+ cdef tree.xmlNode* c_dtd = <xmlNode*>c_doc.intSubset
+ if c_dtd is NULL:
+ return
+ tree.xmlUnlinkNode(c_dtd)
+ tree.xmlFreeNode(c_dtd)
+
+ property public_id:
+ u"""Public ID of the DOCTYPE.
+
+ Mutable. May be set to a valid string or None. If a DTD does not
+ exist, setting this variable (even to None) will create one.
+ """
+ def __get__(self):
+ root_name, public_id, system_url = self._doc.getdoctype()
+ return public_id
+
+ def __set__(self, value):
+ cdef xmlChar* c_value = NULL
+ if value is not None:
+ match = _find_invalid_public_id_characters(value)
+ if match:
+ raise ValueError, f'Invalid character(s) {match.group(0)!r} in public_id.'
+ value = _utf8(value)
+ c_value = tree.xmlStrdup(_xcstr(value))
+ if not c_value:
+ raise MemoryError()
+
+ c_dtd = self._get_c_dtd()
+ if not c_dtd:
+ tree.xmlFree(c_value)
+ raise MemoryError()
+ if c_dtd.ExternalID:
+ tree.xmlFree(<void*>c_dtd.ExternalID)
+ c_dtd.ExternalID = c_value
+
+ property system_url:
+ u"""System ID of the DOCTYPE.
+
+ Mutable. May be set to a valid string or None. If a DTD does not
+ exist, setting this variable (even to None) will create one.
+ """
+ def __get__(self):
+ root_name, public_id, system_url = self._doc.getdoctype()
+ return system_url
+
+ def __set__(self, value):
+ cdef xmlChar* c_value = NULL
+ if value is not None:
+ bvalue = _utf8(value)
+ # sys_url may be any valid unicode string that can be
+ # enclosed in either single or double quotes.
+ if b"'" in bvalue and b'"' in bvalue:
+ raise ValueError(
+ 'System URL may not contain both single (\') and double quotes (").')
+ c_value = tree.xmlStrdup(_xcstr(bvalue))
+ if not c_value:
+ raise MemoryError()
+
+ c_dtd = self._get_c_dtd()
+ if not c_dtd:
+ tree.xmlFree(c_value)
+ raise MemoryError()
+ if c_dtd.SystemID:
+ tree.xmlFree(<void*>c_dtd.SystemID)
+ c_dtd.SystemID = c_value
+
+ @property
+ def xml_version(self):
+ """Returns the XML version as declared by the document."""
+ xml_version, encoding = self._doc.getxmlinfo()
+ return xml_version
+
+ @property
+ def encoding(self):
+ """Returns the encoding name as declared by the document."""
+ xml_version, encoding = self._doc.getxmlinfo()
+ return encoding
+
+ @property
+ def standalone(self):
+ """Returns the standalone flag as declared by the document. The possible
+ values are True (``standalone='yes'``), False
+ (``standalone='no'`` or flag not provided in the declaration),
+ and None (unknown or no declaration found). Note that a
+ normal truth test on this value will always tell you whether the
+ ``standalone`` flag was set to ``'yes'`` or not.
+ """
+ return self._doc.isstandalone()
+
+ property URL:
+ u"The source URL of the document (or None if unknown)."
+ def __get__(self):
+ if self._doc._c_doc.URL is NULL:
+ return None
+ return _decodeFilename(self._doc._c_doc.URL)
+ def __set__(self, url):
+ url = _encodeFilename(url)
+ c_oldurl = self._doc._c_doc.URL
+ if url is None:
+ self._doc._c_doc.URL = NULL
+ else:
+ self._doc._c_doc.URL = tree.xmlStrdup(_xcstr(url))
+ if c_oldurl is not NULL:
+ tree.xmlFree(<void*>c_oldurl)
+
+ @property
+ def doctype(self):
+ """Returns a DOCTYPE declaration string for the document."""
+ root_name, public_id, system_url = self._doc.getdoctype()
+ if system_url:
+ # If the system URL contains a double quote, we must enclose it
+ # in single quotes; otherwise we enclose it in double quotes. A
+ # URL containing both single and double quotes violates the XML
+ # standard.
+ if '"' in system_url:
+ quoted_system_url = f"'{system_url}'"
+ else:
+ quoted_system_url = f'"{system_url}"'
+ if public_id:
+ if system_url:
+ return f'<!DOCTYPE {root_name} PUBLIC "{public_id}" {quoted_system_url}>'
+ else:
+ return f'<!DOCTYPE {root_name} PUBLIC "{public_id}">'
+ elif system_url:
+ return f'<!DOCTYPE {root_name} SYSTEM {quoted_system_url}>'
+ elif self._doc.hasdoctype():
+ return f'<!DOCTYPE {root_name}>'
+ else:
+ return u''
+
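+ # Editor's illustration of the public_id/system_url/doctype properties
+ # above (the identifiers are hypothetical); the result is roughly:
+ #
+ #     tree = etree.fromstring('<root/>').getroottree()
+ #     tree.docinfo.public_id = '-//EXAMPLE//DTD Example 1.0//EN'
+ #     tree.docinfo.system_url = 'http://example.org/example.dtd'
+ #     tree.docinfo.doctype
+ #     # -> '<!DOCTYPE root PUBLIC "-//EXAMPLE//DTD Example 1.0//EN" "http://example.org/example.dtd">'
+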
+ @property
+ def internalDTD(self):
+ """Returns a DTD validator based on the internal subset of the document."""
+ return _dtdFactory(self._doc._c_doc.intSubset)
+
+ @property
+ def externalDTD(self):
+ """Returns a DTD validator based on the external subset of the document."""
+ return _dtdFactory(self._doc._c_doc.extSubset)
+
+
+@cython.no_gc_clear
+cdef public class _Element [ type LxmlElementType, object LxmlElement ]:
+ u"""Element class.
+
+ References a document object and a libxml node.
+
+ By pointing to a Document instance, a reference is kept to
+ _Document as long as there is some pointer to a node in it.
+ """
+ cdef _Document _doc
+ cdef xmlNode* _c_node
+ cdef object _tag
+
+ def _init(self):
+ u"""_init(self)
+
+ Called after object initialisation. Custom subclasses may override
+ this if they recursively call _init() in the superclasses.
+ """
+
+ @cython.linetrace(False)
+ @cython.profile(False)
+ def __dealloc__(self):
+ #print "trying to free node:", <int>self._c_node
+ #displayNode(self._c_node, 0)
+ if self._c_node is not NULL:
+ _unregisterProxy(self)
+ attemptDeallocation(self._c_node)
+
+ # MANIPULATORS
+
+ def __setitem__(self, x, value):
+ u"""__setitem__(self, x, value)
+
+ Replaces the given subelement index or slice.
+ """
+ cdef xmlNode* c_node = NULL
+ cdef xmlNode* c_next
+ cdef xmlDoc* c_source_doc
+ cdef _Element element
+ cdef bint left_to_right
+ cdef Py_ssize_t slicelength = 0, step = 0
+ _assertValidNode(self)
+ if value is None:
+ raise ValueError, u"cannot assign None"
+ if isinstance(x, slice):
+ # slice assignment
+ _findChildSlice(<slice>x, self._c_node, &c_node, &step, &slicelength)
+ if step > 0:
+ left_to_right = 1
+ else:
+ left_to_right = 0
+ step = -step
+ _replaceSlice(self, c_node, slicelength, step, left_to_right, value)
+ return
+ else:
+ # otherwise: normal item assignment
+ element = value
+ _assertValidNode(element)
+ c_node = _findChild(self._c_node, x)
+ if c_node is NULL:
+ raise IndexError, u"list index out of range"
+ c_source_doc = element._c_node.doc
+ c_next = element._c_node.next
+ _removeText(c_node.next)
+ tree.xmlReplaceNode(c_node, element._c_node)
+ _moveTail(c_next, element._c_node)
+ moveNodeToDocument(self._doc, c_source_doc, element._c_node)
+ if not attemptDeallocation(c_node):
+ moveNodeToDocument(self._doc, c_node.doc, c_node)
+
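+ # Editor's sketch of the list-like slice assignment implemented by
+ # __setitem__() above:
+ #
+ #     root = etree.fromstring('<root><a/><b/><c/></root>')
+ #     root[1:] = [etree.Element('x'), etree.Element('y')]
+ #     [child.tag for child in root]     # -> ['a', 'x', 'y']
+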
+ def __delitem__(self, x):
+ u"""__delitem__(self, x)
+
+ Deletes the given subelement or a slice.
+ """
+ cdef xmlNode* c_node = NULL
+ cdef xmlNode* c_next
+ cdef Py_ssize_t step = 0, slicelength = 0
+ _assertValidNode(self)
+ if isinstance(x, slice):
+ # slice deletion
+ if _isFullSlice(<slice>x):
+ c_node = self._c_node.children
+ if c_node is not NULL:
+ if not _isElement(c_node):
+ c_node = _nextElement(c_node)
+ while c_node is not NULL:
+ c_next = _nextElement(c_node)
+ _removeNode(self._doc, c_node)
+ c_node = c_next
+ else:
+ _findChildSlice(<slice>x, self._c_node, &c_node, &step, &slicelength)
+ _deleteSlice(self._doc, c_node, slicelength, step)
+ else:
+ # item deletion
+ c_node = _findChild(self._c_node, x)
+ if c_node is NULL:
+ raise IndexError, f"index out of range: {x}"
+ _removeNode(self._doc, c_node)
+
+ def __deepcopy__(self, memo):
+ u"__deepcopy__(self, memo)"
+ return self.__copy__()
+
+ def __copy__(self):
+ u"__copy__(self)"
+ cdef xmlDoc* c_doc
+ cdef xmlNode* c_node
+ cdef _Document new_doc
+ _assertValidNode(self)
+ c_doc = _copyDocRoot(self._doc._c_doc, self._c_node) # recursive
+ new_doc = _documentFactory(c_doc, self._doc._parser)
+ root = new_doc.getroot()
+ if root is not None:
+ return root
+ # Comment/PI
+ c_node = c_doc.children
+ while c_node is not NULL and c_node.type != self._c_node.type:
+ c_node = c_node.next
+ if c_node is NULL:
+ return None
+ return _elementFactory(new_doc, c_node)
+
+ def set(self, key, value):
+ u"""set(self, key, value)
+
+ Sets an element attribute.
+ """
+ _assertValidNode(self)
+ _setAttributeValue(self, key, value)
+
+ def append(self, _Element element not None):
+ u"""append(self, element)
+
+ Adds a subelement to the end of this element.
+ """
+ _assertValidNode(self)
+ _assertValidNode(element)
+ _appendChild(self, element)
+
+ def addnext(self, _Element element not None):
+ u"""addnext(self, element)
+
+ Adds the element as a following sibling directly after this
+ element.
+
+ This is normally used to set a processing instruction or comment after
+ the root node of a document. Note that tail text is automatically
+ discarded when adding at the root level.
+ """
+ _assertValidNode(self)
+ _assertValidNode(element)
+ if self._c_node.parent != NULL and not _isElement(self._c_node.parent):
+ if element._c_node.type != tree.XML_PI_NODE:
+ if element._c_node.type != tree.XML_COMMENT_NODE:
+ raise TypeError, u"Only processing instructions and comments can be siblings of the root element"
+ element.tail = None
+ _appendSibling(self, element)
+
+ def addprevious(self, _Element element not None):
+ u"""addprevious(self, element)
+
+ Adds the element as a preceding sibling directly before this
+ element.
+
+ This is normally used to set a processing instruction or comment
+ before the root node of a document. Note that tail text is
+ automatically discarded when adding at the root level.
+ """
+ _assertValidNode(self)
+ _assertValidNode(element)
+ if self._c_node.parent != NULL and not _isElement(self._c_node.parent):
+ if element._c_node.type != tree.XML_PI_NODE:
+ if element._c_node.type != tree.XML_COMMENT_NODE:
+ raise TypeError, u"Only processing instructions and comments can be siblings of the root element"
+ element.tail = None
+ _prependSibling(self, element)
+
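+ # Editor's sketch for addprevious()/addnext() above when used at the
+ # root level:
+ #
+ #     root = etree.fromstring('<root/>')
+ #     root.addprevious(etree.Comment('generated'))
+ #     root.addnext(etree.ProcessingInstruction('xml-stylesheet', 'href="style.xsl"'))
+ #     # serialising root.getroottree() now emits the comment before <root/>
+ #     # and the PI after it; tail text is discarded at the root level
+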
+ def extend(self, elements):
+ u"""extend(self, elements)
+
+ Extends the current children by the elements in the iterable.
+ """
+ cdef _Element element
+ _assertValidNode(self)
+ for element in elements:
+ if element is None:
+ raise TypeError, u"Node must not be None"
+ _assertValidNode(element)
+ _appendChild(self, element)
+
+ def clear(self, bint keep_tail=False):
+ u"""clear(self, keep_tail=False)
+
+ Resets an element. This function removes all subelements, clears
+ all attributes and sets the text and tail properties to None.
+
+ Pass ``keep_tail=True`` to leave the tail text untouched.
+ """
+ cdef xmlAttr* c_attr
+ cdef xmlAttr* c_attr_next
+ cdef xmlNode* c_node
+ cdef xmlNode* c_node_next
+ _assertValidNode(self)
+ c_node = self._c_node
+ # remove self.text and self.tail
+ _removeText(c_node.children)
+ if not keep_tail:
+ _removeText(c_node.next)
+ # remove all attributes
+ c_attr = c_node.properties
+ if c_attr:
+ c_node.properties = NULL
+ tree.xmlFreePropList(c_attr)
+ # remove all subelements
+ c_node = c_node.children
+ if c_node and not _isElement(c_node):
+ c_node = _nextElement(c_node)
+ while c_node is not NULL:
+ c_node_next = _nextElement(c_node)
+ _removeNode(self._doc, c_node)
+ c_node = c_node_next
+
+ def insert(self, index: int, _Element element not None):
+ u"""insert(self, index, element)
+
+ Inserts a subelement at the given position in this element
+ """
+ cdef xmlNode* c_node
+ cdef xmlNode* c_next
+ cdef xmlDoc* c_source_doc
+ _assertValidNode(self)
+ _assertValidNode(element)
+ c_node = _findChild(self._c_node, index)
+ if c_node is NULL:
+ _appendChild(self, element)
+ return
+ c_source_doc = element._c_node.doc
+ c_next = element._c_node.next
+ tree.xmlAddPrevSibling(c_node, element._c_node)
+ _moveTail(c_next, element._c_node)
+ moveNodeToDocument(self._doc, c_source_doc, element._c_node)
+
+ def remove(self, _Element element not None):
+ u"""remove(self, element)
+
+ Removes a matching subelement. Unlike the find methods, this
+ method compares elements based on identity, not on tag value
+ or contents.
+ """
+ cdef xmlNode* c_node
+ cdef xmlNode* c_next
+ _assertValidNode(self)
+ _assertValidNode(element)
+ c_node = element._c_node
+ if c_node.parent is not self._c_node:
+ raise ValueError, u"Element is not a child of this node."
+ c_next = element._c_node.next
+ tree.xmlUnlinkNode(c_node)
+ _moveTail(c_next, c_node)
+ # fix namespace declarations
+ moveNodeToDocument(self._doc, c_node.doc, c_node)
+
+ def replace(self, _Element old_element not None,
+ _Element new_element not None):
+ u"""replace(self, old_element, new_element)
+
+ Replaces a subelement with the element passed as second argument.
+ """
+ cdef xmlNode* c_old_node
+ cdef xmlNode* c_old_next
+ cdef xmlNode* c_new_node
+ cdef xmlNode* c_new_next
+ cdef xmlDoc* c_source_doc
+ _assertValidNode(self)
+ _assertValidNode(old_element)
+ _assertValidNode(new_element)
+ c_old_node = old_element._c_node
+ if c_old_node.parent is not self._c_node:
+ raise ValueError, u"Element is not a child of this node."
+ c_old_next = c_old_node.next
+ c_new_node = new_element._c_node
+ c_new_next = c_new_node.next
+ c_source_doc = c_new_node.doc
+ tree.xmlReplaceNode(c_old_node, c_new_node)
+ _moveTail(c_new_next, c_new_node)
+ _moveTail(c_old_next, c_old_node)
+ moveNodeToDocument(self._doc, c_source_doc, c_new_node)
+ # fix namespace declarations
+ moveNodeToDocument(self._doc, c_old_node.doc, c_old_node)
+
+ # PROPERTIES
+ property tag:
+ u"""Element tag
+ """
+ def __get__(self):
+ if self._tag is not None:
+ return self._tag
+ _assertValidNode(self)
+ self._tag = _namespacedName(self._c_node)
+ return self._tag
+
+ def __set__(self, value):
+ cdef _BaseParser parser
+ _assertValidNode(self)
+ ns, name = _getNsTag(value)
+ parser = self._doc._parser
+ if parser is not None and parser._for_html:
+ _htmlTagValidOrRaise(name)
+ else:
+ _tagValidOrRaise(name)
+ self._tag = value
+ tree.xmlNodeSetName(self._c_node, _xcstr(name))
+ if ns is None:
+ self._c_node.ns = NULL
+ else:
+ self._doc._setNodeNs(self._c_node, _xcstr(ns))
+
+ @property
+ def attrib(self):
+ """Element attribute dictionary. Where possible, use get(), set(),
+ keys(), values() and items() to access element attributes.
+ """
+ return _Attrib.__new__(_Attrib, self)
+
+ property text:
+ u"""Text before the first subelement. This is either a string or
+ the value None, if there was no text.
+ """
+ def __get__(self):
+ _assertValidNode(self)
+ return _collectText(self._c_node.children)
+
+ def __set__(self, value):
+ _assertValidNode(self)
+ if isinstance(value, QName):
+ value = _resolveQNameText(self, value).decode('utf8')
+ _setNodeText(self._c_node, value)
+
+ # using 'del el.text' is the wrong thing to do
+ #def __del__(self):
+ # _setNodeText(self._c_node, None)
+
+ property tail:
+ u"""Text after this element's end tag, but before the next sibling
+ element's start tag. This is either a string or the value None, if
+ there was no text.
+ """
+ def __get__(self):
+ _assertValidNode(self)
+ return _collectText(self._c_node.next)
+
+ def __set__(self, value):
+ _assertValidNode(self)
+ _setTailText(self._c_node, value)
+
+ # using 'del el.tail' is the wrong thing to do
+ #def __del__(self):
+ # _setTailText(self._c_node, None)
+
+ # not in ElementTree, read-only
+ @property
+ def prefix(self):
+ """Namespace prefix or None.
+ """
+ if self._c_node.ns is not NULL:
+ if self._c_node.ns.prefix is not NULL:
+ return funicode(self._c_node.ns.prefix)
+ return None
+
+ # not in ElementTree, read-only
+ property sourceline:
+ u"""Original line number as found by the parser or None if unknown.
+ """
+ def __get__(self):
+ cdef long line
+ _assertValidNode(self)
+ line = tree.xmlGetLineNo(self._c_node)
+ return line if line > 0 else None
+
+ def __set__(self, line):
+ _assertValidNode(self)
+ if line <= 0:
+ self._c_node.line = 0
+ else:
+ self._c_node.line = line
+
+ # not in ElementTree, read-only
+ @property
+ def nsmap(self):
+ """Namespace prefix->URI mapping known in the context of this
+ Element. This includes all namespace declarations of the
+ parents.
+
+ Note that changing the returned dict has no effect on the Element.
+ """
+ _assertValidNode(self)
+ return _build_nsmap(self._c_node)
+
+ # not in ElementTree, read-only
+ property base:
+ u"""The base URI of the Element (xml:base or HTML base URL).
+ None if the base URI is unknown.
+
+ Note that the value depends on the URL of the document that
+ holds the Element if there is no xml:base attribute on the
+ Element or its ancestors.
+
+ Setting this property will set an xml:base attribute on the
+ Element, regardless of the document type (XML or HTML).
+ """
+ def __get__(self):
+ _assertValidNode(self)
+ c_base = tree.xmlNodeGetBase(self._doc._c_doc, self._c_node)
+ if c_base is NULL:
+ if self._doc._c_doc.URL is NULL:
+ return None
+ return _decodeFilename(self._doc._c_doc.URL)
+ try:
+ base = _decodeFilename(c_base)
+ finally:
+ tree.xmlFree(c_base)
+ return base
+
+ def __set__(self, url):
+ _assertValidNode(self)
+ if url is None:
+ c_base = <const_xmlChar*>NULL
+ else:
+ url = _encodeFilename(url)
+ c_base = _xcstr(url)
+ tree.xmlNodeSetBase(self._c_node, c_base)
+
+ # ACCESSORS
+ def __repr__(self):
+ u"__repr__(self)"
+ return "<Element %s at 0x%x>" % (strrepr(self.tag), id(self))
+
+ def __getitem__(self, x):
+ u"""Returns the subelement at the given position or the requested
+ slice.
+ """
+ cdef xmlNode* c_node = NULL
+ cdef Py_ssize_t step = 0, slicelength = 0
+ cdef Py_ssize_t c, i
+ cdef _node_to_node_function next_element
+ cdef list result
+ _assertValidNode(self)
+ if isinstance(x, slice):
+ # slicing
+ if _isFullSlice(<slice>x):
+ return _collectChildren(self)
+ _findChildSlice(<slice>x, self._c_node, &c_node, &step, &slicelength)
+ if c_node is NULL:
+ return []
+ if step > 0:
+ next_element = _nextElement
+ else:
+ step = -step
+ next_element = _previousElement
+ result = []
+ c = 0
+ while c_node is not NULL and c < slicelength:
+ result.append(_elementFactory(self._doc, c_node))
+ c += 1
+ for i in range(step):
+ c_node = next_element(c_node)
+ if c_node is NULL:
+ break
+ return result
+ else:
+ # indexing
+ c_node = _findChild(self._c_node, x)
+ if c_node is NULL:
+ raise IndexError, u"list index out of range"
+ return _elementFactory(self._doc, c_node)
+
+ def __len__(self):
+ u"""__len__(self)
+
+ Returns the number of subelements.
+ """
+ _assertValidNode(self)
+ return _countElements(self._c_node.children)
+
+ def __nonzero__(self):
+ #u"__nonzero__(self)" # currently fails in Py3.1
+ import warnings
+ warnings.warn(
+ u"The behavior of this method will change in future versions. "
+ u"Use specific 'len(elem)' or 'elem is not None' test instead.",
+ FutureWarning
+ )
+ # emulate old behaviour
+ _assertValidNode(self)
+ return _hasChild(self._c_node)
+
+ def __contains__(self, element):
+ u"__contains__(self, element)"
+ cdef xmlNode* c_node
+ _assertValidNode(self)
+ if not isinstance(element, _Element):
+ return 0
+ c_node = (<_Element>element)._c_node
+ return c_node is not NULL and c_node.parent is self._c_node
+
+ def __iter__(self):
+ u"__iter__(self)"
+ return ElementChildIterator(self)
+
+ def __reversed__(self):
+ u"__reversed__(self)"
+ return ElementChildIterator(self, reversed=True)
+
+ def index(self, _Element child not None, start: int = None, stop: int = None):
+ u"""index(self, child, start=None, stop=None)
+
+ Find the position of the child within the parent.
+
+ This method is not part of the original ElementTree API.
+ """
+ cdef Py_ssize_t k, l
+ cdef Py_ssize_t c_start, c_stop
+ cdef xmlNode* c_child
+ cdef xmlNode* c_start_node
+ _assertValidNode(self)
+ _assertValidNode(child)
+ c_child = child._c_node
+ if c_child.parent is not self._c_node:
+ raise ValueError, u"Element is not a child of this node."
+
+ # handle the unbounded search straight away (normal case)
+ if stop is None and (start is None or start == 0):
+ k = 0
+ c_child = c_child.prev
+ while c_child is not NULL:
+ if _isElement(c_child):
+ k += 1
+ c_child = c_child.prev
+ return k
+
+ # check indices
+ if start is None:
+ c_start = 0
+ else:
+ c_start = start
+ if stop is None:
+ c_stop = 0
+ else:
+ c_stop = stop
+ if c_stop == 0 or \
+ c_start >= c_stop and (c_stop > 0 or c_start < 0):
+ raise ValueError, u"list.index(x): x not in slice"
+
+ # for negative slice indices, check slice before searching index
+ if c_start < 0 or c_stop < 0:
+ # start from right, at most up to leftmost(c_start, c_stop)
+ if c_start < c_stop:
+ k = -c_start
+ else:
+ k = -c_stop
+ c_start_node = self._c_node.last
+ l = 1
+ while c_start_node != c_child and l < k:
+ if _isElement(c_start_node):
+ l += 1
+ c_start_node = c_start_node.prev
+ if c_start_node == c_child:
+ # found! before slice end?
+ if c_stop < 0 and l <= -c_stop:
+ raise ValueError, u"list.index(x): x not in slice"
+ elif c_start < 0:
+ raise ValueError, u"list.index(x): x not in slice"
+
+ # now determine the index backwards from child
+ c_child = c_child.prev
+ k = 0
+ if c_stop > 0:
+ # we can optimize: stop after c_stop elements if not found
+ while c_child != NULL and k < c_stop:
+ if _isElement(c_child):
+ k += 1
+ c_child = c_child.prev
+ if k < c_stop:
+ return k
+ else:
+ # traverse all
+ while c_child != NULL:
+ if _isElement(c_child):
+ k = k + 1
+ c_child = c_child.prev
+ if c_start > 0:
+ if k >= c_start:
+ return k
+ else:
+ return k
+ if c_start != 0 or c_stop != 0:
+ raise ValueError, u"list.index(x): x not in slice"
+ else:
+ raise ValueError, u"list.index(x): x not in list"
+
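+ # Editor's sketch for index() above:
+ #
+ #     root = etree.fromstring('<root><a/><b/><c/></root>')
+ #     root.index(root[2])          # -> 2
+ #     root.index(root[2], 0, 2)    # raises ValueError: not within the slice
+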
+ def get(self, key, default=None):
+ u"""get(self, key, default=None)
+
+ Gets an element attribute.
+ """
+ _assertValidNode(self)
+ return _getAttributeValue(self, key, default)
+
+ def keys(self):
+ u"""keys(self)
+
+ Gets a list of attribute names. The names are returned in an
+ arbitrary order (just like for an ordinary Python dictionary).
+ """
+ _assertValidNode(self)
+ return _collectAttributes(self._c_node, 1)
+
+ def values(self):
+ u"""values(self)
+
+ Gets element attribute values as a sequence of strings. The
+ attributes are returned in an arbitrary order.
+ """
+ _assertValidNode(self)
+ return _collectAttributes(self._c_node, 2)
+
+ def items(self):
+ u"""items(self)
+
+ Gets element attributes, as a sequence. The attributes are returned in
+ an arbitrary order.
+ """
+ _assertValidNode(self)
+ return _collectAttributes(self._c_node, 3)
+
+ def getchildren(self):
+ u"""getchildren(self)
+
+ Returns all direct children. The elements are returned in document
+ order.
+
+ :deprecated: Note that this method has been deprecated as of
+ ElementTree 1.3 and lxml 2.0. New code should use
+ ``list(element)`` or simply iterate over elements.
+ """
+ _assertValidNode(self)
+ return _collectChildren(self)
+
+ def getparent(self):
+ u"""getparent(self)
+
+ Returns the parent of this element or None for the root element.
+ """
+ cdef xmlNode* c_node
+ #_assertValidNode(self) # not needed
+ c_node = _parentElement(self._c_node)
+ if c_node is NULL:
+ return None
+ return _elementFactory(self._doc, c_node)
+
+ def getnext(self):
+ u"""getnext(self)
+
+ Returns the following sibling of this element or None.
+ """
+ cdef xmlNode* c_node
+ #_assertValidNode(self) # not needed
+ c_node = _nextElement(self._c_node)
+ if c_node is NULL:
+ return None
+ return _elementFactory(self._doc, c_node)
+
+ def getprevious(self):
+ u"""getprevious(self)
+
+ Returns the preceding sibling of this element or None.
+ """
+ cdef xmlNode* c_node
+ #_assertValidNode(self) # not needed
+ c_node = _previousElement(self._c_node)
+ if c_node is NULL:
+ return None
+ return _elementFactory(self._doc, c_node)
+
+ def itersiblings(self, tag=None, *tags, preceding=False):
+ u"""itersiblings(self, tag=None, *tags, preceding=False)
+
+ Iterate over the following or preceding siblings of this element.
+
+ The direction is determined by the 'preceding' keyword which
+ defaults to False, i.e. forward iteration over the following
+ siblings. When True, the iterator yields the preceding
+ siblings in reverse document order, i.e. starting right before
+ the current element and going backwards.
+
+ Can be restricted to find only elements with specific tags,
+ see `iter`.
+ """
+ if preceding:
+ if self._c_node and not self._c_node.prev:
+ return ITER_EMPTY
+ elif self._c_node and not self._c_node.next:
+ return ITER_EMPTY
+ if tag is not None:
+ tags += (tag,)
+ return SiblingsIterator(self, tags, preceding=preceding)
+
+ def iterancestors(self, tag=None, *tags):
+ u"""iterancestors(self, tag=None, *tags)
+
+ Iterate over the ancestors of this element (from parent to parent).
+
+ Can be restricted to find only elements with specific tags,
+ see `iter`.
+ """
+ if self._c_node and not self._c_node.parent:
+ return ITER_EMPTY
+ if tag is not None:
+ tags += (tag,)
+ return AncestorsIterator(self, tags)
+
+ def iterdescendants(self, tag=None, *tags):
+ u"""iterdescendants(self, tag=None, *tags)
+
+ Iterate over the descendants of this element in document order.
+
+ As opposed to ``el.iter()``, this iterator does not yield the element
+ itself. The returned elements can be restricted to find only elements
+ with specific tags, see `iter`.
+ """
+ if self._c_node and not self._c_node.children:
+ return ITER_EMPTY
+ if tag is not None:
+ tags += (tag,)
+ return ElementDepthFirstIterator(self, tags, inclusive=False)
+
+ def iterchildren(self, tag=None, *tags, reversed=False):
+ u"""iterchildren(self, tag=None, *tags, reversed=False)
+
+ Iterate over the children of this element.
+
+ As opposed to using normal iteration on this element, the returned
+ elements can be reversed with the 'reversed' keyword and restricted
+ to find only elements with specific tags, see `iter`.
+ """
+ if self._c_node and not self._c_node.children:
+ return ITER_EMPTY
+ if tag is not None:
+ tags += (tag,)
+ return ElementChildIterator(self, tags, reversed=reversed)
+
+ def getroottree(self):
+ u"""getroottree(self)
+
+ Return an ElementTree for the root node of the document that
+ contains this element.
+
+ This is the same as following element.getparent() up the tree until it
+ returns None (for the root element) and then building an ElementTree
+ for the last parent that was returned."""
+ _assertValidDoc(self._doc)
+ return _elementTreeFactory(self._doc, None)
+
+ def getiterator(self, tag=None, *tags):
+ u"""getiterator(self, tag=None, *tags)
+
+ Returns a sequence or iterator of all elements in the subtree in
+ document order (depth first pre-order), starting with this
+ element.
+
+ Can be restricted to find only elements with specific tags,
+ see `iter`.
+
+ :deprecated: Note that this method is deprecated as of
+ ElementTree 1.3 and lxml 2.0. It returns an iterator in
+ lxml, which diverges from the original ElementTree
+ behaviour. If you want an efficient iterator, use the
+ ``element.iter()`` method instead. You should only use this
+ method in new code if you require backwards compatibility
+ with older versions of lxml or ElementTree.
+ """
+ if tag is not None:
+ tags += (tag,)
+ return ElementDepthFirstIterator(self, tags)
+
+ def iter(self, tag=None, *tags):
+ u"""iter(self, tag=None, *tags)
+
+ Iterate over all elements in the subtree in document order (depth
+ first pre-order), starting with this element.
+
+ Can be restricted to find only elements with specific tags:
+ pass ``"{ns}localname"`` as tag. Either or both of ``ns`` and
+ ``localname`` can be ``*`` for a wildcard; ``ns`` can be empty
+ for no namespace. ``"localname"`` is equivalent to ``"{}localname"``
+ (i.e. no namespace) but ``"*"`` is ``"{*}*"`` (any or no namespace),
+ not ``"{}*"``.
+
+ You can also pass the Element, Comment, ProcessingInstruction and
+ Entity factory functions to look only for the specific element type.
+
+ Passing multiple tags (or a sequence of tags) instead of a single tag
+ will let the iterator return all elements matching any of these tags,
+ in document order.
+ """
+ if tag is not None:
+ tags += (tag,)
+ return ElementDepthFirstIterator(self, tags)
+
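+ # Editor's sketch of the tag selection rules described in iter() above:
+ #
+ #     root = etree.fromstring('<root xmlns:x="urn:x"><a/><x:a/><b/></root>')
+ #     [el.tag for el in root.iter('a')]        # -> ['a'] (no namespace only)
+ #     [el.tag for el in root.iter('{*}a')]     # -> ['a', '{urn:x}a']
+ #     [el.tag for el in root.iter('a', 'b')]   # -> ['a', 'b']
+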
+ def itertext(self, tag=None, *tags, with_tail=True):
+ u"""itertext(self, tag=None, *tags, with_tail=True)
+
+ Iterates over the text content of a subtree.
+
+ You can pass tag names to restrict text content to specific elements,
+ see `iter`.
+
+ You can set the ``with_tail`` keyword argument to ``False`` to skip
+ over tail text.
+ """
+ if tag is not None:
+ tags += (tag,)
+ return ElementTextIterator(self, tags, with_tail=with_tail)
+
+ def makeelement(self, _tag, attrib=None, nsmap=None, **_extra):
+ u"""makeelement(self, _tag, attrib=None, nsmap=None, **_extra)
+
+ Creates a new element associated with the same document.
+ """
+ _assertValidDoc(self._doc)
+ return _makeElement(_tag, NULL, self._doc, None, None, None,
+ attrib, nsmap, _extra)
+
+ def find(self, path, namespaces=None):
+ u"""find(self, path, namespaces=None)
+
+ Finds the first matching subelement, by tag name or path.
+
+ The optional ``namespaces`` argument accepts a
+ prefix-to-namespace mapping that allows the usage of XPath
+ prefixes in the path expression.
+ """
+ if isinstance(path, QName):
+ path = (<QName>path).text
+ return _elementpath.find(self, path, namespaces)
+
+ def findtext(self, path, default=None, namespaces=None):
+ u"""findtext(self, path, default=None, namespaces=None)
+
+ Finds text for the first matching subelement, by tag name or path.
+
+ The optional ``namespaces`` argument accepts a
+ prefix-to-namespace mapping that allows the usage of XPath
+ prefixes in the path expression.
+ """
+ if isinstance(path, QName):
+ path = (<QName>path).text
+ return _elementpath.findtext(self, path, default, namespaces)
+
+ def findall(self, path, namespaces=None):
+ u"""findall(self, path, namespaces=None)
+
+ Finds all matching subelements, by tag name or path.
+
+ The optional ``namespaces`` argument accepts a
+ prefix-to-namespace mapping that allows the usage of XPath
+ prefixes in the path expression.
+ """
+ if isinstance(path, QName):
+ path = (<QName>path).text
+ return _elementpath.findall(self, path, namespaces)
+
+ def iterfind(self, path, namespaces=None):
+ u"""iterfind(self, path, namespaces=None)
+
+ Iterates over all matching subelements, by tag name or path.
+
+ The optional ``namespaces`` argument accepts a
+ prefix-to-namespace mapping that allows the usage of XPath
+ prefixes in the path expression.
+ """
+ if isinstance(path, QName):
+ path = (<QName>path).text
+ return _elementpath.iterfind(self, path, namespaces)
+
+ def xpath(self, _path, *, namespaces=None, extensions=None,
+ smart_strings=True, **_variables):
+ u"""xpath(self, _path, namespaces=None, extensions=None, smart_strings=True, **_variables)
+
+ Evaluate an xpath expression using the element as context node.
+ """
+ evaluator = XPathElementEvaluator(self, namespaces=namespaces,
+ extensions=extensions,
+ smart_strings=smart_strings)
+ return evaluator(_path, **_variables)
+
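+ # Editor's sketch for xpath() above (namespaces and variables are optional):
+ #
+ #     root = etree.fromstring('<root><a>1</a><a>2</a></root>')
+ #     root.xpath('count(a)')                    # -> 2.0
+ #     root.xpath('//a[text()=$val]', val='2')   # -> [<Element a at 0x...>]
+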
+ def cssselect(self, expr, *, translator='xml'):
+ """
+ Run the CSS expression on this element and its children,
+ returning a list of the results.
+
+ Equivalent to lxml.cssselect.CSSSelector(expr)(self) -- note
+ that pre-compiling the expression can provide a substantial
+ speedup.
+ """
+ # Do the import here to make the dependency optional.
+ from lxml.cssselect import CSSSelector
+ return CSSSelector(expr, translator=translator)(self)
+
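+# Editor's sketch for cssselect() above; requires the external 'cssselect'
+# package:
+#
+#     from lxml.cssselect import CSSSelector
+#     root = etree.fromstring('<div><p class="note">hi</p><p>bye</p></div>')
+#     [el.text for el in root.cssselect('p.note')]   # -> ['hi']
+#     sel = CSSSelector('p.note')                    # pre-compile for reuse
+#     [el.text for el in sel(root)]                  # -> ['hi']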
+
+cdef extern from "includes/etree_defs.h":
+ # macro call to 't->tp_new()' for fast instantiation
+ cdef object NEW_ELEMENT "PY_NEW" (object t)
+
+
+@cython.linetrace(False)
+cdef _Element _elementFactory(_Document doc, xmlNode* c_node):
+ cdef _Element result
+ result = getProxy(c_node)
+ if result is not None:
+ return result
+ if c_node is NULL:
+ return None
+
+ element_class = LOOKUP_ELEMENT_CLASS(
+ ELEMENT_CLASS_LOOKUP_STATE, doc, c_node)
+ if hasProxy(c_node):
+ # prevent re-entry race condition - we just called into Python
+ return getProxy(c_node)
+ result = NEW_ELEMENT(element_class)
+ if hasProxy(c_node):
+ # prevent re-entry race condition - we just called into Python
+ result._c_node = NULL
+ return getProxy(c_node)
+
+ _registerProxy(result, doc, c_node)
+ if element_class is not _Element:
+ result._init()
+ return result
+
+
+@cython.internal
+cdef class __ContentOnlyElement(_Element):
+ cdef int _raiseImmutable(self) except -1:
+ raise TypeError, u"this element does not have children or attributes"
+
+ def set(self, key, value):
+ u"set(self, key, value)"
+ self._raiseImmutable()
+
+ def append(self, value):
+ u"append(self, value)"
+ self._raiseImmutable()
+
+ def insert(self, index, value):
+ u"insert(self, index, value)"
+ self._raiseImmutable()
+
+ def __setitem__(self, index, value):
+ u"__setitem__(self, index, value)"
+ self._raiseImmutable()
+
+ @property
+ def attrib(self):
+ return IMMUTABLE_EMPTY_MAPPING
+
+ property text:
+ def __get__(self):
+ _assertValidNode(self)
+ return funicodeOrEmpty(self._c_node.content)
+
+ def __set__(self, value):
+ cdef tree.xmlDict* c_dict
+ _assertValidNode(self)
+ if value is None:
+ c_text = <const_xmlChar*>NULL
+ else:
+ value = _utf8(value)
+ c_text = _xcstr(value)
+ tree.xmlNodeSetContent(self._c_node, c_text)
+
+ # ACCESSORS
+ def __getitem__(self, x):
+ u"__getitem__(self, x)"
+ if isinstance(x, slice):
+ return []
+ else:
+ raise IndexError, u"list index out of range"
+
+ def __len__(self):
+ u"__len__(self)"
+ return 0
+
+ def get(self, key, default=None):
+ u"get(self, key, default=None)"
+ return None
+
+ def keys(self):
+ u"keys(self)"
+ return []
+
+ def items(self):
+ u"items(self)"
+ return []
+
+ def values(self):
+ u"values(self)"
+ return []
+
+cdef class _Comment(__ContentOnlyElement):
+ @property
+ def tag(self):
+ return Comment
+
+ def __repr__(self):
+ return "<!--%s-->" % strrepr(self.text)
+
+cdef class _ProcessingInstruction(__ContentOnlyElement):
+ @property
+ def tag(self):
+ return ProcessingInstruction
+
+ property target:
+ # not in ElementTree
+ def __get__(self):
+ _assertValidNode(self)
+ return funicode(self._c_node.name)
+
+ def __set__(self, value):
+ _assertValidNode(self)
+ value = _utf8(value)
+ c_text = _xcstr(value)
+ tree.xmlNodeSetName(self._c_node, c_text)
+
+ def __repr__(self):
+ text = self.text
+ if text:
+ return "<?%s %s?>" % (strrepr(self.target),
+ strrepr(text))
+ else:
+ return "<?%s?>" % strrepr(self.target)
+
+ def get(self, key, default=None):
+ u"""get(self, key, default=None)
+
+ Try to parse pseudo-attributes from the text content of the
+ processing instruction, search for one with the given key as
+ name and return its associated value.
+
+ Note that this is only a convenience method for the most
+ common case that all text content is structured in
+ attribute-like name-value pairs with properly quoted values.
+ It is not guaranteed to work for all possible text content.
+ """
+ return self.attrib.get(key, default)
+
+ @property
+ def attrib(self):
+ """Returns a dict containing all pseudo-attributes that can be
+ parsed from the text content of this processing instruction.
+ Note that modifying the dict currently has no effect on the
+ XML node, although this is not guaranteed to stay this way.
+ """
+ return { attr : (value1 or value2)
+ for attr, value1, value2 in _FIND_PI_ATTRIBUTES(u' ' + self.text) }
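+
+ # Illustrative usage sketch (editor note, not part of the module code):
+ #
+ #     from lxml import etree
+ #     pi = etree.ProcessingInstruction(
+ #         'xml-stylesheet', 'href="style.xsl" type="text/xsl"')
+ #     pi.get('href')        # 'style.xsl'
+ #     pi.attrib             # {'href': 'style.xsl', 'type': 'text/xsl'}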
+
+cdef object _FIND_PI_ATTRIBUTES = re.compile(ur'\s+(\w+)\s*=\s*(?:\'([^\']*)\'|"([^"]*)")', re.U).findall
+
+cdef class _Entity(__ContentOnlyElement):
+ @property
+ def tag(self):
+ return Entity
+
+ property name:
+ # not in ElementTree
+ def __get__(self):
+ _assertValidNode(self)
+ return funicode(self._c_node.name)
+
+ def __set__(self, value):
+ _assertValidNode(self)
+ value_utf = _utf8(value)
+ if b'&' in value_utf or b';' in value_utf:
+ raise ValueError, f"Invalid entity name '{value}'"
+ tree.xmlNodeSetName(self._c_node, _xcstr(value_utf))
+
+ @property
+ def text(self):
+ # FIXME: should this be None or '&[VALUE];' or the resolved
+ # entity value ?
+ _assertValidNode(self)
+ return f'&{funicode(self._c_node.name)};'
+
+ def __repr__(self):
+ return "&%s;" % strrepr(self.name)
+
+
+cdef class QName:
+ u"""QName(text_or_uri_or_element, tag=None)
+
+ QName wrapper for qualified XML names.
+
+ Pass a tag name by itself or a namespace URI and a tag name to
+ create a qualified name. Alternatively, pass an Element to
+ extract its tag name. ``None`` as first argument is ignored in
+ order to allow for generic 2-argument usage.
+
+ The ``text`` property holds the qualified name in
+ ``{namespace}tagname`` notation. The ``namespace`` and
+ ``localname`` properties hold the respective parts of the tag
+ name.
+
+ You can pass QName objects wherever a tag name is expected. Also,
+ setting Element text from a QName will resolve the namespace prefix
+ on assignment and set a qualified text value. This is helpful in XML
+ languages like SOAP or XML-Schema that use prefixed tag names in
+ their text content.
+ """
+ cdef readonly unicode text
+ cdef readonly unicode localname
+ cdef readonly unicode namespace
+ def __init__(self, text_or_uri_or_element, tag=None):
+ if text_or_uri_or_element is None:
+ # Allow None as no namespace.
+ text_or_uri_or_element, tag = tag, None
+ if not _isString(text_or_uri_or_element):
+ if isinstance(text_or_uri_or_element, _Element):
+ text_or_uri_or_element = (<_Element>text_or_uri_or_element).tag
+ if not _isString(text_or_uri_or_element):
+ raise ValueError, f"Invalid input tag of type {type(text_or_uri_or_element)!r}"
+ elif isinstance(text_or_uri_or_element, QName):
+ text_or_uri_or_element = (<QName>text_or_uri_or_element).text
+ elif text_or_uri_or_element is not None:
+ text_or_uri_or_element = unicode(text_or_uri_or_element)
+ else:
+ raise ValueError, f"Invalid input tag of type {type(text_or_uri_or_element)!r}"
+
+ ns_utf, tag_utf = _getNsTag(text_or_uri_or_element)
+ if tag is not None:
+ # either ('ns', 'tag') or ('{ns}oldtag', 'newtag')
+ if ns_utf is None:
+ ns_utf = tag_utf # case 1: namespace ended up as tag name
+ tag_utf = _utf8(tag)
+ _tagValidOrRaise(tag_utf)
+ self.localname = (<bytes>tag_utf).decode('utf8')
+ if ns_utf is None:
+ self.namespace = None
+ self.text = self.localname
+ else:
+ self.namespace = (<bytes>ns_utf).decode('utf8')
+ self.text = u"{%s}%s" % (self.namespace, self.localname)
+ def __str__(self):
+ return self.text
+ def __hash__(self):
+ return hash(self.text)
+ def __richcmp__(self, other, int op):
+ try:
+ if type(other) is QName:
+ other = (<QName>other).text
+ elif not isinstance(other, unicode):
+ other = unicode(other)
+ except (ValueError, UnicodeDecodeError):
+ return NotImplemented
+ return python.PyObject_RichCompare(self.text, other, op)
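+
+ # Illustrative usage sketch (editor note; the namespace URI is a made-up example):
+ #
+ #     from lxml import etree
+ #     q = etree.QName('http://example.org/ns', 'item')
+ #     q.text                # '{http://example.org/ns}item'
+ #     q.localname           # 'item'
+ #     q.namespace           # 'http://example.org/ns'
+ #     etree.Element(q).tag  # '{http://example.org/ns}item'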
+
+
+cdef public class _ElementTree [ type LxmlElementTreeType,
+ object LxmlElementTree ]:
+ cdef _Document _doc
+ cdef _Element _context_node
+
+ # Note that _doc is only used to store the original document if we do not
+ # have a _context_node. All methods should prefer self._context_node._doc
+ # to honour tree restructuring. _doc can happily be None!
+
+ @cython.final
+ cdef int _assertHasRoot(self) except -1:
+ u"""We have to take care here: the document may not have a root node!
+ This can happen if ElementTree() is called without any argument and
+ the caller 'forgets' to call parse() afterwards, so this is a bug in
+ the caller program.
+ """
+ assert self._context_node is not None, \
+ u"ElementTree not initialized, missing root"
+ return 0
+
+ def parse(self, source, _BaseParser parser=None, *, base_url=None):
+ u"""parse(self, source, parser=None, base_url=None)
+
+ Updates self with the content of source and returns its root.
+ """
+ cdef _Document doc = None
+ try:
+ doc = _parseDocument(source, parser, base_url)
+ except _TargetParserResult as result_container:
+ # raises a TypeError if we don't get an _Element
+ self._context_node = result_container.result
+ else:
+ self._context_node = doc.getroot()
+ self._doc = None if self._context_node is not None else doc
+ return self._context_node
+
+ def _setroot(self, _Element root not None):
+ u"""_setroot(self, root)
+
+ Relocate the ElementTree to a new root node.
+ """
+ _assertValidNode(root)
+ if root._c_node.type != tree.XML_ELEMENT_NODE:
+ raise TypeError, u"Only elements can be the root of an ElementTree"
+ self._context_node = root
+ self._doc = None
+
+ def getroot(self):
+ u"""getroot(self)
+
+ Gets the root element for this tree.
+ """
+ return self._context_node
+
+ def __copy__(self):
+ return _elementTreeFactory(self._doc, self._context_node)
+
+ def __deepcopy__(self, memo):
+ cdef _Element root
+ cdef _Document doc
+ cdef xmlDoc* c_doc
+ if self._context_node is not None:
+ root = self._context_node.__copy__()
+ assert root is not None
+ _assertValidNode(root)
+ _copyNonElementSiblings(self._context_node._c_node, root._c_node)
+ return _elementTreeFactory(None, root)
+ elif self._doc is not None:
+ _assertValidDoc(self._doc)
+ c_doc = tree.xmlCopyDoc(self._doc._c_doc, 1)
+ if c_doc is NULL:
+ raise MemoryError()
+ doc = _documentFactory(c_doc, self._doc._parser)
+ return _elementTreeFactory(doc, None)
+ else:
+ # so what ...
+ return self
+
+ # not in ElementTree
+ @property
+ def docinfo(self) -> DocInfo:
+ """Information about the document provided by parser and DTD."""
+ self._assertHasRoot()
+ return DocInfo(self._context_node._doc)
+
+ # not in ElementTree, read-only
+ @property
+ def parser(self):
+ """The parser that was used to parse the document in this ElementTree.
+ """
+ if self._context_node is not None and \
+ self._context_node._doc is not None:
+ return self._context_node._doc._parser
+ if self._doc is not None:
+ return self._doc._parser
+ return None
+
+ def write(self, file, *, encoding=None, method="xml",
+ bint pretty_print=False, xml_declaration=None, bint with_tail=True,
+ standalone=None, doctype=None, compression=0,
+ bint exclusive=False, inclusive_ns_prefixes=None,
+ bint with_comments=True, bint strip_text=False,
+ docstring=None):
+ u"""write(self, file, encoding=None, method="xml",
+ pretty_print=False, xml_declaration=None, with_tail=True,
+ standalone=None, doctype=None, compression=0,
+ exclusive=False, inclusive_ns_prefixes=None,
+ with_comments=True, strip_text=False)
+
+ Write the tree to a filename, file or file-like object.
+
+ Defaults to ASCII encoding and writing a declaration as needed.
+
+ The keyword argument 'method' selects the output method:
+ 'xml', 'html', 'text' or 'c14n'. Default is 'xml'.
+
+ With ``method="c14n"`` (C14N version 1), the options ``exclusive``,
+ ``with_comments`` and ``inclusive_ns_prefixes`` request exclusive
+ C14N, include comments, and list the inclusive prefixes respectively.
+
+ With ``method="c14n2"`` (C14N version 2), the ``with_comments`` and
+ ``strip_text`` options control the output of comments and text space
+ according to C14N 2.0.
+
+ Passing a boolean value to the ``standalone`` option will
+ output an XML declaration with the corresponding
+ ``standalone`` flag.
+
+ The ``doctype`` option allows passing in a plain string that will
+ be serialised before the XML tree. Note that passing in
+ non-well-formed content here will make the XML output non-well-formed.
+ Also, an existing doctype in the document tree will not be removed
+ when serialising an ElementTree instance.
+
+ The ``compression`` option enables GZip compression level 1-9.
+
+ The ``inclusive_ns_prefixes`` should be a list of namespace prefix
+ strings (e.g. ``['xs', 'xsi']``) that will be promoted to the top-level
+ element during exclusive C14N serialisation. This parameter is ignored
+ if ``exclusive=False``.
+
+ If exclusive=True and no list is provided, a namespace will only be
+ rendered if it is used by the immediate parent or one of its attributes
+ and its prefix and values have not already been rendered by an ancestor
+ of the namespace node's parent element.
+ """
+ cdef bint write_declaration
+ cdef int is_standalone
+
+ self._assertHasRoot()
+ _assertValidNode(self._context_node)
+ if compression is None or compression < 0:
+ compression = 0
+
+ # C14N serialisation
+ if method in ('c14n', 'c14n2'):
+ if encoding is not None:
+ raise ValueError("Cannot specify encoding with C14N")
+ if xml_declaration:
+ raise ValueError("Cannot enable XML declaration in C14N")
+
+ if method == 'c14n':
+ _tofilelikeC14N(file, self._context_node, exclusive, with_comments,
+ compression, inclusive_ns_prefixes)
+ else: # c14n2
+ with _open_utf8_file(file, compression=compression) as f:
+ target = C14NWriterTarget(
+ f.write, with_comments=with_comments, strip_text=strip_text)
+ _tree_to_target(self, target)
+ return
+
+ if not with_comments:
+ raise ValueError("Can only discard comments in C14N serialisation")
+ # suppress decl. in default case (purely for ElementTree compatibility)
+ if xml_declaration is not None:
+ write_declaration = xml_declaration
+ if encoding is None:
+ encoding = 'ASCII'
+ else:
+ encoding = encoding.upper()
+ elif encoding is None:
+ encoding = 'ASCII'
+ write_declaration = 0
+ else:
+ encoding = encoding.upper()
+ write_declaration = encoding not in (
+ 'US-ASCII', 'ASCII', 'UTF8', 'UTF-8')
+ if standalone is None:
+ is_standalone = -1
+ elif standalone:
+ write_declaration = 1
+ is_standalone = 1
+ else:
+ write_declaration = 1
+ is_standalone = 0
+
+ if docstring is not None and doctype is None:
+ import warnings
+ warnings.warn(
+ "The 'docstring' option is deprecated. Use 'doctype' instead.",
+ DeprecationWarning)
+ doctype = docstring
+
+ _tofilelike(file, self._context_node, encoding, doctype, method,
+ write_declaration, 1, pretty_print, with_tail,
+ is_standalone, compression)
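+
+ # Illustrative usage sketch (editor note; 'in.xml'/'out.xml' are placeholder paths):
+ #
+ #     from lxml import etree
+ #     tree = etree.parse('in.xml')
+ #     tree.write('out.xml', encoding='UTF-8',
+ #                xml_declaration=True, pretty_print=True)
+ #     tree.write('out.xml.gz', compression=9)       # gzip-compressed output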
+
+ def getpath(self, _Element element not None):
+ u"""getpath(self, element)
+
+ Returns a structural, absolute XPath expression to find the element.
+
+ For namespaced elements, the expression uses prefixes from the
+ document, which therefore need to be provided in order to make any
+ use of the expression in XPath.
+
+ Also see the method getelementpath(self, element), which returns a
+ self-contained ElementPath expression.
+ """
+ cdef _Document doc
+ cdef _Element root
+ cdef xmlDoc* c_doc
+ _assertValidNode(element)
+ if self._context_node is not None:
+ root = self._context_node
+ doc = root._doc
+ elif self._doc is not None:
+ doc = self._doc
+ root = doc.getroot()
+ else:
+ raise ValueError, u"Element is not in this tree."
+ _assertValidDoc(doc)
+ _assertValidNode(root)
+ if element._doc is not doc:
+ raise ValueError, u"Element is not in this tree."
+
+ c_doc = _fakeRootDoc(doc._c_doc, root._c_node)
+ c_path = tree.xmlGetNodePath(element._c_node)
+ _destroyFakeDoc(doc._c_doc, c_doc)
+ if c_path is NULL:
+ raise MemoryError()
+ path = funicode(c_path)
+ tree.xmlFree(c_path)
+ return path
+
+ def getelementpath(self, _Element element not None):
+ u"""getelementpath(self, element)
+
+ Returns a structural, absolute ElementPath expression to find the
+ element. This path can be used in the .find() method to look up
+ the element, provided that the elements along the path and their
+ list of immediate children were not modified in between.
+
+ ElementPath has the advantage over an XPath expression (as returned
+ by the .getpath() method) that it does not require additional prefix
+ declarations. It is always self-contained.
+ """
+ cdef _Element root
+ cdef Py_ssize_t count
+ _assertValidNode(element)
+ if element._c_node.type != tree.XML_ELEMENT_NODE:
+ raise ValueError, u"input is not an Element"
+ if self._context_node is not None:
+ root = self._context_node
+ elif self._doc is not None:
+ root = self._doc.getroot()
+ else:
+ raise ValueError, u"Element is not in this tree"
+ _assertValidNode(root)
+ if element._doc is not root._doc:
+ raise ValueError, u"Element is not in this tree"
+
+ path = []
+ c_element = element._c_node
+ while c_element is not root._c_node:
+ c_name = c_element.name
+ c_href = _getNs(c_element)
+ tag = _namespacedNameFromNsName(c_href, c_name)
+ if c_href is NULL:
+ c_href = <const_xmlChar*>b'' # no namespace (NULL is wildcard)
+ # use tag[N] if there are preceding siblings with the same tag
+ count = 0
+ c_node = c_element.prev
+ while c_node is not NULL:
+ if c_node.type == tree.XML_ELEMENT_NODE:
+ if _tagMatches(c_node, c_href, c_name):
+ count += 1
+ c_node = c_node.prev
+ if count:
+ tag = f'{tag}[{count+1}]'
+ else:
+ # use tag[1] if there are following siblings with the same tag
+ c_node = c_element.next
+ while c_node is not NULL:
+ if c_node.type == tree.XML_ELEMENT_NODE:
+ if _tagMatches(c_node, c_href, c_name):
+ tag += '[1]'
+ break
+ c_node = c_node.next
+
+ path.append(tag)
+ c_element = c_element.parent
+ if c_element is NULL or c_element.type != tree.XML_ELEMENT_NODE:
+ raise ValueError, u"Element is not in this tree."
+ if not path:
+ return '.'
+ path.reverse()
+ return '/'.join(path)
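+
+ # Illustrative usage sketch (editor note, not part of the module code):
+ #
+ #     from lxml import etree
+ #     root = etree.fromstring('<root><a/><a><b/></a></root>')
+ #     tree = etree.ElementTree(root)
+ #     b = root[1][0]
+ #     tree.getpath(b)                               # '/root/a[2]/b'
+ #     tree.getelementpath(b)                        # 'a[2]/b'
+ #     root.find(tree.getelementpath(b)) is b        # True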
+
+ def getiterator(self, tag=None, *tags):
+ u"""getiterator(self, tag=None, *tags)
+
+ Returns a sequence or iterator of all elements in document order
+ (depth first pre-order), starting with the root element.
+
+ Can be restricted to find only elements with specific tags,
+ see `_Element.iter`.
+
+ :deprecated: Note that this method is deprecated as of
+ ElementTree 1.3 and lxml 2.0. It returns an iterator in
+ lxml, which diverges from the original ElementTree
+ behaviour. If you want an efficient iterator, use the
+ ``tree.iter()`` method instead. You should only use this
+ method in new code if you require backwards compatibility
+ with older versions of lxml or ElementTree.
+ """
+ root = self.getroot()
+ if root is None:
+ return ITER_EMPTY
+ if tag is not None:
+ tags += (tag,)
+ return root.getiterator(*tags)
+
+ def iter(self, tag=None, *tags):
+ u"""iter(self, tag=None, *tags)
+
+ Creates an iterator for the root element. The iterator loops over
+ all elements in this tree, in document order. Note that siblings
+ of the root element (comments or processing instructions) are not
+ returned by the iterator.
+
+ Can be restricted to find only elements with specific tags,
+ see `_Element.iter`.
+ """
+ root = self.getroot()
+ if root is None:
+ return ITER_EMPTY
+ if tag is not None:
+ tags += (tag,)
+ return root.iter(*tags)
+
+ def find(self, path, namespaces=None):
+ u"""find(self, path, namespaces=None)
+
+ Finds the first top-level element with the given tag. Same as
+ ``tree.getroot().find(path)``.
+
+ The optional ``namespaces`` argument accepts a
+ prefix-to-namespace mapping that allows the usage of XPath
+ prefixes in the path expression.
+ """
+ self._assertHasRoot()
+ root = self.getroot()
+ if _isString(path):
+ if path[:1] == "/":
+ path = "." + path
+ return root.find(path, namespaces)
+
+ def findtext(self, path, default=None, namespaces=None):
+ u"""findtext(self, path, default=None, namespaces=None)
+
+ Finds the text for the first element matching the ElementPath
+ expression. Same as getroot().findtext(path)
+
+ The optional ``namespaces`` argument accepts a
+ prefix-to-namespace mapping that allows the usage of XPath
+ prefixes in the path expression.
+ """
+ self._assertHasRoot()
+ root = self.getroot()
+ if _isString(path):
+ if path[:1] == "/":
+ path = "." + path
+ return root.findtext(path, default, namespaces)
+
+ def findall(self, path, namespaces=None):
+ u"""findall(self, path, namespaces=None)
+
+ Finds all elements matching the ElementPath expression. Same as
+ getroot().findall(path).
+
+ The optional ``namespaces`` argument accepts a
+ prefix-to-namespace mapping that allows the usage of XPath
+ prefixes in the path expression.
+ """
+ self._assertHasRoot()
+ root = self.getroot()
+ if _isString(path):
+ if path[:1] == "/":
+ path = "." + path
+ return root.findall(path, namespaces)
+
+ def iterfind(self, path, namespaces=None):
+ u"""iterfind(self, path, namespaces=None)
+
+ Iterates over all elements matching the ElementPath expression.
+ Same as getroot().iterfind(path).
+
+ The optional ``namespaces`` argument accepts a
+ prefix-to-namespace mapping that allows the usage of XPath
+ prefixes in the path expression.
+ """
+ self._assertHasRoot()
+ root = self.getroot()
+ if _isString(path):
+ if path[:1] == "/":
+ path = "." + path
+ return root.iterfind(path, namespaces)
+
+ def xpath(self, _path, *, namespaces=None, extensions=None,
+ smart_strings=True, **_variables):
+ u"""xpath(self, _path, namespaces=None, extensions=None, smart_strings=True, **_variables)
+
+ XPath evaluate in context of document.
+
+ ``namespaces`` is an optional dictionary with prefix to namespace URI
+ mappings, used by XPath. ``extensions`` defines additional extension
+ functions.
+
+ Returns a list (nodeset), or bool, float or string.
+
+ In case of a list result, return Element for element nodes,
+ string for text and attribute values.
+
+ Note: if you are going to apply multiple XPath expressions
+ against the same document, it is more efficient to use
+ XPathEvaluator directly.
+ """
+ self._assertHasRoot()
+ evaluator = XPathDocumentEvaluator(self, namespaces=namespaces,
+ extensions=extensions,
+ smart_strings=smart_strings)
+ return evaluator(_path, **_variables)
+
+ def xslt(self, _xslt, extensions=None, access_control=None, **_kw):
+ u"""xslt(self, _xslt, extensions=None, access_control=None, **_kw)
+
+ Transform this document using another document.
+
+ The ``_xslt`` argument is a tree that should contain an XSLT
+ stylesheet. Keyword parameters are passed as XSLT transformation
+ parameters.
+
+ Returns the transformed tree.
+
+ Note: if you are going to apply the same XSLT stylesheet against
+ multiple documents, it is more efficient to use the XSLT
+ class directly.
+ """
+ self._assertHasRoot()
+ style = XSLT(_xslt, extensions=extensions,
+ access_control=access_control)
+ return style(self, **_kw)
+
+ def relaxng(self, relaxng):
+ u"""relaxng(self, relaxng)
+
+ Validate this document using another document.
+
+ The relaxng argument is a tree that should contain a Relax NG schema.
+
+ Returns True or False, depending on whether validation
+ succeeded.
+
+ Note: if you are going to apply the same Relax NG schema against
+ multiple documents, it is more efficient to use the RelaxNG
+ class directly.
+ """
+ self._assertHasRoot()
+ schema = RelaxNG(relaxng)
+ return schema.validate(self)
+
+ def xmlschema(self, xmlschema):
+ u"""xmlschema(self, xmlschema)
+
+ Validate this document using another document.
+
+ The xmlschema argument is a tree that should contain an XML Schema.
+
+ Returns True or False, depending on whether validation
+ succeeded.
+
+ Note: If you are going to apply the same XML Schema against
+ multiple documents, it is more efficient to use the XMLSchema
+ class directly.
+ """
+ self._assertHasRoot()
+ schema = XMLSchema(xmlschema)
+ return schema.validate(self)
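+
+ # Illustrative usage sketch (editor note; the file names are placeholders):
+ #
+ #     from lxml import etree
+ #     doc = etree.parse('data.xml')
+ #     schema_doc = etree.parse('schema.xsd')
+ #     doc.xmlschema(schema_doc)                     # True or False
+ #     # Reusable alternative when validating many documents:
+ #     schema = etree.XMLSchema(schema_doc)
+ #     schema.validate(doc)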
+
+ def xinclude(self):
+ u"""xinclude(self)
+
+ Process the XInclude nodes in this document and include the
+ referenced XML fragments.
+
+ There is support for loading files through the file system, HTTP and
+ FTP.
+
+ Note that XInclude does not support custom resolvers in Python space
+ due to restrictions of libxml2 <= 2.6.29.
+ """
+ self._assertHasRoot()
+ XInclude()(self._context_node)
+
+ def write_c14n(self, file, *, bint exclusive=False, bint with_comments=True,
+ compression=0, inclusive_ns_prefixes=None):
+ u"""write_c14n(self, file, exclusive=False, with_comments=True,
+ compression=0, inclusive_ns_prefixes=None)
+
+ C14N write of document. Always writes UTF-8.
+
+ The ``compression`` option enables GZip compression level 1-9.
+
+ The ``inclusive_ns_prefixes`` should be a list of namespace prefix
+ strings (e.g. ``['xs', 'xsi']``) that will be promoted to the top-level
+ element during exclusive C14N serialisation. This parameter is ignored
+ if ``exclusive=False``.
+
+ If exclusive=True and no list is provided, a namespace will only be
+ rendered if it is used by the immediate parent or one of its attributes
+ and its prefix and values have not already been rendered by an ancestor
+ of the namespace node's parent element.
+
+ NOTE: This method is deprecated as of lxml 4.4 and will be removed in a
+ future release. Use ``.write(f, method="c14n")`` instead.
+ """
+ self._assertHasRoot()
+ _assertValidNode(self._context_node)
+ if compression is None or compression < 0:
+ compression = 0
+
+ _tofilelikeC14N(file, self._context_node, exclusive, with_comments,
+ compression, inclusive_ns_prefixes)
+
+cdef _ElementTree _elementTreeFactory(_Document doc, _Element context_node):
+ return _newElementTree(doc, context_node, _ElementTree)
+
+cdef _ElementTree _newElementTree(_Document doc, _Element context_node,
+ object baseclass):
+ cdef _ElementTree result
+ result = baseclass()
+ if context_node is None and doc is not None:
+ context_node = doc.getroot()
+ if context_node is None:
+ _assertValidDoc(doc)
+ result._doc = doc
+ else:
+ _assertValidNode(context_node)
+ result._context_node = context_node
+ return result
+
+
+@cython.final
+@cython.freelist(16)
+cdef class _Attrib:
+ u"""A dict-like proxy for the ``Element.attrib`` property.
+ """
+ cdef _Element _element
+ def __cinit__(self, _Element element not None):
+ _assertValidNode(element)
+ self._element = element
+
+ # MANIPULATORS
+ def __setitem__(self, key, value):
+ _assertValidNode(self._element)
+ _setAttributeValue(self._element, key, value)
+
+ def __delitem__(self, key):
+ _assertValidNode(self._element)
+ _delAttribute(self._element, key)
+
+ def update(self, sequence_or_dict):
+ _assertValidNode(self._element)
+ if isinstance(sequence_or_dict, (dict, _Attrib)):
+ sequence_or_dict = sequence_or_dict.items()
+ for key, value in sequence_or_dict:
+ _setAttributeValue(self._element, key, value)
+
+ def pop(self, key, *default):
+ if len(default) > 1:
+ raise TypeError, f"pop expected at most 2 arguments, got {len(default)+1}"
+ _assertValidNode(self._element)
+ result = _getAttributeValue(self._element, key, None)
+ if result is None:
+ if not default:
+ raise KeyError, key
+ result = default[0]
+ else:
+ _delAttribute(self._element, key)
+ return result
+
+ def clear(self):
+ _assertValidNode(self._element)
+ c_attrs = self._element._c_node.properties
+ if c_attrs:
+ self._element._c_node.properties = NULL
+ tree.xmlFreePropList(c_attrs)
+
+ # ACCESSORS
+ def __repr__(self):
+ _assertValidNode(self._element)
+ return repr(dict( _collectAttributes(self._element._c_node, 3) ))
+
+ def __copy__(self):
+ _assertValidNode(self._element)
+ return dict(_collectAttributes(self._element._c_node, 3))
+
+ def __deepcopy__(self, memo):
+ _assertValidNode(self._element)
+ return dict(_collectAttributes(self._element._c_node, 3))
+
+ def __getitem__(self, key):
+ _assertValidNode(self._element)
+ result = _getAttributeValue(self._element, key, None)
+ if result is None:
+ raise KeyError, key
+ return result
+
+ def __bool__(self):
+ _assertValidNode(self._element)
+ cdef xmlAttr* c_attr = self._element._c_node.properties
+ while c_attr is not NULL:
+ if c_attr.type == tree.XML_ATTRIBUTE_NODE:
+ return 1
+ c_attr = c_attr.next
+ return 0
+
+ def __len__(self):
+ _assertValidNode(self._element)
+ cdef xmlAttr* c_attr = self._element._c_node.properties
+ cdef Py_ssize_t c = 0
+ while c_attr is not NULL:
+ if c_attr.type == tree.XML_ATTRIBUTE_NODE:
+ c += 1
+ c_attr = c_attr.next
+ return c
+
+ def get(self, key, default=None):
+ _assertValidNode(self._element)
+ return _getAttributeValue(self._element, key, default)
+
+ def keys(self):
+ _assertValidNode(self._element)
+ return _collectAttributes(self._element._c_node, 1)
+
+ def __iter__(self):
+ _assertValidNode(self._element)
+ return iter(_collectAttributes(self._element._c_node, 1))
+
+ def iterkeys(self):
+ _assertValidNode(self._element)
+ return iter(_collectAttributes(self._element._c_node, 1))
+
+ def values(self):
+ _assertValidNode(self._element)
+ return _collectAttributes(self._element._c_node, 2)
+
+ def itervalues(self):
+ _assertValidNode(self._element)
+ return iter(_collectAttributes(self._element._c_node, 2))
+
+ def items(self):
+ _assertValidNode(self._element)
+ return _collectAttributes(self._element._c_node, 3)
+
+ def iteritems(self):
+ _assertValidNode(self._element)
+ return iter(_collectAttributes(self._element._c_node, 3))
+
+ def has_key(self, key):
+ _assertValidNode(self._element)
+ return key in self
+
+ def __contains__(self, key):
+ _assertValidNode(self._element)
+ cdef xmlNode* c_node
+ ns, tag = _getNsTag(key)
+ c_node = self._element._c_node
+ c_href = <const_xmlChar*>NULL if ns is None else _xcstr(ns)
+ return 1 if tree.xmlHasNsProp(c_node, _xcstr(tag), c_href) else 0
+
+ def __richcmp__(self, other, int op):
+ try:
+ one = dict(self.items())
+ if not isinstance(other, dict):
+ other = dict(other)
+ except (TypeError, ValueError):
+ return NotImplemented
+ return python.PyObject_RichCompare(one, other, op)
+
+
+@cython.final
+@cython.internal
+cdef class _AttribIterator:
+ u"""Attribute iterator - for internal use only!
+ """
+ # XML attributes must not be removed while running!
+ cdef _Element _node
+ cdef xmlAttr* _c_attr
+ cdef int _keysvalues # 1 - keys, 2 - values, 3 - items (key, value)
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ cdef xmlAttr* c_attr
+ if self._node is None:
+ raise StopIteration
+ c_attr = self._c_attr
+ while c_attr is not NULL and c_attr.type != tree.XML_ATTRIBUTE_NODE:
+ c_attr = c_attr.next
+ if c_attr is NULL:
+ self._node = None
+ raise StopIteration
+
+ self._c_attr = c_attr.next
+ if self._keysvalues == 1:
+ return _namespacedName(<xmlNode*>c_attr)
+ elif self._keysvalues == 2:
+ return _attributeValue(self._node._c_node, c_attr)
+ else:
+ return (_namespacedName(<xmlNode*>c_attr),
+ _attributeValue(self._node._c_node, c_attr))
+
+cdef object _attributeIteratorFactory(_Element element, int keysvalues):
+ cdef _AttribIterator attribs
+ if element._c_node.properties is NULL:
+ return ITER_EMPTY
+ attribs = _AttribIterator()
+ attribs._node = element
+ attribs._c_attr = element._c_node.properties
+ attribs._keysvalues = keysvalues
+ return attribs
+
+
+cdef public class _ElementTagMatcher [ object LxmlElementTagMatcher,
+ type LxmlElementTagMatcherType ]:
+ """
+ Dead but public. :)
+ """
+ cdef object _pystrings
+ cdef int _node_type
+ cdef char* _href
+ cdef char* _name
+ cdef _initTagMatch(self, tag):
+ self._href = NULL
+ self._name = NULL
+ if tag is None:
+ self._node_type = 0
+ elif tag is Comment:
+ self._node_type = tree.XML_COMMENT_NODE
+ elif tag is ProcessingInstruction:
+ self._node_type = tree.XML_PI_NODE
+ elif tag is Entity:
+ self._node_type = tree.XML_ENTITY_REF_NODE
+ elif tag is Element:
+ self._node_type = tree.XML_ELEMENT_NODE
+ else:
+ self._node_type = tree.XML_ELEMENT_NODE
+ self._pystrings = _getNsTag(tag)
+ if self._pystrings[0] is not None:
+ self._href = _cstr(self._pystrings[0])
+ self._name = _cstr(self._pystrings[1])
+ if self._name[0] == c'*' and self._name[1] == c'\0':
+ self._name = NULL
+
+cdef public class _ElementIterator(_ElementTagMatcher) [
+ object LxmlElementIterator, type LxmlElementIteratorType ]:
+ """
+ Dead but public. :)
+ """
+ # we keep Python references here to control GC
+ cdef _Element _node
+ cdef _node_to_node_function _next_element
+ def __iter__(self):
+ return self
+
+ cdef void _storeNext(self, _Element node):
+ cdef xmlNode* c_node
+ c_node = self._next_element(node._c_node)
+ while c_node is not NULL and \
+ self._node_type != 0 and \
+ (<tree.xmlElementType>self._node_type != c_node.type or
+ not _tagMatches(c_node, <const_xmlChar*>self._href, <const_xmlChar*>self._name)):
+ c_node = self._next_element(c_node)
+ if c_node is NULL:
+ self._node = None
+ else:
+ # Python ref:
+ self._node = _elementFactory(node._doc, c_node)
+
+ def __next__(self):
+ cdef xmlNode* c_node
+ cdef _Element current_node
+ if self._node is None:
+ raise StopIteration
+ # Python ref:
+ current_node = self._node
+ self._storeNext(current_node)
+ return current_node
+
+@cython.final
+@cython.internal
+cdef class _MultiTagMatcher:
+ """
+ Match an xmlNode against a list of tags.
+ """
+ cdef list _py_tags
+ cdef qname* _cached_tags
+ cdef size_t _tag_count
+ cdef size_t _cached_size
+ cdef _Document _cached_doc
+ cdef int _node_types
+
+ def __cinit__(self, tags):
+ self._py_tags = []
+ self.initTagMatch(tags)
+
+ def __dealloc__(self):
+ self._clear()
+
+ cdef bint rejectsAll(self):
+ return not self._tag_count and not self._node_types
+
+ cdef bint rejectsAllAttributes(self):
+ return not self._tag_count
+
+ cdef bint matchesType(self, int node_type):
+ if node_type == tree.XML_ELEMENT_NODE and self._tag_count:
+ return True
+ return self._node_types & (1 << node_type)
+
+ cdef void _clear(self):
+ cdef size_t i, count
+ count = self._tag_count
+ self._tag_count = 0
+ if self._cached_tags:
+ for i in xrange(count):
+ cpython.ref.Py_XDECREF(self._cached_tags[i].href)
+ python.lxml_free(self._cached_tags)
+ self._cached_tags = NULL
+
+ cdef initTagMatch(self, tags):
+ self._cached_doc = None
+ del self._py_tags[:]
+ self._clear()
+ if tags is None or tags == ():
+ # no selection in tags argument => match anything
+ self._node_types = (
+ 1 << tree.XML_COMMENT_NODE |
+ 1 << tree.XML_PI_NODE |
+ 1 << tree.XML_ENTITY_REF_NODE |
+ 1 << tree.XML_ELEMENT_NODE)
+ else:
+ self._node_types = 0
+ self._storeTags(tags, set())
+
+ cdef _storeTags(self, tag, set seen):
+ if tag is Comment:
+ self._node_types |= 1 << tree.XML_COMMENT_NODE
+ elif tag is ProcessingInstruction:
+ self._node_types |= 1 << tree.XML_PI_NODE
+ elif tag is Entity:
+ self._node_types |= 1 << tree.XML_ENTITY_REF_NODE
+ elif tag is Element:
+ self._node_types |= 1 << tree.XML_ELEMENT_NODE
+ elif python._isString(tag):
+ if tag in seen:
+ return
+ seen.add(tag)
+ if tag in ('*', '{*}*'):
+ self._node_types |= 1 << tree.XML_ELEMENT_NODE
+ else:
+ href, name = _getNsTag(tag)
+ if name == b'*':
+ name = None
+ if href is None:
+ href = b'' # no namespace
+ elif href == b'*':
+ href = None # wildcard: any namespace, including none
+ self._py_tags.append((href, name))
+ elif isinstance(tag, QName):
+ self._storeTags(tag.text, seen)
+ else:
+ # support a sequence of tags
+ for item in tag:
+ self._storeTags(item, seen)
+
+ cdef inline int cacheTags(self, _Document doc, bint force_into_dict=False) except -1:
+ """
+ Look up the tag names in the doc dict to enable string pointer comparisons.
+ """
+ cdef size_t dict_size = tree.xmlDictSize(doc._c_doc.dict)
+ if doc is self._cached_doc and dict_size == self._cached_size:
+ # doc and dict didn't change => names already cached
+ return 0
+ self._tag_count = 0
+ if not self._py_tags:
+ self._cached_doc = doc
+ self._cached_size = dict_size
+ return 0
+ if not self._cached_tags:
+ self._cached_tags = <qname*>python.lxml_malloc(len(self._py_tags), sizeof(qname))
+ if not self._cached_tags:
+ self._cached_doc = None
+ raise MemoryError()
+ self._tag_count = <size_t>_mapTagsToQnameMatchArray(
+ doc._c_doc, self._py_tags, self._cached_tags, force_into_dict)
+ self._cached_doc = doc
+ self._cached_size = dict_size
+ return 0
+
+ cdef inline bint matches(self, xmlNode* c_node):
+ cdef qname* c_qname
+ if self._node_types & (1 << c_node.type):
+ return True
+ elif c_node.type == tree.XML_ELEMENT_NODE:
+ for c_qname in self._cached_tags[:self._tag_count]:
+ if _tagMatchesExactly(c_node, c_qname):
+ return True
+ return False
+
+ cdef inline bint matchesNsTag(self, const_xmlChar* c_href,
+ const_xmlChar* c_name):
+ cdef qname* c_qname
+ if self._node_types & (1 << tree.XML_ELEMENT_NODE):
+ return True
+ for c_qname in self._cached_tags[:self._tag_count]:
+ if _nsTagMatchesExactly(c_href, c_name, c_qname):
+ return True
+ return False
+
+ cdef inline bint matchesAttribute(self, xmlAttr* c_attr):
+ """Attribute matches differ from Element matches in that they do
+ not care about node types.
+ """
+ cdef qname* c_qname
+ for c_qname in self._cached_tags[:self._tag_count]:
+ if _tagMatchesExactly(<xmlNode*>c_attr, c_qname):
+ return True
+ return False
+
+cdef class _ElementMatchIterator:
+ cdef _Element _node
+ cdef _node_to_node_function _next_element
+ cdef _MultiTagMatcher _matcher
+
+ @cython.final
+ cdef _initTagMatcher(self, tags):
+ self._matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tags)
+
+ def __iter__(self):
+ return self
+
+ @cython.final
+ cdef int _storeNext(self, _Element node) except -1:
+ self._matcher.cacheTags(node._doc)
+ c_node = self._next_element(node._c_node)
+ while c_node is not NULL and not self._matcher.matches(c_node):
+ c_node = self._next_element(c_node)
+ # store Python ref to next node to make sure it's kept alive
+ self._node = _elementFactory(node._doc, c_node) if c_node is not NULL else None
+ return 0
+
+ def __next__(self):
+ cdef _Element current_node = self._node
+ if current_node is None:
+ raise StopIteration
+ self._storeNext(current_node)
+ return current_node
+
+cdef class ElementChildIterator(_ElementMatchIterator):
+ u"""ElementChildIterator(self, node, tag=None, reversed=False)
+ Iterates over the children of an element.
+ """
+ def __cinit__(self, _Element node not None, tag=None, *, bint reversed=False):
+ cdef xmlNode* c_node
+ _assertValidNode(node)
+ self._initTagMatcher(tag)
+ if reversed:
+ c_node = _findChildBackwards(node._c_node, 0)
+ self._next_element = _previousElement
+ else:
+ c_node = _findChildForwards(node._c_node, 0)
+ self._next_element = _nextElement
+ self._matcher.cacheTags(node._doc)
+ while c_node is not NULL and not self._matcher.matches(c_node):
+ c_node = self._next_element(c_node)
+ # store Python ref to next node to make sure it's kept alive
+ self._node = _elementFactory(node._doc, c_node) if c_node is not NULL else None
+
+cdef class SiblingsIterator(_ElementMatchIterator):
+ u"""SiblingsIterator(self, node, tag=None, preceding=False)
+ Iterates over the siblings of an element.
+
+ You can pass the boolean keyword argument ``preceding`` to specify the direction.
+ """
+ def __cinit__(self, _Element node not None, tag=None, *, bint preceding=False):
+ _assertValidNode(node)
+ self._initTagMatcher(tag)
+ if preceding:
+ self._next_element = _previousElement
+ else:
+ self._next_element = _nextElement
+ self._storeNext(node)
+
+cdef class AncestorsIterator(_ElementMatchIterator):
+ u"""AncestorsIterator(self, node, tag=None)
+ Iterates over the ancestors of an element (from parent to parent).
+ """
+ def __cinit__(self, _Element node not None, tag=None):
+ _assertValidNode(node)
+ self._initTagMatcher(tag)
+ self._next_element = _parentElement
+ self._storeNext(node)
+
+cdef class ElementDepthFirstIterator:
+ u"""ElementDepthFirstIterator(self, node, tag=None, inclusive=True)
+ Iterates over an element and its sub-elements in document order (depth
+ first pre-order).
+
+ Note that this also includes comments, entities and processing
+ instructions. To filter them out, check if the ``tag`` property
+ of the returned element is a string (i.e. not None and not a
+ factory function), or pass the ``Element`` factory for the ``tag``
+ argument to receive only Elements.
+
+ If the optional ``tag`` argument is not None, the iterator returns only
+ the elements that match the respective name and namespace.
+
+ The optional boolean argument 'inclusive' defaults to True and can be set
+ to False to exclude the start element itself.
+
+ Note that the behaviour of this iterator is completely undefined if the
+ tree it traverses is modified during iteration.
+ """
+ # we keep Python references here to control GC
+ # keep the next Element after the one we return, and the (s)top node
+ cdef _Element _next_node
+ cdef _Element _top_node
+ cdef _MultiTagMatcher _matcher
+ def __cinit__(self, _Element node not None, tag=None, *, bint inclusive=True):
+ _assertValidNode(node)
+ self._top_node = node
+ self._next_node = node
+ self._matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tag)
+ self._matcher.cacheTags(node._doc)
+ if not inclusive or not self._matcher.matches(node._c_node):
+ # find start node (this cannot raise StopIteration, self._next_node != None)
+ next(self)
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ cdef xmlNode* c_node
+ cdef _Element current_node = self._next_node
+ if current_node is None:
+ raise StopIteration
+ c_node = current_node._c_node
+ self._matcher.cacheTags(current_node._doc)
+ if not self._matcher._tag_count:
+ # no tag name was found in the dict => not in document either
+ # try to match by node type
+ c_node = self._nextNodeAnyTag(c_node)
+ else:
+ c_node = self._nextNodeMatchTag(c_node)
+ if c_node is NULL:
+ self._next_node = None
+ else:
+ self._next_node = _elementFactory(current_node._doc, c_node)
+ return current_node
+
+ @cython.final
+ cdef xmlNode* _nextNodeAnyTag(self, xmlNode* c_node):
+ cdef int node_types = self._matcher._node_types
+ if not node_types:
+ return NULL
+ tree.BEGIN_FOR_EACH_ELEMENT_FROM(self._top_node._c_node, c_node, 0)
+ if node_types & (1 << c_node.type):
+ return c_node
+ tree.END_FOR_EACH_ELEMENT_FROM(c_node)
+ return NULL
+
+ @cython.final
+ cdef xmlNode* _nextNodeMatchTag(self, xmlNode* c_node):
+ tree.BEGIN_FOR_EACH_ELEMENT_FROM(self._top_node._c_node, c_node, 0)
+ if self._matcher.matches(c_node):
+ return c_node
+ tree.END_FOR_EACH_ELEMENT_FROM(c_node)
+ return NULL
+
+cdef class ElementTextIterator:
+ u"""ElementTextIterator(self, element, tag=None, with_tail=True)
+ Iterates over the text content of a subtree.
+
+ You can pass the ``tag`` keyword argument to restrict text content to a
+ specific tag name.
+
+ You can set the ``with_tail`` keyword argument to ``False`` to skip over
+ tail text (e.g. if you know that it's only whitespace from pretty-printing).
+ """
+ cdef object _events
+ cdef _Element _start_element
+ def __cinit__(self, _Element element not None, tag=None, *, bint with_tail=True):
+ _assertValidNode(element)
+ if with_tail:
+ events = (u"start", u"comment", u"pi", u"end")
+ else:
+ events = (u"start", u"comment", u"pi")
+ self._start_element = element
+ self._events = iterwalk(element, events=events, tag=tag)
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ cdef _Element element
+ result = None
+ while result is None:
+ event, element = next(self._events) # raises StopIteration
+ if event == u"start":
+ result = element.text
+ elif element is not self._start_element:
+ result = element.tail
+ return result
+
+cdef xmlNode* _createElement(xmlDoc* c_doc, object name_utf) except NULL:
+ cdef xmlNode* c_node
+ c_node = tree.xmlNewDocNode(c_doc, NULL, _xcstr(name_utf), NULL)
+ return c_node
+
+cdef xmlNode* _createComment(xmlDoc* c_doc, const_xmlChar* text):
+ cdef xmlNode* c_node
+ c_node = tree.xmlNewDocComment(c_doc, text)
+ return c_node
+
+cdef xmlNode* _createPI(xmlDoc* c_doc, const_xmlChar* target, const_xmlChar* text):
+ cdef xmlNode* c_node
+ c_node = tree.xmlNewDocPI(c_doc, target, text)
+ return c_node
+
+cdef xmlNode* _createEntity(xmlDoc* c_doc, const_xmlChar* name):
+ cdef xmlNode* c_node
+ c_node = tree.xmlNewReference(c_doc, name)
+ return c_node
+
+# module-level API for ElementTree
+
+def Element(_tag, attrib=None, nsmap=None, **_extra):
+ u"""Element(_tag, attrib=None, nsmap=None, **_extra)
+
+ Element factory. This function returns an object implementing the
+ Element interface.
+
+ Also look at the `_Element.makeelement()` and
+ `_BaseParser.makeelement()` methods, which provide a faster way to
+ create an Element within a specific document or parser context.
+ """
+ return _makeElement(_tag, NULL, None, None, None, None,
+ attrib, nsmap, _extra)
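+
+ # Illustrative usage sketch (editor note; the namespace URI is a made-up example):
+ #
+ #     from lxml import etree
+ #     root = etree.Element('{http://example.org/ns}root',
+ #                          nsmap={'p': 'http://example.org/ns'}, id='r1')
+ #     root.tag                                      # '{http://example.org/ns}root'
+ #     root.get('id')                                # 'r1'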
+
+
+def Comment(text=None):
+ u"""Comment(text=None)
+
+ Comment element factory. This factory function creates a special element that will
+ be serialized as an XML comment.
+ """
+ cdef _Document doc
+ cdef xmlNode* c_node
+ cdef xmlDoc* c_doc
+
+ if text is None:
+ text = b''
+ else:
+ text = _utf8(text)
+ if b'--' in text or text.endswith(b'-'):
+ raise ValueError("Comment may not contain '--' or end with '-'")
+
+ c_doc = _newXMLDoc()
+ doc = _documentFactory(c_doc, None)
+ c_node = _createComment(c_doc, _xcstr(text))
+ tree.xmlAddChild(<xmlNode*>c_doc, c_node)
+ return _elementFactory(doc, c_node)
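+
+ # Illustrative usage sketch (editor note, not part of the module code):
+ #
+ #     from lxml import etree
+ #     root = etree.Element('root')
+ #     root.append(etree.Comment('generated content'))
+ #     etree.tostring(root)    # b'<root><!--generated content--></root>'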
+
+
+def ProcessingInstruction(target, text=None):
+ u"""ProcessingInstruction(target, text=None)
+
+ ProcessingInstruction element factory. This factory function creates a
+ special element that will be serialized as an XML processing instruction.
+ """
+ cdef _Document doc
+ cdef xmlNode* c_node
+ cdef xmlDoc* c_doc
+
+ target = _utf8(target)
+ _tagValidOrRaise(target)
+ if target.lower() == b'xml':
+ raise ValueError, f"Invalid PI name '{target}'"
+
+ if text is None:
+ text = b''
+ else:
+ text = _utf8(text)
+ if b'?>' in text:
+ raise ValueError, "PI text must not contain '?>'"
+
+ c_doc = _newXMLDoc()
+ doc = _documentFactory(c_doc, None)
+ c_node = _createPI(c_doc, _xcstr(target), _xcstr(text))
+ tree.xmlAddChild(<xmlNode*>c_doc, c_node)
+ return _elementFactory(doc, c_node)
+
+PI = ProcessingInstruction
+
+
+cdef class CDATA:
+ u"""CDATA(data)
+
+ CDATA factory. This factory creates an opaque data object that
+ can be used to set Element text. The usual way to use it is::
+
+ >>> el = Element('content')
+ >>> el.text = CDATA('a string')
+
+ >>> print(el.text)
+ a string
+ >>> print(tostring(el, encoding="unicode"))
+ <content><![CDATA[a string]]></content>
+ """
+ cdef bytes _utf8_data
+ def __cinit__(self, data):
+ _utf8_data = _utf8(data)
+ if b']]>' in _utf8_data:
+ raise ValueError, "']]>' not allowed inside CDATA"
+ self._utf8_data = _utf8_data
+
+
+def Entity(name):
+ u"""Entity(name)
+
+ Entity factory. This factory function creates a special element
+ that will be serialized as an XML entity reference or character
+ reference. Note, however, that entities will not be automatically
+ declared in the document. A document that uses entity references
+ requires a DTD to define the entities.
+ """
+ cdef _Document doc
+ cdef xmlNode* c_node
+ cdef xmlDoc* c_doc
+ name_utf = _utf8(name)
+ c_name = _xcstr(name_utf)
+ if c_name[0] == c'#':
+ if not _characterReferenceIsValid(c_name + 1):
+ raise ValueError, f"Invalid character reference: '{name}'"
+ elif not _xmlNameIsValid(c_name):
+ raise ValueError, f"Invalid entity reference: '{name}'"
+ c_doc = _newXMLDoc()
+ doc = _documentFactory(c_doc, None)
+ c_node = _createEntity(c_doc, c_name)
+ tree.xmlAddChild(<xmlNode*>c_doc, c_node)
+ return _elementFactory(doc, c_node)
+
+
+def SubElement(_Element _parent not None, _tag,
+ attrib=None, nsmap=None, **_extra):
+ u"""SubElement(_parent, _tag, attrib=None, nsmap=None, **_extra)
+
+ Subelement factory. This function creates an element instance, and
+ appends it to an existing element.
+ """
+ return _makeSubElement(_parent, _tag, None, None, attrib, nsmap, _extra)
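+
+ # Illustrative usage sketch (editor note, not part of the module code):
+ #
+ #     from lxml import etree
+ #     root = etree.Element('root')
+ #     child = etree.SubElement(root, 'child', attrib={'n': '1'})
+ #     child.text = 'hello'
+ #     etree.tostring(root)    # b'<root><child n="1">hello</child></root>'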
+
+
+def ElementTree(_Element element=None, *, file=None, _BaseParser parser=None):
+ u"""ElementTree(element=None, file=None, parser=None)
+
+ ElementTree wrapper class.
+ """
+ cdef xmlNode* c_next
+ cdef xmlNode* c_node
+ cdef xmlNode* c_node_copy
+ cdef xmlDoc* c_doc
+ cdef _ElementTree etree
+ cdef _Document doc
+
+ if element is not None:
+ doc = element._doc
+ elif file is not None:
+ try:
+ doc = _parseDocument(file, parser, None)
+ except _TargetParserResult as result_container:
+ return result_container.result
+ else:
+ c_doc = _newXMLDoc()
+ doc = _documentFactory(c_doc, parser)
+
+ return _elementTreeFactory(doc, element)
+
+
+def HTML(text, _BaseParser parser=None, *, base_url=None):
+ u"""HTML(text, parser=None, base_url=None)
+
+ Parses an HTML document from a string constant. Returns the root
+ node (or the result returned by a parser target). This function
+ can be used to embed "HTML literals" in Python code.
+
+ To override the parser with a different ``HTMLParser`` you can pass it to
+ the ``parser`` keyword argument.
+
+ The ``base_url`` keyword argument allows setting the original base URL of
+ the document to support relative paths when looking up external entities
+ (DTD, XInclude, ...).
+ """
+ cdef _Document doc
+ if parser is None:
+ parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
+ if not isinstance(parser, HTMLParser):
+ parser = __DEFAULT_HTML_PARSER
+ try:
+ doc = _parseMemoryDocument(text, base_url, parser)
+ return doc.getroot()
+ except _TargetParserResult as result_container:
+ return result_container.result
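+
+ # Illustrative usage sketch (editor note; exact serialisation may vary slightly
+ # with the libxml2 version):
+ #
+ #     from lxml import etree
+ #     root = etree.HTML('<p>Hello<br>world')
+ #     root.tag                                      # 'html'
+ #     etree.tostring(root, method='html')
+ #     # b'<html><body><p>Hello<br>world</p></body></html>'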
+
+
+def XML(text, _BaseParser parser=None, *, base_url=None):
+ u"""XML(text, parser=None, base_url=None)
+
+ Parses an XML document or fragment from a string constant.
+ Returns the root node (or the result returned by a parser target).
+ This function can be used to embed "XML literals" in Python code,
+ like in
+
+ >>> root = XML("<root><test/></root>")
+ >>> print(root.tag)
+ root
+
+ To override the parser with a different ``XMLParser`` you can pass it to
+ the ``parser`` keyword argument.
+
+ The ``base_url`` keyword argument allows setting the original base URL of
+ the document to support relative paths when looking up external entities
+ (DTD, XInclude, ...).
+ """
+ cdef _Document doc
+ if parser is None:
+ parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
+ if not isinstance(parser, XMLParser):
+ parser = __DEFAULT_XML_PARSER
+ try:
+ doc = _parseMemoryDocument(text, base_url, parser)
+ return doc.getroot()
+ except _TargetParserResult as result_container:
+ return result_container.result
+
+
+def fromstring(text, _BaseParser parser=None, *, base_url=None):
+ u"""fromstring(text, parser=None, base_url=None)
+
+ Parses an XML document or fragment from a string. Returns the
+ root node (or the result returned by a parser target).
+
+ To override the default parser with a different parser you can pass it to
+ the ``parser`` keyword argument.
+
+ The ``base_url`` keyword argument allows setting the original base URL of
+ the document to support relative paths when looking up external entities
+ (DTD, XInclude, ...).
+ """
+ cdef _Document doc
+ try:
+ doc = _parseMemoryDocument(text, base_url, parser)
+ return doc.getroot()
+ except _TargetParserResult as result_container:
+ return result_container.result
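+
+ # Illustrative usage sketch (editor note, not part of the module code):
+ #
+ #     from lxml import etree
+ #     root = etree.fromstring(b'<root><a/></root>')
+ #     root.tag                                      # 'root'
+ #     etree.tostring(root)                          # b'<root><a/></root>'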
+
+
+def fromstringlist(strings, _BaseParser parser=None):
+ u"""fromstringlist(strings, parser=None)
+
+ Parses an XML document from a sequence of strings. Returns the
+ root node (or the result returned by a parser target).
+
+ To override the default parser with a different parser you can pass it to
+ the ``parser`` keyword argument.
+ """
+ cdef _Document doc
+ if isinstance(strings, (bytes, unicode)):
+ raise ValueError("passing a single string into fromstringlist() is not"
+ " efficient, use fromstring() instead")
+ if parser is None:
+ parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
+ feed = parser.feed
+ for data in strings:
+ feed(data)
+ return parser.close()
+
+
+def iselement(element):
+ u"""iselement(element)
+
+ Checks if an object appears to be a valid element object.
+ """
+ return isinstance(element, _Element) and (<_Element>element)._c_node is not NULL
+
+
+def indent(tree, space=" ", *, Py_ssize_t level=0):
+ """indent(tree, space=" ", level=0)
+
+ Indent an XML document by inserting newlines and indentation space
+ after elements.
+
+ *tree* is the ElementTree or Element to modify. The (root) element
+ itself will not be changed, but the tail text of all elements in its
+ subtree will be adapted.
+
+ *space* is the whitespace to insert for each indentation level, two
+ space characters by default.
+
+ *level* is the initial indentation level. Setting this to a higher
+ value than 0 can be used for indenting subtrees that are more deeply
+ nested inside of a document.
+ """
+ root = _rootNodeOrRaise(tree)
+ if level < 0:
+ raise ValueError(f"Initial indentation level must be >= 0, got {level}")
+ if _hasChild(root._c_node):
+ space = _utf8(space)
+ indent = b"\n" + level * space
+ _indent_children(root._c_node, 1, space, [indent, indent + space])
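+
+ # Illustrative usage sketch (editor note, not part of the module code):
+ #
+ #     from lxml import etree
+ #     root = etree.fromstring('<root><a><b/></a></root>')
+ #     etree.indent(root, space='  ')
+ #     print(etree.tostring(root, encoding='unicode'))
+ #     # <root>
+ #     #   <a>
+ #     #     <b/>
+ #     #   </a>
+ #     # </root>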
+
+
+cdef int _indent_children(xmlNode* c_node, Py_ssize_t level, bytes one_space, list indentations) except -1:
+ # Reuse indentation strings for speed.
+ if len(indentations) <= level:
+ indentations.append(indentations[-1] + one_space)
+
+ # Start a new indentation level for the first child.
+ child_indentation = indentations[level]
+ if not _hasNonWhitespaceText(c_node):
+ _setNodeText(c_node, child_indentation)
+
+ # Recursively indent all children.
+ cdef xmlNode* c_child = _findChildForwards(c_node, 0)
+ while c_child is not NULL:
+ if _hasChild(c_child):
+ _indent_children(c_child, level+1, one_space, indentations)
+ c_next_child = _nextElement(c_child)
+ if not _hasNonWhitespaceTail(c_child):
+ if c_next_child is NULL:
+ # Dedent after the last child.
+ child_indentation = indentations[level-1]
+ _setTailText(c_child, child_indentation)
+ c_child = c_next_child
+ return 0
+
+
+def dump(_Element elem not None, *, bint pretty_print=True, with_tail=True):
+ u"""dump(elem, pretty_print=True, with_tail=True)
+
+ Writes an element tree or element structure to sys.stdout. This function
+ should be used for debugging only.
+ """
+ xml = tostring(elem, pretty_print=pretty_print, with_tail=with_tail,
+ encoding=None if python.IS_PYTHON2 else 'unicode')
+ if not pretty_print:
+ xml += '\n'
+ sys.stdout.write(xml)
+
+
+def tostring(element_or_tree, *, encoding=None, method="xml",
+ xml_declaration=None, bint pretty_print=False, bint with_tail=True,
+ standalone=None, doctype=None,
+ # method='c14n'
+ bint exclusive=False, inclusive_ns_prefixes=None,
+ # method='c14n2'
+ bint with_comments=True, bint strip_text=False,
+ ):
+ u"""tostring(element_or_tree, encoding=None, method="xml",
+ xml_declaration=None, pretty_print=False, with_tail=True,
+ standalone=None, doctype=None,
+ exclusive=False, inclusive_ns_prefixes=None,
+ with_comments=True, strip_text=False,
+ )
+
+ Serialize an element to an encoded string representation of its XML
+ tree.
+
+ Defaults to ASCII encoding without XML declaration. This
+ behaviour can be configured with the keyword arguments 'encoding'
+ (string) and 'xml_declaration' (bool). Note that changing the
+ encoding to a non UTF-8 compatible encoding will enable a
+ declaration by default.
+
+ You can also serialise to a Unicode string without declaration by
+ passing the name ``'unicode'`` as encoding (or the ``str`` function
+ in Py3 or ``unicode`` in Py2). This changes the return value from
+ a byte string to an unencoded unicode string.
+
+ The keyword argument 'pretty_print' (bool) enables formatted XML.
+
+ The keyword argument 'method' selects the output method: 'xml',
+ 'html', plain 'text' (text content without tags), 'c14n' or 'c14n2'.
+ Default is 'xml'.
+
+ With ``method="c14n"`` (C14N version 1), the options ``exclusive``,
+ ``with_comments`` and ``inclusive_ns_prefixes`` request exclusive
+ C14N, include comments, and list the inclusive prefixes respectively.
+
+ With ``method="c14n2"`` (C14N version 2), the ``with_comments`` and
+ ``strip_text`` options control the output of comments and text space
+ according to C14N 2.0.
+
+ Passing a boolean value to the ``standalone`` option will output
+ an XML declaration with the corresponding ``standalone`` flag.
+
+ The ``doctype`` option allows passing in a plain string that will
+ be serialised before the XML tree. Note that passing in
+ non-well-formed content here will make the XML output non-well-formed.
+ Also, an existing doctype in the document tree will not be removed
+ when serialising an ElementTree instance.
+
+ You can prevent the tail text of the element from being serialised
+ by passing the boolean ``with_tail`` option. This has no impact
+ on the tail text of children, which will always be serialised.
+ """
+ cdef bint write_declaration
+ cdef int is_standalone
+ # C14N serialisation
+ if method in ('c14n', 'c14n2'):
+ if encoding is not None:
+ raise ValueError("Cannot specify encoding with C14N")
+ if xml_declaration:
+ raise ValueError("Cannot enable XML declaration in C14N")
+ if method == 'c14n':
+ return _tostringC14N(element_or_tree, exclusive, with_comments, inclusive_ns_prefixes)
+ else:
+ out = BytesIO()
+ target = C14NWriterTarget(
+ utf8_writer(out).write,
+ with_comments=with_comments, strip_text=strip_text)
+ _tree_to_target(element_or_tree, target)
+ return out.getvalue()
+ if not with_comments:
+ raise ValueError("Can only discard comments in C14N serialisation")
+ if strip_text:
+ raise ValueError("Can only strip text in C14N 2.0 serialisation")
+ if encoding is unicode or (encoding is not None and encoding.lower() == 'unicode'):
+ if xml_declaration:
+ raise ValueError, \
+ u"Serialisation to unicode must not request an XML declaration"
+ write_declaration = 0
+ encoding = unicode
+ elif xml_declaration is None:
+ # by default, write an XML declaration only for non-standard encodings
+ write_declaration = encoding is not None and encoding.upper() not in \
+ (u'ASCII', u'UTF-8', u'UTF8', u'US-ASCII')
+ else:
+ write_declaration = xml_declaration
+ if encoding is None:
+ encoding = u'ASCII'
+ if standalone is None:
+ is_standalone = -1
+ elif standalone:
+ write_declaration = 1
+ is_standalone = 1
+ else:
+ write_declaration = 1
+ is_standalone = 0
+
+ if isinstance(element_or_tree, _Element):
+ return _tostring(<_Element>element_or_tree, encoding, doctype, method,
+ write_declaration, 0, pretty_print, with_tail,
+ is_standalone)
+ elif isinstance(element_or_tree, _ElementTree):
+ return _tostring((<_ElementTree>element_or_tree)._context_node,
+ encoding, doctype, method, write_declaration, 1,
+ pretty_print, with_tail, is_standalone)
+ else:
+ raise TypeError, f"Type '{python._fqtypename(element_or_tree).decode('utf8')}' cannot be serialized."
+
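A minimal usage sketch of the ``tostring()`` behaviour described above, using invented element content::

    from lxml import etree

    root = etree.XML('<root><child>text</child></root>')
    etree.tostring(root)                        # b'<root><child>text</child></root>'
    etree.tostring(root, encoding='unicode')    # plain str, no declaration
    etree.tostring(root, pretty_print=True)     # indented byte string
    etree.tostring(root, encoding='UTF-8',
                   xml_declaration=True)        # prepends an XML declaration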
+
+
+def tostringlist(element_or_tree, *args, **kwargs):
+ u"""tostringlist(element_or_tree, *args, **kwargs)
+
+ Serialize an element to an encoded string representation of its XML
+ tree, stored in a list of partial strings.
+
+ This is purely for ElementTree 1.3 compatibility. The result is a
+ single string wrapped in a list.
+ """
+ return [tostring(element_or_tree, *args, **kwargs)]
+
+
+def tounicode(element_or_tree, *, method=u"xml", bint pretty_print=False,
+ bint with_tail=True, doctype=None):
+ u"""tounicode(element_or_tree, method="xml", pretty_print=False,
+ with_tail=True, doctype=None)
+
+ Serialize an element to the Python unicode representation of its XML
+ tree.
+
+ :deprecated: use ``tostring(el, encoding='unicode')`` instead.
+
+ Note that the result does not carry an XML encoding declaration and is
+ therefore not necessarily suited for serialization to byte streams without
+ further treatment.
+
+ The boolean keyword argument 'pretty_print' enables formatted XML.
+
+ The keyword argument 'method' selects the output method: 'xml',
+ 'html' or plain 'text'.
+
+ You can prevent the tail text of the element from being serialised
+ by passing ``with_tail=False``. This has no impact
+ on the tail text of children, which will always be serialised.
+ """
+ if isinstance(element_or_tree, _Element):
+ return _tostring(<_Element>element_or_tree, unicode, doctype, method,
+ 0, 0, pretty_print, with_tail, -1)
+ elif isinstance(element_or_tree, _ElementTree):
+ return _tostring((<_ElementTree>element_or_tree)._context_node,
+ unicode, doctype, method, 0, 1, pretty_print,
+ with_tail, -1)
+ else:
+ raise TypeError, f"Type '{type(element_or_tree)}' cannot be serialized."
+
+
+def parse(source, _BaseParser parser=None, *, base_url=None):
+ u"""parse(source, parser=None, base_url=None)
+
+ Return an ElementTree object loaded with source elements. If no parser
+ is provided as second argument, the default parser is used.
+
+ The ``source`` can be any of the following:
+
+ - a file name/path
+ - a file object
+ - a file-like object
+ - a URL using the HTTP or FTP protocol
+
+ To parse from a string, use the ``fromstring()`` function instead.
+
+ Note that it is generally faster to parse from a file path or URL
+ than from an open file object or file-like object. Transparent
+ decompression from gzip compressed sources is supported (unless
+ explicitly disabled in libxml2).
+
+ The ``base_url`` keyword allows setting a URL for the document
+ when parsing from a file-like object. This is needed when looking
+ up external entities (DTD, XInclude, ...) with relative paths.
+ """
+ cdef _Document doc
+ try:
+ doc = _parseDocument(source, parser, base_url)
+ return _elementTreeFactory(doc, None)
+ except _TargetParserResult as result_container:
+ return result_container.result
+
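A short sketch of typical ``parse()`` usage with a configured parser; the file name and URL are illustrative only::

    from lxml import etree

    parser = etree.XMLParser(remove_blank_text=True)
    tree = etree.parse('data.xml', parser)   # file path, URL or open file
    root = tree.getroot()

    # an explicit base URL helps relative DTD/XInclude lookups when
    # parsing from a file-like object
    with open('data.xml', 'rb') as f:
        tree = etree.parse(f, parser, base_url='http://example.com/data.xml')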
+
+def adopt_external_document(capsule, _BaseParser parser=None):
+ """adopt_external_document(capsule, parser=None)
+
+ Unpack a libxml2 document pointer from a PyCapsule and wrap it in an
+ lxml ElementTree object.
+
+ This allows external libraries to build XML/HTML trees using libxml2
+ and then pass them efficiently into lxml for further processing.
+
+ If a ``parser`` is provided, it will be used for configuring the
+ lxml document. No parsing will be done.
+
+ The capsule must have the name ``"libxml2:xmlDoc"`` and its pointer
+ value must reference a correct libxml2 document of type ``xmlDoc*``.
+ The creator of the capsule must take care to correctly clean up the
+ document using an appropriate capsule destructor. By default, the
+ libxml2 document will be copied to let lxml safely own the memory
+ of the internal tree that it uses.
+
+ If the capsule context is non-NULL, it must point to a C string that
+ can be compared using ``strcmp()``. If the context string equals
+ ``"destructor:xmlFreeDoc"``, the libxml2 document will not be copied
+ but the capsule invalidated instead by clearing its destructor and
+ name. That way, lxml takes ownership of the libxml2 document in memory
+ without creating a copy first, and the capsule destructor will not be
+ called. The document will then eventually be cleaned up by lxml using
+ the libxml2 API function ``xmlFreeDoc()`` once it is no longer used.
+
+ If no copy is made, later modifications of the tree outside of lxml
+ should not be attempted after transferring the ownership.
+ """
+ cdef xmlDoc* c_doc
+ cdef bint is_owned = False
+ c_doc = <xmlDoc*> python.lxml_unpack_xmldoc_capsule(capsule, &is_owned)
+ doc = _adoptForeignDoc(c_doc, parser, is_owned)
+ return _elementTreeFactory(doc, None)
+
+
+################################################################################
+# Include submodules
+
+include "readonlytree.pxi" # Read-only implementation of Element proxies
+include "classlookup.pxi" # Element class lookup mechanisms
+include "nsclasses.pxi" # Namespace implementation and registry
+include "docloader.pxi" # Support for custom document loaders
+include "parser.pxi" # XML and HTML parsers
+include "saxparser.pxi" # SAX-like Parser interface and tree builder
+include "parsertarget.pxi" # ET Parser target
+include "serializer.pxi" # XML output functions
+include "iterparse.pxi" # incremental XML parsing
+include "xmlid.pxi" # XMLID and IDDict
+include "xinclude.pxi" # XInclude
+include "cleanup.pxi" # Cleanup and recursive element removal functions
+
+
+################################################################################
+# Include submodules for XPath and XSLT
+
+include "extensions.pxi" # XPath/XSLT extension functions
+include "xpath.pxi" # XPath evaluation
+include "xslt.pxi" # XSL transformations
+include "xsltext.pxi" # XSL extension elements
+
+
+################################################################################
+# Validation
+
+cdef class DocumentInvalid(LxmlError):
+ """Validation error.
+
+ Raised by all document validators when their ``assertValid(tree)``
+ method fails.
+ """
+
+
+cdef class _Validator:
+ u"Base class for XML validators."
+ cdef _ErrorLog _error_log
+ def __cinit__(self):
+ self._error_log = _ErrorLog()
+
+ def validate(self, etree):
+ u"""validate(self, etree)
+
+ Validate the document using this schema.
+
+ Returns true if the document is valid, false if not.
+ """
+ return self(etree)
+
+ def assertValid(self, etree):
+ u"""assertValid(self, etree)
+
+ Raises `DocumentInvalid` if the document does not comply with the schema.
+ """
+ if not self(etree):
+ raise DocumentInvalid(self._error_log._buildExceptionMessage(
+ u"Document does not comply with schema"),
+ self._error_log)
+
+ def assert_(self, etree):
+ u"""assert_(self, etree)
+
+ Raises `AssertionError` if the document does not comply with the schema.
+ """
+ if not self(etree):
+ raise AssertionError, self._error_log._buildExceptionMessage(
+ u"Document does not comply with schema")
+
+ cpdef _append_log_message(self, int domain, int type, int level, int line,
+ message, filename):
+ self._error_log._receiveGeneric(domain, type, level, line, message,
+ filename)
+
+ cpdef _clear_error_log(self):
+ self._error_log.clear()
+
+ @property
+ def error_log(self):
+ """The log of validation errors and warnings."""
+ assert self._error_log is not None, "Validator not initialised"
+ return self._error_log.copy()
+
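The validators included below (DTD, RelaxNG, XMLSchema, Schematron) all share this interface; a small sketch using ``XMLSchema`` with invented content::

    from lxml import etree

    schema = etree.XMLSchema(etree.XML('''
        <xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
          <xs:element name="root" type="xs:string"/>
        </xs:schema>'''))

    good = etree.XML('<root>text</root>')
    bad = etree.XML('<other/>')

    schema.validate(good)      # True
    schema.validate(bad)       # False; details in schema.error_log
    schema.assertValid(good)   # passes silently
    # schema.assertValid(bad)  # would raise etree.DocumentInvalid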
+include "dtd.pxi" # DTD
+include "relaxng.pxi" # RelaxNG
+include "xmlschema.pxi" # XMLSchema
+include "schematron.pxi" # Schematron (requires libxml2 2.6.21+)
+
+################################################################################
+# Public C API
+
+include "public-api.pxi"
+
+################################################################################
+# Other stuff
+
+include "debug.pxi"
diff --git a/src/lxml/extensions.pxi b/src/lxml/extensions.pxi
new file mode 100644
index 0000000..35a321b
--- /dev/null
+++ b/src/lxml/extensions.pxi
@@ -0,0 +1,871 @@
+# support for extension functions in XPath and XSLT
+
+cdef class XPathError(LxmlError):
+ """Base class of all XPath errors.
+ """
+
+cdef class XPathEvalError(XPathError):
+ """Error during XPath evaluation.
+ """
+
+cdef class XPathFunctionError(XPathEvalError):
+ """Internal error looking up an XPath extension function.
+ """
+
+cdef class XPathResultError(XPathEvalError):
+ """Error handling an XPath result.
+ """
+
+
+# forward declarations
+
+ctypedef int (*_register_function)(void* ctxt, name_utf, ns_uri_utf)
+cdef class _ExsltRegExp
+
+################################################################################
+# Base class for XSLT and XPath evaluation contexts: functions, namespaces, ...
+
+@cython.internal
+cdef class _BaseContext:
+ cdef xpath.xmlXPathContext* _xpathCtxt
+ cdef _Document _doc
+ cdef dict _extensions
+ cdef list _namespaces
+ cdef list _global_namespaces
+ cdef dict _utf_refs
+ cdef dict _function_cache
+ cdef dict _eval_context_dict
+ cdef bint _build_smart_strings
+ # for exception handling and temporary reference keeping:
+ cdef _TempStore _temp_refs
+ cdef set _temp_documents
+ cdef _ExceptionContext _exc
+ cdef _ErrorLog _error_log
+
+ def __cinit__(self):
+ self._xpathCtxt = NULL
+
+ def __init__(self, namespaces, extensions, error_log, enable_regexp,
+ build_smart_strings):
+ cdef _ExsltRegExp _regexp
+ cdef dict new_extensions
+ cdef list ns
+ self._utf_refs = {}
+ self._global_namespaces = []
+ self._function_cache = {}
+ self._eval_context_dict = None
+ self._error_log = error_log
+
+ if extensions is not None:
+ # convert extensions to UTF-8
+ if isinstance(extensions, dict):
+ extensions = (extensions,)
+ # format: [ {(ns, name):function} ] -> {(ns_utf, name_utf):function}
+ new_extensions = {}
+ for extension in extensions:
+ for (ns_uri, name), function in extension.items():
+ if name is None:
+ raise ValueError, u"extensions must have non empty names"
+ ns_utf = self._to_utf(ns_uri)
+ name_utf = self._to_utf(name)
+ new_extensions[(ns_utf, name_utf)] = function
+ extensions = new_extensions or None
+
+ if namespaces is not None:
+ if isinstance(namespaces, dict):
+ namespaces = namespaces.items()
+ if namespaces:
+ ns = []
+ for prefix, ns_uri in namespaces:
+ if prefix is None or not prefix:
+ raise TypeError, \
+ u"empty namespace prefix is not supported in XPath"
+ if ns_uri is None or not ns_uri:
+ raise TypeError, \
+ u"setting default namespace is not supported in XPath"
+ prefix_utf = self._to_utf(prefix)
+ ns_uri_utf = self._to_utf(ns_uri)
+ ns.append( (prefix_utf, ns_uri_utf) )
+ namespaces = ns
+ else:
+ namespaces = None
+
+ self._doc = None
+ self._exc = _ExceptionContext()
+ self._extensions = extensions
+ self._namespaces = namespaces
+ self._temp_refs = _TempStore()
+ self._temp_documents = set()
+ self._build_smart_strings = build_smart_strings
+
+ if enable_regexp:
+ _regexp = _ExsltRegExp()
+ _regexp._register_in_context(self)
+
+ cdef _BaseContext _copy(self):
+ cdef _BaseContext context
+ if self._namespaces is not None:
+ namespaces = self._namespaces[:]
+ else:
+ namespaces = None
+ context = self.__class__(namespaces, None, self._error_log, False,
+ self._build_smart_strings)
+ if self._extensions is not None:
+ context._extensions = self._extensions.copy()
+ return context
+
+ cdef bytes _to_utf(self, s):
+ u"Convert to UTF-8 and keep a reference to the encoded string"
+ cdef python.PyObject* dict_result
+ if s is None:
+ return None
+ dict_result = python.PyDict_GetItem(self._utf_refs, s)
+ if dict_result is not NULL:
+ return <bytes>dict_result
+ utf = _utf8(s)
+ self._utf_refs[s] = utf
+ if python.IS_PYPY:
+ # use C level refs, PyPy refs are not enough!
+ python.Py_INCREF(utf)
+ return utf
+
+ cdef void _set_xpath_context(self, xpath.xmlXPathContext* xpathCtxt):
+ self._xpathCtxt = xpathCtxt
+ xpathCtxt.userData = <void*>self
+ xpathCtxt.error = _receiveXPathError
+
+ @cython.final
+ cdef _register_context(self, _Document doc):
+ self._doc = doc
+ self._exc.clear()
+
+ @cython.final
+ cdef _cleanup_context(self):
+ #xpath.xmlXPathRegisteredNsCleanup(self._xpathCtxt)
+ #self.unregisterGlobalNamespaces()
+ if python.IS_PYPY:
+ # clean up double refs in PyPy (see "_to_utf()" method)
+ for ref in self._utf_refs.itervalues():
+ python.Py_DECREF(ref)
+ self._utf_refs.clear()
+ self._eval_context_dict = None
+ self._doc = None
+
+ @cython.final
+ cdef _release_context(self):
+ if self._xpathCtxt is not NULL:
+ self._xpathCtxt.userData = NULL
+ self._xpathCtxt = NULL
+
+ # namespaces (internal UTF-8 methods with leading '_')
+
+ cdef addNamespace(self, prefix, ns_uri):
+ cdef list namespaces
+ if prefix is None:
+ raise TypeError, u"empty prefix is not supported in XPath"
+ prefix_utf = self._to_utf(prefix)
+ ns_uri_utf = self._to_utf(ns_uri)
+ new_item = (prefix_utf, ns_uri_utf)
+ if self._namespaces is None:
+ self._namespaces = [new_item]
+ else:
+ namespaces = []
+ for item in self._namespaces:
+ if item[0] == prefix_utf:
+ item = new_item
+ new_item = None
+ namespaces.append(item)
+ if new_item is not None:
+ namespaces.append(new_item)
+ self._namespaces = namespaces
+ if self._xpathCtxt is not NULL:
+ xpath.xmlXPathRegisterNs(
+ self._xpathCtxt, _xcstr(prefix_utf), _xcstr(ns_uri_utf))
+
+ cdef registerNamespace(self, prefix, ns_uri):
+ if prefix is None:
+ raise TypeError, u"empty prefix is not supported in XPath"
+ prefix_utf = self._to_utf(prefix)
+ ns_uri_utf = self._to_utf(ns_uri)
+ self._global_namespaces.append(prefix_utf)
+ xpath.xmlXPathRegisterNs(self._xpathCtxt,
+ _xcstr(prefix_utf), _xcstr(ns_uri_utf))
+
+ cdef registerLocalNamespaces(self):
+ if self._namespaces is None:
+ return
+ for prefix_utf, ns_uri_utf in self._namespaces:
+ xpath.xmlXPathRegisterNs(
+ self._xpathCtxt, _xcstr(prefix_utf), _xcstr(ns_uri_utf))
+
+ cdef registerGlobalNamespaces(self):
+ cdef list ns_prefixes = _find_all_extension_prefixes()
+ if python.PyList_GET_SIZE(ns_prefixes) > 0:
+ for prefix_utf, ns_uri_utf in ns_prefixes:
+ self._global_namespaces.append(prefix_utf)
+ xpath.xmlXPathRegisterNs(
+ self._xpathCtxt, _xcstr(prefix_utf), _xcstr(ns_uri_utf))
+
+ cdef unregisterGlobalNamespaces(self):
+ if python.PyList_GET_SIZE(self._global_namespaces) > 0:
+ for prefix_utf in self._global_namespaces:
+ xpath.xmlXPathRegisterNs(self._xpathCtxt,
+ _xcstr(prefix_utf), NULL)
+ del self._global_namespaces[:]
+
+ cdef void _unregisterNamespace(self, prefix_utf):
+ xpath.xmlXPathRegisterNs(self._xpathCtxt,
+ _xcstr(prefix_utf), NULL)
+
+ # extension functions
+
+ cdef int _addLocalExtensionFunction(self, ns_utf, name_utf, function) except -1:
+ if self._extensions is None:
+ self._extensions = {}
+ self._extensions[(ns_utf, name_utf)] = function
+ return 0
+
+ cdef registerGlobalFunctions(self, void* ctxt,
+ _register_function reg_func):
+ cdef python.PyObject* dict_result
+ cdef dict d
+ for ns_utf, ns_functions in __FUNCTION_NAMESPACE_REGISTRIES.iteritems():
+ dict_result = python.PyDict_GetItem(
+ self._function_cache, ns_utf)
+ if dict_result is not NULL:
+ d = <dict>dict_result
+ else:
+ d = {}
+ self._function_cache[ns_utf] = d
+ for name_utf, function in ns_functions.iteritems():
+ d[name_utf] = function
+ reg_func(ctxt, name_utf, ns_utf)
+
+ cdef registerLocalFunctions(self, void* ctxt,
+ _register_function reg_func):
+ cdef python.PyObject* dict_result
+ cdef dict d
+ if self._extensions is None:
+ return # done
+ last_ns = None
+ d = None
+ for (ns_utf, name_utf), function in self._extensions.iteritems():
+ if ns_utf is not last_ns or d is None:
+ last_ns = ns_utf
+ dict_result = python.PyDict_GetItem(
+ self._function_cache, ns_utf)
+ if dict_result is not NULL:
+ d = <dict>dict_result
+ else:
+ d = {}
+ self._function_cache[ns_utf] = d
+ d[name_utf] = function
+ reg_func(ctxt, name_utf, ns_utf)
+
+ cdef unregisterAllFunctions(self, void* ctxt,
+ _register_function unreg_func):
+ for ns_utf, functions in self._function_cache.iteritems():
+ for name_utf in functions:
+ unreg_func(ctxt, name_utf, ns_utf)
+
+ cdef unregisterGlobalFunctions(self, void* ctxt,
+ _register_function unreg_func):
+ for ns_utf, functions in self._function_cache.items():
+ for name_utf in functions:
+ if self._extensions is None or \
+ (ns_utf, name_utf) not in self._extensions:
+ unreg_func(ctxt, name_utf, ns_utf)
+
+ @cython.final
+ cdef _find_cached_function(self, const_xmlChar* c_ns_uri, const_xmlChar* c_name):
+ u"""Lookup an extension function in the cache and return it.
+
+ Parameters: c_ns_uri may be NULL, c_name must not be NULL
+ """
+ cdef python.PyObject* c_dict
+ cdef python.PyObject* dict_result
+ c_dict = python.PyDict_GetItem(
+ self._function_cache, None if c_ns_uri is NULL else c_ns_uri)
+ if c_dict is not NULL:
+ dict_result = python.PyDict_GetItem(
+ <object>c_dict, <unsigned char*>c_name)
+ if dict_result is not NULL:
+ return <object>dict_result
+ return None
+
+ # Python access to the XPath context for extension functions
+
+ @property
+ def context_node(self):
+ cdef xmlNode* c_node
+ if self._xpathCtxt is NULL:
+ raise XPathError, \
+ u"XPath context is only usable during the evaluation"
+ c_node = self._xpathCtxt.node
+ if c_node is NULL:
+ raise XPathError, u"no context node"
+ if c_node.doc != self._xpathCtxt.doc:
+ raise XPathError, \
+ u"document-external context nodes are not supported"
+ if self._doc is None:
+ raise XPathError, u"document context is missing"
+ return _elementFactory(self._doc, c_node)
+
+ @property
+ def eval_context(self):
+ if self._eval_context_dict is None:
+ self._eval_context_dict = {}
+ return self._eval_context_dict
+
+ # Python reference keeping during XPath function evaluation
+
+ @cython.final
+ cdef _release_temp_refs(self):
+ u"Free temporarily referenced objects from this context."
+ self._temp_refs.clear()
+ self._temp_documents.clear()
+
+ @cython.final
+ cdef _hold(self, obj):
+ u"""A way to temporarily hold references to nodes in the evaluator.
+
+ This is needed because otherwise nodes created in XPath extension
+ functions would be reference counted too soon, during the XPath
+ evaluation. This is most important in the case of exceptions.
+ """
+ cdef _Element element
+ if isinstance(obj, _Element):
+ self._temp_refs.add(obj)
+ self._temp_documents.add((<_Element>obj)._doc)
+ return
+ elif _isString(obj) or not python.PySequence_Check(obj):
+ return
+ for o in obj:
+ if isinstance(o, _Element):
+ #print "Holding element:", <int>element._c_node
+ self._temp_refs.add(o)
+ #print "Holding document:", <int>element._doc._c_doc
+ self._temp_documents.add((<_Element>o)._doc)
+
+ @cython.final
+ cdef _Document _findDocumentForNode(self, xmlNode* c_node):
+ u"""If an XPath expression returns an element from a different
+ document than the current context document, we call this to
+ see if it was possibly created by an extension and is a known
+ document instance.
+ """
+ cdef _Document doc
+ for doc in self._temp_documents:
+ if doc is not None and doc._c_doc is c_node.doc:
+ return doc
+ return None
+
+
+# libxml2 keeps these error messages in a static array in its code
+# and doesn't give us access to them ...
+
+cdef tuple LIBXML2_XPATH_ERROR_MESSAGES = (
+ b"Ok",
+ b"Number encoding",
+ b"Unfinished literal",
+ b"Start of literal",
+ b"Expected $ for variable reference",
+ b"Undefined variable",
+ b"Invalid predicate",
+ b"Invalid expression",
+ b"Missing closing curly brace",
+ b"Unregistered function",
+ b"Invalid operand",
+ b"Invalid type",
+ b"Invalid number of arguments",
+ b"Invalid context size",
+ b"Invalid context position",
+ b"Memory allocation error",
+ b"Syntax error",
+ b"Resource error",
+ b"Sub resource error",
+ b"Undefined namespace prefix",
+ b"Encoding error",
+ b"Char out of XML range",
+ b"Invalid or incomplete context",
+ b"Stack usage error",
+ b"Forbidden variable\n",
+ b"?? Unknown error ??\n",
+)
+
+cdef void _forwardXPathError(void* c_ctxt, xmlerror.xmlError* c_error) with gil:
+ cdef xmlerror.xmlError error
+ cdef int xpath_code
+ if c_error.message is not NULL:
+ error.message = c_error.message
+ else:
+ xpath_code = c_error.code - xmlerror.XML_XPATH_EXPRESSION_OK
+ if 0 <= xpath_code < len(LIBXML2_XPATH_ERROR_MESSAGES):
+ error.message = _cstr(LIBXML2_XPATH_ERROR_MESSAGES[xpath_code])
+ else:
+ error.message = b"unknown error"
+ error.domain = c_error.domain
+ error.code = c_error.code
+ error.level = c_error.level
+ error.line = c_error.line
+ error.int2 = c_error.int1 # column
+ error.file = c_error.file
+ error.node = NULL
+
+ (<_BaseContext>c_ctxt)._error_log._receive(&error)
+
+cdef void _receiveXPathError(void* c_context, xmlerror.xmlError* error) nogil:
+ if not __DEBUG:
+ return
+ if c_context is NULL:
+ _forwardError(NULL, error)
+ else:
+ _forwardXPathError(c_context, error)
+
+
+def Extension(module, function_mapping=None, *, ns=None):
+ u"""Extension(module, function_mapping=None, ns=None)
+
+ Build a dictionary of extension functions from the functions
+ defined in a module or the methods of an object.
+
+ As the second argument, you can pass an additional mapping of
+ attribute names to XPath function names, or a list of function
+ names that should be taken.
+
+ The ``ns`` keyword argument accepts a namespace URI for the XPath
+ functions.
+ """
+ cdef dict functions = {}
+ if isinstance(function_mapping, dict):
+ for function_name, xpath_name in function_mapping.items():
+ functions[(ns, xpath_name)] = getattr(module, function_name)
+ else:
+ if function_mapping is None:
+ function_mapping = [ name for name in dir(module)
+ if not name.startswith(u'_') ]
+ for function_name in function_mapping:
+ functions[(ns, function_name)] = getattr(module, function_name)
+ return functions
+
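A sketch of feeding the resulting mapping into an XPath evaluation; the helper class, function and namespace URI are invented for illustration::

    from lxml import etree

    class MyFunctions:
        @staticmethod
        def hello(context, s):
            return 'Hello %s' % s

    ns = 'http://example.com/myfunctions'
    extensions = etree.Extension(MyFunctions, ['hello'], ns=ns)

    root = etree.XML('<root><name>World</name></root>')
    root.xpath('my:hello(string(name))',
               namespaces={'my': ns}, extensions=extensions)
    # -> 'Hello World'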
+################################################################################
+# EXSLT regexp implementation
+
+@cython.final
+@cython.internal
+cdef class _ExsltRegExp:
+ cdef dict _compile_map
+ def __cinit__(self):
+ self._compile_map = {}
+
+ cdef _make_string(self, value):
+ if _isString(value):
+ return value
+ elif isinstance(value, list):
+ # node set: take recursive text concatenation of first element
+ if python.PyList_GET_SIZE(value) == 0:
+ return u''
+ firstnode = value[0]
+ if _isString(firstnode):
+ return firstnode
+ elif isinstance(firstnode, _Element):
+ c_text = tree.xmlNodeGetContent((<_Element>firstnode)._c_node)
+ if c_text is NULL:
+ raise MemoryError()
+ try:
+ return funicode(c_text)
+ finally:
+ tree.xmlFree(c_text)
+ else:
+ return unicode(firstnode)
+ else:
+ return unicode(value)
+
+ cdef _compile(self, rexp, ignore_case):
+ cdef python.PyObject* c_result
+ rexp = self._make_string(rexp)
+ key = (rexp, ignore_case)
+ c_result = python.PyDict_GetItem(self._compile_map, key)
+ if c_result is not NULL:
+ return <object>c_result
+ py_flags = re.UNICODE
+ if ignore_case:
+ py_flags = py_flags | re.IGNORECASE
+ rexp_compiled = re.compile(rexp, py_flags)
+ self._compile_map[key] = rexp_compiled
+ return rexp_compiled
+
+ def test(self, ctxt, s, rexp, flags=u''):
+ flags = self._make_string(flags)
+ s = self._make_string(s)
+ rexpc = self._compile(rexp, u'i' in flags)
+ if rexpc.search(s) is None:
+ return False
+ else:
+ return True
+
+ def match(self, ctxt, s, rexp, flags=u''):
+ cdef list result_list
+ flags = self._make_string(flags)
+ s = self._make_string(s)
+ rexpc = self._compile(rexp, u'i' in flags)
+ if u'g' in flags:
+ results = rexpc.findall(s)
+ if not results:
+ return ()
+ else:
+ result = rexpc.search(s)
+ if not result:
+ return ()
+ results = [ result.group() ]
+ results.extend( result.groups(u'') )
+ result_list = []
+ root = Element(u'matches')
+ join_groups = u''.join
+ for s_match in results:
+ if python.PyTuple_CheckExact(s_match):
+ s_match = join_groups(s_match)
+ elem = SubElement(root, u'match')
+ elem.text = s_match
+ result_list.append(elem)
+ return result_list
+
+ def replace(self, ctxt, s, rexp, flags, replacement):
+ replacement = self._make_string(replacement)
+ flags = self._make_string(flags)
+ s = self._make_string(s)
+ rexpc = self._compile(rexp, u'i' in flags)
+ if u'g' in flags:
+ count = 0
+ else:
+ count = 1
+ return rexpc.sub(replacement, s, count)
+
+ cdef _register_in_context(self, _BaseContext context):
+ ns = b"http://exslt.org/regular-expressions"
+ context._addLocalExtensionFunction(ns, b"test", self.test)
+ context._addLocalExtensionFunction(ns, b"match", self.match)
+ context._addLocalExtensionFunction(ns, b"replace", self.replace)
+
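This class is what backs the EXSLT ``regexp`` functions that XPath evaluators expose when regular expression support is enabled (the default); a small sketch with invented content::

    from lxml import etree

    root = etree.XML('<root><a>item-42</a></root>')
    ns = {'re': 'http://exslt.org/regular-expressions'}

    root.xpath('re:test(a, "[0-9]+")', namespaces=ns)                # True
    root.xpath('re:replace(a, "[0-9]+", "g", "N")', namespaces=ns)   # 'item-N'
    root.xpath('re:match(a, "([a-z]+)-([0-9]+)")', namespaces=ns)    # list of <match> elements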
+
+################################################################################
+# helper functions
+
+cdef xpath.xmlXPathObject* _wrapXPathObject(object obj, _Document doc,
+ _BaseContext context) except NULL:
+ cdef xpath.xmlNodeSet* resultSet
+ cdef _Element fake_node = None
+ cdef xmlNode* c_node
+
+ if isinstance(obj, unicode):
+ obj = _utf8(obj)
+ if isinstance(obj, bytes):
+ # libxml2 copies the string value
+ return xpath.xmlXPathNewCString(_cstr(obj))
+ if isinstance(obj, bool):
+ return xpath.xmlXPathNewBoolean(obj)
+ if python.PyNumber_Check(obj):
+ return xpath.xmlXPathNewFloat(obj)
+ if obj is None:
+ resultSet = xpath.xmlXPathNodeSetCreate(NULL)
+ elif isinstance(obj, _Element):
+ resultSet = xpath.xmlXPathNodeSetCreate((<_Element>obj)._c_node)
+ elif python.PySequence_Check(obj):
+ resultSet = xpath.xmlXPathNodeSetCreate(NULL)
+ try:
+ for value in obj:
+ if isinstance(value, _Element):
+ if context is not None:
+ context._hold(value)
+ xpath.xmlXPathNodeSetAdd(resultSet, (<_Element>value)._c_node)
+ else:
+ if context is None or doc is None:
+ raise XPathResultError, \
+ f"Non-Element values not supported at this point - got {value!r}"
+ # support strings by appending text nodes to an Element
+ if isinstance(value, unicode):
+ value = _utf8(value)
+ if isinstance(value, bytes):
+ if fake_node is None:
+ fake_node = _makeElement("text-root", NULL, doc, None,
+ None, None, None, None, None)
+ context._hold(fake_node)
+ else:
+ # append a comment node to keep the text nodes separate
+ c_node = tree.xmlNewDocComment(doc._c_doc, <unsigned char*>"")
+ if c_node is NULL:
+ raise MemoryError()
+ tree.xmlAddChild(fake_node._c_node, c_node)
+ context._hold(value)
+ c_node = tree.xmlNewDocText(doc._c_doc, _xcstr(value))
+ if c_node is NULL:
+ raise MemoryError()
+ tree.xmlAddChild(fake_node._c_node, c_node)
+ xpath.xmlXPathNodeSetAdd(resultSet, c_node)
+ else:
+ raise XPathResultError, \
+ f"This is not a supported node-set result: {value!r}"
+ except:
+ xpath.xmlXPathFreeNodeSet(resultSet)
+ raise
+ else:
+ raise XPathResultError, f"Unknown return type: {python._fqtypename(obj).decode('utf8')}"
+ return xpath.xmlXPathWrapNodeSet(resultSet)
+
+cdef object _unwrapXPathObject(xpath.xmlXPathObject* xpathObj,
+ _Document doc, _BaseContext context):
+ if xpathObj.type == xpath.XPATH_UNDEFINED:
+ raise XPathResultError, u"Undefined xpath result"
+ elif xpathObj.type == xpath.XPATH_NODESET:
+ return _createNodeSetResult(xpathObj, doc, context)
+ elif xpathObj.type == xpath.XPATH_BOOLEAN:
+ return xpathObj.boolval
+ elif xpathObj.type == xpath.XPATH_NUMBER:
+ return xpathObj.floatval
+ elif xpathObj.type == xpath.XPATH_STRING:
+ stringval = funicode(xpathObj.stringval)
+ if context._build_smart_strings:
+ stringval = _elementStringResultFactory(
+ stringval, None, None, 0)
+ return stringval
+ elif xpathObj.type == xpath.XPATH_POINT:
+ raise NotImplementedError, u"XPATH_POINT"
+ elif xpathObj.type == xpath.XPATH_RANGE:
+ raise NotImplementedError, u"XPATH_RANGE"
+ elif xpathObj.type == xpath.XPATH_LOCATIONSET:
+ raise NotImplementedError, u"XPATH_LOCATIONSET"
+ elif xpathObj.type == xpath.XPATH_USERS:
+ raise NotImplementedError, u"XPATH_USERS"
+ elif xpathObj.type == xpath.XPATH_XSLT_TREE:
+ return _createNodeSetResult(xpathObj, doc, context)
+ else:
+ raise XPathResultError, f"Unknown xpath result {xpathObj.type}"
+
+cdef object _createNodeSetResult(xpath.xmlXPathObject* xpathObj, _Document doc,
+ _BaseContext context):
+ cdef xmlNode* c_node
+ cdef int i
+ cdef list result
+ result = []
+ if xpathObj.nodesetval is NULL:
+ return result
+ for i in range(xpathObj.nodesetval.nodeNr):
+ c_node = xpathObj.nodesetval.nodeTab[i]
+ _unpackNodeSetEntry(result, c_node, doc, context,
+ xpathObj.type == xpath.XPATH_XSLT_TREE)
+ return result
+
+cdef _unpackNodeSetEntry(list results, xmlNode* c_node, _Document doc,
+ _BaseContext context, bint is_fragment):
+ cdef xmlNode* c_child
+ if _isElement(c_node):
+ if c_node.doc != doc._c_doc and c_node.doc._private is NULL:
+ # XXX: works, but maybe not always the right thing to do?
+ # XPath: only runs when extensions create or copy trees
+ # -> we store Python refs to these, so that is OK
+ # XSLT: can it leak when merging trees from multiple sources?
+ c_node = tree.xmlDocCopyNode(c_node, doc._c_doc, 1)
+ # FIXME: call _instantiateElementFromXPath() instead?
+ results.append(
+ _fakeDocElementFactory(doc, c_node))
+ elif c_node.type == tree.XML_TEXT_NODE or \
+ c_node.type == tree.XML_CDATA_SECTION_NODE or \
+ c_node.type == tree.XML_ATTRIBUTE_NODE:
+ results.append(
+ _buildElementStringResult(doc, c_node, context))
+ elif c_node.type == tree.XML_NAMESPACE_DECL:
+ results.append( (funicodeOrNone((<xmlNs*>c_node).prefix),
+ funicodeOrNone((<xmlNs*>c_node).href)) )
+ elif c_node.type == tree.XML_DOCUMENT_NODE or \
+ c_node.type == tree.XML_HTML_DOCUMENT_NODE:
+ # ignored for everything but result tree fragments
+ if is_fragment:
+ c_child = c_node.children
+ while c_child is not NULL:
+ _unpackNodeSetEntry(results, c_child, doc, context, 0)
+ c_child = c_child.next
+ elif c_node.type == tree.XML_XINCLUDE_START or \
+ c_node.type == tree.XML_XINCLUDE_END:
+ pass
+ else:
+ raise NotImplementedError, \
+ f"Not yet implemented result node type: {c_node.type}"
+
+cdef void _freeXPathObject(xpath.xmlXPathObject* xpathObj):
+ u"""Free the XPath object, but *never* free the *content* of node sets.
+ Python dealloc will do that for us.
+ """
+ if xpathObj.nodesetval is not NULL:
+ xpath.xmlXPathFreeNodeSet(xpathObj.nodesetval)
+ xpathObj.nodesetval = NULL
+ xpath.xmlXPathFreeObject(xpathObj)
+
+cdef _Element _instantiateElementFromXPath(xmlNode* c_node, _Document doc,
+ _BaseContext context):
+ # NOTE: this may copy the element - only call this when it can't leak
+ if c_node.doc != doc._c_doc and c_node.doc._private is NULL:
+ # not from the context document and not from a fake document
+ # either => may still be from a known document, e.g. one
+ # created by an extension function
+ node_doc = context._findDocumentForNode(c_node)
+ if node_doc is None:
+ # not from a known document at all! => can only make a
+ # safety copy here
+ c_node = tree.xmlDocCopyNode(c_node, doc._c_doc, 1)
+ else:
+ doc = node_doc
+ return _fakeDocElementFactory(doc, c_node)
+
+################################################################################
+# special str/unicode subclasses
+
+@cython.final
+cdef class _ElementUnicodeResult(unicode):
+ cdef _Element _parent
+ cdef readonly object attrname
+ cdef readonly bint is_tail
+ cdef readonly bint is_text
+ cdef readonly bint is_attribute
+
+ def getparent(self):
+ return self._parent
+
+cdef object _PyElementUnicodeResult
+if python.IS_PYPY:
+ class _PyElementUnicodeResult(unicode):
+ # we need to use a Python class here, or PyPy will crash on creation
+ # https://bitbucket.org/pypy/pypy/issues/2021/pypy3-pytype_ready-crashes-for-extension
+ def getparent(self):
+ return self._parent
+
+class _ElementStringResult(bytes):
+ # we need to use a Python class here, bytes cannot be C-subclassed
+ # in Pyrex/Cython
+ def getparent(self):
+ return self._parent
+
+cdef object _elementStringResultFactory(string_value, _Element parent,
+ attrname, bint is_tail):
+ cdef _ElementUnicodeResult uresult
+ cdef bint is_text
+ cdef bint is_attribute = attrname is not None
+ if parent is None:
+ is_text = 0
+ else:
+ is_text = not (is_tail or is_attribute)
+
+ if type(string_value) is bytes:
+ result = _ElementStringResult(string_value)
+ result._parent = parent
+ result.is_attribute = is_attribute
+ result.is_tail = is_tail
+ result.is_text = is_text
+ result.attrname = attrname
+ return result
+ elif python.IS_PYPY:
+ result = _PyElementUnicodeResult(string_value)
+ result._parent = parent
+ result.is_attribute = is_attribute
+ result.is_tail = is_tail
+ result.is_text = is_text
+ result.attrname = attrname
+ return result
+ else:
+ uresult = _ElementUnicodeResult(string_value)
+ uresult._parent = parent
+ uresult.is_attribute = is_attribute
+ uresult.is_tail = is_tail
+ uresult.is_text = is_text
+ uresult.attrname = attrname
+ return uresult
+
+cdef object _buildElementStringResult(_Document doc, xmlNode* c_node,
+ _BaseContext context):
+ cdef _Element parent = None
+ cdef object attrname = None
+ cdef xmlNode* c_element
+ cdef bint is_tail
+
+ if c_node.type == tree.XML_ATTRIBUTE_NODE:
+ attrname = _namespacedName(c_node)
+ is_tail = 0
+ s = tree.xmlNodeGetContent(c_node)
+ try:
+ value = funicode(s)
+ finally:
+ tree.xmlFree(s)
+ c_element = NULL
+ else:
+ #assert c_node.type == tree.XML_TEXT_NODE or c_node.type == tree.XML_CDATA_SECTION_NODE, "invalid node type"
+ # may be tail text or normal text
+ value = funicode(c_node.content)
+ c_element = _previousElement(c_node)
+ is_tail = c_element is not NULL
+
+ if not context._build_smart_strings:
+ return value
+
+ if c_element is NULL:
+ # non-tail text or attribute text
+ c_element = c_node.parent
+ while c_element is not NULL and not _isElement(c_element):
+ c_element = c_element.parent
+
+ if c_element is not NULL:
+ parent = _instantiateElementFromXPath(c_element, doc, context)
+
+ return _elementStringResultFactory(
+ value, parent, attrname, is_tail)
+
+################################################################################
+# callbacks for XPath/XSLT extension functions
+
+cdef void _extension_function_call(_BaseContext context, function,
+ xpath.xmlXPathParserContext* ctxt, int nargs):
+ cdef _Document doc
+ cdef xpath.xmlXPathObject* obj
+ cdef list args
+ cdef int i
+ doc = context._doc
+ try:
+ args = []
+ for i in range(nargs):
+ obj = xpath.valuePop(ctxt)
+ o = _unwrapXPathObject(obj, doc, context)
+ _freeXPathObject(obj)
+ args.append(o)
+ args.reverse()
+
+ res = function(context, *args)
+ # wrap result for XPath consumption
+ obj = _wrapXPathObject(res, doc, context)
+ # prevent Python from deallocating elements handed to libxml2
+ context._hold(res)
+ xpath.valuePush(ctxt, obj)
+ except:
+ xpath.xmlXPathErr(ctxt, xpath.XPATH_EXPR_ERROR)
+ context._exc._store_raised()
+ finally:
+ return # swallow any further exceptions
+
+# lookup the function by name and call it
+
+cdef void _xpath_function_call(xpath.xmlXPathParserContext* ctxt,
+ int nargs) with gil:
+ cdef _BaseContext context
+ cdef xpath.xmlXPathContext* rctxt = ctxt.context
+ context = <_BaseContext> rctxt.userData
+ try:
+ function = context._find_cached_function(rctxt.functionURI, rctxt.function)
+ if function is not None:
+ _extension_function_call(context, function, ctxt, nargs)
+ else:
+ xpath.xmlXPathErr(ctxt, xpath.XPATH_UNKNOWN_FUNC_ERROR)
+ context._exc._store_exception(XPathFunctionError(
+ f"XPath function '{_namespacedNameFromNsName(rctxt.functionURI, rctxt.function)}' not found"))
+ except:
+ # may not be the right error, but we need to tell libxml2 *something*
+ xpath.xmlXPathErr(ctxt, xpath.XPATH_UNKNOWN_FUNC_ERROR)
+ context._exc._store_raised()
+ finally:
+ return # swallow any further exceptions
diff --git a/src/lxml/html/ElementSoup.py b/src/lxml/html/ElementSoup.py
new file mode 100644
index 0000000..c35365d
--- /dev/null
+++ b/src/lxml/html/ElementSoup.py
@@ -0,0 +1,10 @@
+__doc__ = """Legacy interface to the BeautifulSoup HTML parser.
+"""
+
+__all__ = ["parse", "convert_tree"]
+
+from .soupparser import convert_tree, parse as _parse
+
+def parse(file, beautifulsoup=None, makeelement=None):
+ root = _parse(file, beautifulsoup=beautifulsoup, makeelement=makeelement)
+ return root.getroot()
diff --git a/src/lxml/html/__init__.py b/src/lxml/html/__init__.py
new file mode 100644
index 0000000..2139c75
--- /dev/null
+++ b/src/lxml/html/__init__.py
@@ -0,0 +1,1948 @@
+# Copyright (c) 2004 Ian Bicking. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in
+# the documentation and/or other materials provided with the
+# distribution.
+#
+# 3. Neither the name of Ian Bicking nor the names of its contributors may
+# be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""The ``lxml.html`` tool set for HTML handling.
+"""
+
+from __future__ import absolute_import
+
+__all__ = [
+ 'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
+ 'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
+ 'find_rel_links', 'find_class', 'make_links_absolute',
+ 'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']
+
+
+import copy
+import sys
+import re
+from functools import partial
+
+try:
+ from collections.abc import MutableMapping, MutableSet
+except ImportError:
+ from collections import MutableMapping, MutableSet
+
+from .. import etree
+from . import defs
+from ._setmixin import SetMixin
+
+try:
+ from urlparse import urljoin
+except ImportError:
+ # Python 3
+ from urllib.parse import urljoin
+
+try:
+ unicode
+except NameError:
+ # Python 3
+ unicode = str
+try:
+ basestring
+except NameError:
+ # Python 3
+ basestring = (str, bytes)
+
+
+def __fix_docstring(s):
+ if not s:
+ return s
+ if sys.version_info[0] >= 3:
+ sub = re.compile(r"^(\s*)u'", re.M).sub
+ else:
+ sub = re.compile(r"^(\s*)b'", re.M).sub
+ return sub(r"\1'", s)
+
+
+XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"
+
+_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
+ namespaces={'x':XHTML_NAMESPACE})
+_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
+ namespaces={'x':XHTML_NAMESPACE})
+_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
+ namespaces={'x':XHTML_NAMESPACE})
+#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
+_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
+_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
+_collect_string_content = etree.XPath("string()")
+_iter_css_urls = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
+_iter_css_imports = re.compile(r'@import "(.*?)"').finditer
+_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
+ namespaces={'x':XHTML_NAMESPACE})
+_archive_re = re.compile(r'[^ ]+')
+_parse_meta_refresh_url = re.compile(
+ r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search
+
+
+def _unquote_match(s, pos):
+ if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'":
+ return s[1:-1], pos+1
+ else:
+ return s,pos
+
+
+def _transform_result(typ, result):
+ """Convert the result back into the input type.
+ """
+ if issubclass(typ, bytes):
+ return tostring(result, encoding='utf-8')
+ elif issubclass(typ, unicode):
+ return tostring(result, encoding='unicode')
+ else:
+ return result
+
+
+def _nons(tag):
+ if isinstance(tag, basestring):
+ if tag[0] == '{' and tag[1:len(XHTML_NAMESPACE)+1] == XHTML_NAMESPACE:
+ return tag.split('}')[-1]
+ return tag
+
+
+class Classes(MutableSet):
+ """Provides access to an element's class attribute as a set-like collection.
+ Usage::
+
+ >>> el = fromstring('<p class="hidden large">Text</p>')
+ >>> classes = el.classes # or: classes = Classes(el.attrib)
+ >>> classes |= ['block', 'paragraph']
+ >>> el.get('class')
+ 'hidden large block paragraph'
+ >>> classes.toggle('hidden')
+ False
+ >>> el.get('class')
+ 'large block paragraph'
+ >>> classes -= ('some', 'classes', 'block')
+ >>> el.get('class')
+ 'large paragraph'
+ """
+ def __init__(self, attributes):
+ self._attributes = attributes
+ self._get_class_value = partial(attributes.get, 'class', '')
+
+ def add(self, value):
+ """
+ Add a class.
+
+ This has no effect if the class is already present.
+ """
+ if not value or re.search(r'\s', value):
+ raise ValueError("Invalid class name: %r" % value)
+ classes = self._get_class_value().split()
+ if value in classes:
+ return
+ classes.append(value)
+ self._attributes['class'] = ' '.join(classes)
+
+ def discard(self, value):
+ """
+ Remove a class if it is currently present.
+
+ If the class is not present, do nothing.
+ """
+ if not value or re.search(r'\s', value):
+ raise ValueError("Invalid class name: %r" % value)
+ classes = [name for name in self._get_class_value().split()
+ if name != value]
+ if classes:
+ self._attributes['class'] = ' '.join(classes)
+ elif 'class' in self._attributes:
+ del self._attributes['class']
+
+ def remove(self, value):
+ """
+ Remove a class; it must currently be present.
+
+ If the class is not present, raise a KeyError.
+ """
+ if not value or re.search(r'\s', value):
+ raise ValueError("Invalid class name: %r" % value)
+ super(Classes, self).remove(value)
+
+ def __contains__(self, name):
+ classes = self._get_class_value()
+ return name in classes and name in classes.split()
+
+ def __iter__(self):
+ return iter(self._get_class_value().split())
+
+ def __len__(self):
+ return len(self._get_class_value().split())
+
+ # non-standard methods
+
+ def update(self, values):
+ """
+ Add all names from 'values'.
+ """
+ classes = self._get_class_value().split()
+ extended = False
+ for value in values:
+ if value not in classes:
+ classes.append(value)
+ extended = True
+ if extended:
+ self._attributes['class'] = ' '.join(classes)
+
+ def toggle(self, value):
+ """
+ Add a class name if it isn't there yet, or remove it if it exists.
+
+ Returns true if the class was added (and is now enabled) and
+ false if it was removed (and is now disabled).
+ """
+ if not value or re.search(r'\s', value):
+ raise ValueError("Invalid class name: %r" % value)
+ classes = self._get_class_value().split()
+ try:
+ classes.remove(value)
+ enabled = False
+ except ValueError:
+ classes.append(value)
+ enabled = True
+ if classes:
+ self._attributes['class'] = ' '.join(classes)
+ else:
+ del self._attributes['class']
+ return enabled
+
+
+class HtmlMixin(object):
+
+ def set(self, key, value=None):
+ """set(self, key, value=None)
+
+ Sets an element attribute. If no value is provided, or if the value is None,
+ creates a 'boolean' attribute without value, e.g. "<form novalidate></form>"
+ for ``form.set('novalidate')``.
+ """
+ super(HtmlElement, self).set(key, value)
+
+ @property
+ def classes(self):
+ """
+ A set-like wrapper around the 'class' attribute.
+ """
+ return Classes(self.attrib)
+
+ @classes.setter
+ def classes(self, classes):
+ assert isinstance(classes, Classes) # only allow "el.classes |= ..." etc.
+ value = classes._get_class_value()
+ if value:
+ self.set('class', value)
+ elif self.get('class') is not None:
+ del self.attrib['class']
+
+ @property
+ def base_url(self):
+ """
+ Returns the base URL, given when the page was parsed.
+
+ Use with ``urlparse.urljoin(el.base_url, href)`` to get
+ absolute URLs.
+ """
+ return self.getroottree().docinfo.URL
+
+ @property
+ def forms(self):
+ """
+ Return a list of all the forms
+ """
+ return _forms_xpath(self)
+
+ @property
+ def body(self):
+ """
+ Return the <body> element. Can be called from a child element
+ to get the document's body.
+ """
+ return self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})[0]
+
+ @property
+ def head(self):
+ """
+ Returns the <head> element. Can be called from a child
+ element to get the document's head.
+ """
+ return self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})[0]
+
+ @property
+ def label(self):
+ """
+ Get or set any <label> element associated with this element.
+ """
+ id = self.get('id')
+ if not id:
+ return None
+ result = _label_xpath(self, id=id)
+ if not result:
+ return None
+ else:
+ return result[0]
+
+ @label.setter
+ def label(self, label):
+ id = self.get('id')
+ if not id:
+ raise TypeError(
+ "You cannot set a label for an element (%r) that has no id"
+ % self)
+ if _nons(label.tag) != 'label':
+ raise TypeError(
+ "You can only assign label to a label element (not %r)"
+ % label)
+ label.set('for', id)
+
+ @label.deleter
+ def label(self):
+ label = self.label
+ if label is not None:
+ del label.attrib['for']
+
+ def drop_tree(self):
+ """
+ Removes this element from the tree, including its children and
+ text. The tail text is joined to the previous element or
+ parent.
+ """
+ parent = self.getparent()
+ assert parent is not None
+ if self.tail:
+ previous = self.getprevious()
+ if previous is None:
+ parent.text = (parent.text or '') + self.tail
+ else:
+ previous.tail = (previous.tail or '') + self.tail
+ parent.remove(self)
+
+ def drop_tag(self):
+ """
+ Remove the tag, but not its children or text. The children and text
+ are merged into the parent.
+
+ Example::
+
+ >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
+ >>> h.find('.//b').drop_tag()
+ >>> print(tostring(h, encoding='unicode'))
+ <div>Hello World!</div>
+ """
+ parent = self.getparent()
+ assert parent is not None
+ previous = self.getprevious()
+ if self.text and isinstance(self.tag, basestring):
+ # not a Comment, etc.
+ if previous is None:
+ parent.text = (parent.text or '') + self.text
+ else:
+ previous.tail = (previous.tail or '') + self.text
+ if self.tail:
+ if len(self):
+ last = self[-1]
+ last.tail = (last.tail or '') + self.tail
+ elif previous is None:
+ parent.text = (parent.text or '') + self.tail
+ else:
+ previous.tail = (previous.tail or '') + self.tail
+ index = parent.index(self)
+ parent[index:index+1] = self[:]
+
+ def find_rel_links(self, rel):
+ """
+ Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
+ """
+ rel = rel.lower()
+ return [el for el in _rel_links_xpath(self)
+ if el.get('rel').lower() == rel]
+
+ def find_class(self, class_name):
+ """
+ Find any elements with the given class name.
+ """
+ return _class_xpath(self, class_name=class_name)
+
+ def get_element_by_id(self, id, *default):
+ """
+ Get the first element in a document with the given id. If none is
+ found, return the default argument if provided or raise KeyError
+ otherwise.
+
+ Note that there can be more than one element with the same id,
+ and this isn't uncommon in HTML documents found in the wild.
+ Browsers return only the first match, and this function does
+ the same.
+ """
+ try:
+ # FIXME: should this check for multiple matches?
+ # browsers just return the first one
+ return _id_xpath(self, id=id)[0]
+ except IndexError:
+ if default:
+ return default[0]
+ else:
+ raise KeyError(id)
+
+ def text_content(self):
+ """
+ Return the text content of the tag (and the text in any children).
+ """
+ return _collect_string_content(self)
+
+ def cssselect(self, expr, translator='html'):
+ """
+ Run the CSS expression on this element and its children,
+ returning a list of the results.
+
+ Equivalent to lxml.cssselect.CSSSelector(expr, translator='html')(self)
+ -- note that pre-compiling the expression can provide a substantial
+ speedup.
+ """
+ # Do the import here to make the dependency optional.
+ from lxml.cssselect import CSSSelector
+ return CSSSelector(expr, translator=translator)(self)
+
+ ########################################
+ ## Link functions
+ ########################################
+
+ def make_links_absolute(self, base_url=None, resolve_base_href=True,
+ handle_failures=None):
+ """
+ Make all links in the document absolute, using the given
+ ``base_url`` (the full URL where the document came from). If no
+ ``base_url`` is given, the document's ``.base_url`` is used.
+
+ If ``resolve_base_href`` is true, then any ``<base href>``
+ tags in the document are used *and* removed from the document.
+ If it is false then any such tag is ignored.
+
+ If ``handle_failures`` is None (default), a failure to process
+ a URL will abort the processing. If set to 'ignore', errors
+ are ignored. If set to 'discard', failing URLs will be removed.
+ """
+ if base_url is None:
+ base_url = self.base_url
+ if base_url is None:
+ raise TypeError(
+ "No base_url given, and the document has no base_url")
+ if resolve_base_href:
+ self.resolve_base_href()
+
+ if handle_failures == 'ignore':
+ def link_repl(href):
+ try:
+ return urljoin(base_url, href)
+ except ValueError:
+ return href
+ elif handle_failures == 'discard':
+ def link_repl(href):
+ try:
+ return urljoin(base_url, href)
+ except ValueError:
+ return None
+ elif handle_failures is None:
+ def link_repl(href):
+ return urljoin(base_url, href)
+ else:
+ raise ValueError(
+ "unexpected value for handle_failures: %r" % handle_failures)
+
+ self.rewrite_links(link_repl)
+
+ def resolve_base_href(self, handle_failures=None):
+ """
+ Find any ``<base href>`` tag in the document, and apply its
+ values to all links found in the document. Also remove the
+ tag once it has been applied.
+
+ If ``handle_failures`` is None (default), a failure to process
+ a URL will abort the processing. If set to 'ignore', errors
+ are ignored. If set to 'discard', failing URLs will be removed.
+ """
+ base_href = None
+ basetags = self.xpath('//base[@href]|//x:base[@href]',
+ namespaces={'x': XHTML_NAMESPACE})
+ for b in basetags:
+ base_href = b.get('href')
+ b.drop_tree()
+ if not base_href:
+ return
+ self.make_links_absolute(base_href, resolve_base_href=False,
+ handle_failures=handle_failures)
+
+ def iterlinks(self):
+ """
+ Yield (element, attribute, link, pos), where attribute may be None
+ (indicating the link is in the text). ``pos`` is the position
+ where the link occurs; often 0, but sometimes something else in
+ the case of links in stylesheets or style tags.
+
+ Note: <base href> is *not* taken into account in any way. The
+ link you get is exactly the link in the document.
+
+ Note: multiple links inside of a single text string or
+ attribute value are returned in reversed order. This makes it
+ possible to replace or delete them from the text string value
+ based on their reported text positions. Otherwise, a
+ modification at one text position can change the positions of
+ links reported later on.
+ """
+ link_attrs = defs.link_attrs
+ for el in self.iter(etree.Element):
+ attribs = el.attrib
+ tag = _nons(el.tag)
+ if tag == 'object':
+ codebase = None
+ ## <object> tags have attributes that are relative to
+ ## codebase
+ if 'codebase' in attribs:
+ codebase = el.get('codebase')
+ yield (el, 'codebase', codebase, 0)
+ for attrib in ('classid', 'data'):
+ if attrib in attribs:
+ value = el.get(attrib)
+ if codebase is not None:
+ value = urljoin(codebase, value)
+ yield (el, attrib, value, 0)
+ if 'archive' in attribs:
+ for match in _archive_re.finditer(el.get('archive')):
+ value = match.group(0)
+ if codebase is not None:
+ value = urljoin(codebase, value)
+ yield (el, 'archive', value, match.start())
+ else:
+ for attrib in link_attrs:
+ if attrib in attribs:
+ yield (el, attrib, attribs[attrib], 0)
+ if tag == 'meta':
+ http_equiv = attribs.get('http-equiv', '').lower()
+ if http_equiv == 'refresh':
+ content = attribs.get('content', '')
+ match = _parse_meta_refresh_url(content)
+ url = (match.group('url') if match else content).strip()
+ # unexpected content means the redirect won't work, but we might
+ # as well be permissive and return the entire string.
+ if url:
+ url, pos = _unquote_match(
+ url, match.start('url') if match else content.find(url))
+ yield (el, 'content', url, pos)
+ elif tag == 'param':
+ valuetype = el.get('valuetype') or ''
+ if valuetype.lower() == 'ref':
+ ## FIXME: while it's fine we *find* this link,
+ ## according to the spec we aren't supposed to
+ ## actually change the value, including resolving
+ ## it. It can also still be a link, even if it
+ ## doesn't have a valuetype="ref" (which seems to be the norm)
+ ## http://www.w3.org/TR/html401/struct/objects.html#adef-valuetype
+ yield (el, 'value', el.get('value'), 0)
+ elif tag == 'style' and el.text:
+ urls = [
+ # (start_pos, url)
+ _unquote_match(match.group(1), match.start(1))[::-1]
+ for match in _iter_css_urls(el.text)
+ ] + [
+ (match.start(1), match.group(1))
+ for match in _iter_css_imports(el.text)
+ ]
+ if urls:
+ # sort by start pos to bring both match sets back into order
+ # and reverse the list to report correct positions despite
+ # modifications
+ urls.sort(reverse=True)
+ for start, url in urls:
+ yield (el, None, url, start)
+ if 'style' in attribs:
+ urls = list(_iter_css_urls(attribs['style']))
+ if urls:
+ # return in reversed order to simplify in-place modifications
+ for match in urls[::-1]:
+ url, start = _unquote_match(match.group(1), match.start(1))
+ yield (el, 'style', url, start)
+
+ def rewrite_links(self, link_repl_func, resolve_base_href=True,
+ base_href=None):
+ """
+ Rewrite all the links in the document. For each link
+ ``link_repl_func(link)`` will be called, and the return value
+ will replace the old link.
+
+ Note that links may not be absolute (unless you first called
+ ``make_links_absolute()``), and may be internal (e.g.,
+ ``'#anchor'``). They can also be values like
+ ``'mailto:email'`` or ``'javascript:expr'``.
+
+ If you give ``base_href`` then all links passed to
+ ``link_repl_func()`` will take that into account.
+
+ If the ``link_repl_func`` returns None, the attribute or
+ tag text will be removed completely.
+ """
+ if base_href is not None:
+ # FIXME: this can be done in one pass with a wrapper
+ # around link_repl_func
+ self.make_links_absolute(
+ base_href, resolve_base_href=resolve_base_href)
+ elif resolve_base_href:
+ self.resolve_base_href()
+
+ for el, attrib, link, pos in self.iterlinks():
+ new_link = link_repl_func(link.strip())
+ if new_link == link:
+ continue
+ if new_link is None:
+ # Remove the attribute or element content
+ if attrib is None:
+ el.text = ''
+ else:
+ del el.attrib[attrib]
+ continue
+
+ if attrib is None:
+ new = el.text[:pos] + new_link + el.text[pos+len(link):]
+ el.text = new
+ else:
+ cur = el.get(attrib)
+ if not pos and len(cur) == len(link):
+ new = new_link # most common case
+ else:
+ new = cur[:pos] + new_link + cur[pos+len(link):]
+ el.set(attrib, new)
+
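A compact sketch of the link helpers defined above, with invented markup and URLs::

    from lxml import html

    doc = html.fromstring(
        '<html><body><a href="/about">About</a></body></html>',
        base_url='http://example.com/index.html')

    doc.make_links_absolute()
    [link for el, attr, link, pos in doc.iterlinks()]
    # ['http://example.com/about']

    doc.rewrite_links(lambda url: url.replace('example.com', 'example.org'))
    html.tostring(doc)
    # b'<html><body><a href="http://example.org/about">About</a></body></html>'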
+
+class _MethodFunc(object):
+ """
+ An object that represents a method on an element as a function;
+ the function takes either an element or an HTML string. It
+ returns whatever the function normally returns, or if the function
+ works in-place (and so returns None) it returns a serialized form
+ of the resulting document.
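+
+ For instance, the module-level wrappers defined below accept a plain
+ HTML string and return a serialised string again (illustrative)::
+
+ >>> make_links_absolute('<a href="/foo">link</a>', 'http://example.com/')
+ '<a href="http://example.com/foo">link</a>'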
+ """
+ def __init__(self, name, copy=False, source_class=HtmlMixin):
+ self.name = name
+ self.copy = copy
+ self.__doc__ = getattr(source_class, self.name).__doc__
+ def __call__(self, doc, *args, **kw):
+ result_type = type(doc)
+ if isinstance(doc, basestring):
+ if 'copy' in kw:
+ raise TypeError(
+ "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
+ doc = fromstring(doc, **kw)
+ else:
+ if 'copy' in kw:
+ make_a_copy = kw.pop('copy')
+ else:
+ make_a_copy = self.copy
+ if make_a_copy:
+ doc = copy.deepcopy(doc)
+ meth = getattr(doc, self.name)
+ result = meth(*args, **kw)
+ # FIXME: this None test is a bit sloppy
+ if result is None:
+ # Then return what we got in
+ return _transform_result(result_type, doc)
+ else:
+ return result
+
+
+find_rel_links = _MethodFunc('find_rel_links', copy=False)
+find_class = _MethodFunc('find_class', copy=False)
+make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
+resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
+iterlinks = _MethodFunc('iterlinks', copy=False)
+rewrite_links = _MethodFunc('rewrite_links', copy=True)
+
+
+class HtmlComment(etree.CommentBase, HtmlMixin):
+ pass
+
+
+class HtmlElement(etree.ElementBase, HtmlMixin):
+ # Override etree.ElementBase.cssselect() and set(), despite the MRO (FIXME: change base order?)
+ cssselect = HtmlMixin.cssselect
+ set = HtmlMixin.set
+
+
+class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
+ pass
+
+
+class HtmlEntity(etree.EntityBase, HtmlMixin):
+ pass
+
+
+class HtmlElementClassLookup(etree.CustomElementClassLookup):
+ """A lookup scheme for HTML Element classes.
+
+ To create a lookup instance with different Element classes, pass a tag
+ name mapping of Element classes in the ``classes`` keyword argument and/or
+ a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
+ The special key '*' denotes a Mixin class that should be mixed into all
+ Element classes.
+ """
+ _default_element_classes = {}
+
+ def __init__(self, classes=None, mixins=None):
+ etree.CustomElementClassLookup.__init__(self)
+ if classes is None:
+ classes = self._default_element_classes.copy()
+ if mixins:
+ mixers = {}
+ for name, value in mixins:
+ if name == '*':
+ for n in classes.keys():
+ mixers.setdefault(n, []).append(value)
+ else:
+ mixers.setdefault(name, []).append(value)
+ for name, mix_bases in mixers.items():
+ cur = classes.get(name, HtmlElement)
+ bases = tuple(mix_bases + [cur])
+ classes[name] = type(cur.__name__, bases, {})
+ self._element_classes = classes
+
+ def lookup(self, node_type, document, namespace, name):
+ if node_type == 'element':
+ return self._element_classes.get(name.lower(), HtmlElement)
+ elif node_type == 'comment':
+ return HtmlComment
+ elif node_type == 'PI':
+ return HtmlProcessingInstruction
+ elif node_type == 'entity':
+ return HtmlEntity
+ # Otherwise normal lookup
+ return None
+
+
+################################################################################
+# parsing
+################################################################################
+
+_looks_like_full_html_unicode = re.compile(
+ unicode(r'^\s*<(?:html|!doctype)'), re.I).match
+_looks_like_full_html_bytes = re.compile(
+ r'^\s*<(?:html|!doctype)'.encode('ascii'), re.I).match
+
+
+def document_fromstring(html, parser=None, ensure_head_body=False, **kw):
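+ """Parse a complete HTML document from a string and return the root
+ ``<html>`` element.
+
+ If ``ensure_head_body`` is true, missing ``<head>`` and ``<body>``
+ elements are added to the result.
+ """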
+ if parser is None:
+ parser = html_parser
+ value = etree.fromstring(html, parser, **kw)
+ if value is None:
+ raise etree.ParserError(
+ "Document is empty")
+ if ensure_head_body and value.find('head') is None:
+ value.insert(0, Element('head'))
+ if ensure_head_body and value.find('body') is None:
+ value.append(Element('body'))
+ return value
+
+
+def fragments_fromstring(html, no_leading_text=False, base_url=None,
+ parser=None, **kw):
+ """Parses several HTML elements, returning a list of elements.
+
+ The first item in the list may be a string.
+ If no_leading_text is true, then it will be an error if there is
+ leading text, and it will always be a list of only elements.
+
+ base_url will set the document's base_url attribute
+ (and the tree's docinfo.URL).
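+
+ For example (illustrative)::
+
+ >>> parts = fragments_fromstring('leading text <p>one</p><p>two</p>')
+ >>> parts[0]
+ 'leading text '
+ >>> [el.tag for el in parts[1:]]
+ ['p', 'p']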
+ """
+ if parser is None:
+ parser = html_parser
+ # FIXME: check what happens when you give html with a body, head, etc.
+ if isinstance(html, bytes):
+ if not _looks_like_full_html_bytes(html):
+ # can't use %-formatting in early Py3 versions
+ html = ('<html><body>'.encode('ascii') + html +
+ '</body></html>'.encode('ascii'))
+ else:
+ if not _looks_like_full_html_unicode(html):
+ html = '<html><body>%s</body></html>' % html
+ doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
+ assert _nons(doc.tag) == 'html'
+ bodies = [e for e in doc if _nons(e.tag) == 'body']
+ assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
+ body = bodies[0]
+ elements = []
+ if no_leading_text and body.text and body.text.strip():
+ raise etree.ParserError(
+ "There is leading text: %r" % body.text)
+ if body.text and body.text.strip():
+ elements.append(body.text)
+ elements.extend(body)
+ # FIXME: removing the reference to the parent artificial document
+ # would be nice
+ return elements
+
+
+def fragment_fromstring(html, create_parent=False, base_url=None,
+ parser=None, **kw):
+ """
+ Parses a single HTML element; it is an error if there is more than
+ one element, or if anything but whitespace precedes or follows the
+ element.
+
+ If ``create_parent`` is true (or is a tag name) then a parent node
+ will be created to encapsulate the HTML in a single element. In this
+ case, leading or trailing text is also allowed, as are multiple elements
+ as a result of the parsing.
+
+ Passing a ``base_url`` will set the document's ``base_url`` attribute
+ (and the tree's docinfo.URL).
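+
+ For example (illustrative)::
+
+ >>> fragment_fromstring('<p>Hello</p>').tag
+ 'p'
+ >>> el = fragment_fromstring('Hello, <b>world</b>!', create_parent='span')
+ >>> el.tag, el.text
+ ('span', 'Hello, ')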
+ """
+ if parser is None:
+ parser = html_parser
+
+ accept_leading_text = bool(create_parent)
+
+ elements = fragments_fromstring(
+ html, parser=parser, no_leading_text=not accept_leading_text,
+ base_url=base_url, **kw)
+
+ if create_parent:
+ if not isinstance(create_parent, basestring):
+ create_parent = 'div'
+ new_root = Element(create_parent)
+ if elements:
+ if isinstance(elements[0], basestring):
+ new_root.text = elements[0]
+ del elements[0]
+ new_root.extend(elements)
+ return new_root
+
+ if not elements:
+ raise etree.ParserError('No elements found')
+ if len(elements) > 1:
+ raise etree.ParserError(
+ "Multiple elements found (%s)"
+ % ', '.join([_element_name(e) for e in elements]))
+ el = elements[0]
+ if el.tail and el.tail.strip():
+ raise etree.ParserError(
+ "Element followed by text: %r" % el.tail)
+ el.tail = None
+ return el
+
+
+def fromstring(html, base_url=None, parser=None, **kw):
+ """
+ Parse the html, returning a single element/document.
+
+ This tries to minimally parse the chunk of text, without knowing if it
+ is a fragment or a document.
+
+ base_url will set the document's base_url attribute (and the tree's docinfo.URL)
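+
+ For example (illustrative)::
+
+ >>> fromstring('<html><body><p>Hi</p></body></html>').tag
+ 'html'
+ >>> fromstring('<p>Hi</p>').tag
+ 'p'
+ >>> fromstring('<p>one</p><p>two</p>').tag
+ 'div'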
+ """
+ if parser is None:
+ parser = html_parser
+ if isinstance(html, bytes):
+ is_full_html = _looks_like_full_html_bytes(html)
+ else:
+ is_full_html = _looks_like_full_html_unicode(html)
+ doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
+ if is_full_html:
+ return doc
+ # otherwise, let's parse it out...
+ bodies = doc.findall('body')
+ if not bodies:
+ bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
+ if bodies:
+ body = bodies[0]
+ if len(bodies) > 1:
+ # Somehow there are multiple bodies, which is bad, but just
+ # smash them into one body
+ for other_body in bodies[1:]:
+ if other_body.text:
+ if len(body):
+ body[-1].tail = (body[-1].tail or '') + other_body.text
+ else:
+ body.text = (body.text or '') + other_body.text
+ body.extend(other_body)
+ # We'll ignore tail
+ # I guess we are ignoring attributes too
+ other_body.drop_tree()
+ else:
+ body = None
+ heads = doc.findall('head')
+ if not heads:
+ heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
+ if heads:
+ # Well, we have some sort of structure, so let's keep it all
+ head = heads[0]
+ if len(heads) > 1:
+ for other_head in heads[1:]:
+ head.extend(other_head)
+ # We don't care about text or tail in a head
+ other_head.drop_tree()
+ return doc
+ if body is None:
+ return doc
+ if (len(body) == 1 and (not body.text or not body.text.strip())
+ and (not body[-1].tail or not body[-1].tail.strip())):
+ # The body has just one element, so it was probably a single
+ # element passed in
+ return body[0]
+ # Now we have a body which represents a bunch of tags which have the
+ # content that was passed in. We will create a fake container, which
+ # is the body tag, except <body> implies too much structure.
+ if _contains_block_level_tag(body):
+ body.tag = 'div'
+ else:
+ body.tag = 'span'
+ return body
+
+
+def parse(filename_or_url, parser=None, base_url=None, **kw):
+ """
+ Parse a filename, URL, or file-like object into an HTML document
+ tree. Note: this returns a tree, not an element. Use
+ ``parse(...).getroot()`` to get the document root.
+
+ You can override the base URL with the ``base_url`` keyword. This
+ is most useful when parsing from a file-like object.
+ """
+ if parser is None:
+ parser = html_parser
+ return etree.parse(filename_or_url, parser, base_url=base_url, **kw)
+
+
+def _contains_block_level_tag(el):
+ # FIXME: I could do this with XPath, but would that just be
+ # unnecessarily slow?
+ for el in el.iter(etree.Element):
+ if _nons(el.tag) in defs.block_tags:
+ return True
+ return False
+
+
+def _element_name(el):
+ if isinstance(el, etree.CommentBase):
+ return 'comment'
+ elif isinstance(el, basestring):
+ return 'string'
+ else:
+ return _nons(el.tag)
+
+
+################################################################################
+# form handling
+################################################################################
+
+class FormElement(HtmlElement):
+ """
+ Represents a <form> element.
+ """
+
+ @property
+ def inputs(self):
+ """
+ Returns an accessor for all the input elements in the form.
+
+ See `InputGetter` for more information about the object.
+ """
+ return InputGetter(self)
+
+ @property
+ def fields(self):
+ """
+ Dictionary-like object that represents all the fields in this
+ form. You can set values in this dictionary to affect the
+ form.
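+
+ For example (illustrative)::
+
+ >>> form = fromstring('<form><input name="q" value="lxml"></form>')
+ >>> form.fields['q']
+ 'lxml'
+ >>> form.fields['q'] = 'html'
+ >>> form.inputs['q'].value
+ 'html'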
+ """
+ return FieldsDict(self.inputs)
+
+ @fields.setter
+ def fields(self, value):
+ fields = self.fields
+ prev_keys = fields.keys()
+ for key, value in value.items():
+ if key in prev_keys:
+ prev_keys.remove(key)
+ fields[key] = value
+ for key in prev_keys:
+ if key is None:
+ # Case of an unnamed input; these aren't really
+ # expressed in form_values() anyway.
+ continue
+ fields[key] = None
+
+ def _name(self):
+ if self.get('name'):
+ return self.get('name')
+ elif self.get('id'):
+ return '#' + self.get('id')
+ iter_tags = self.body.iter
+ forms = list(iter_tags('form'))
+ if not forms:
+ forms = list(iter_tags('{%s}form' % XHTML_NAMESPACE))
+ return str(forms.index(self))
+
+ def form_values(self):
+ """
+ Return a list of tuples of the field values for the form.
+ This is suitable to be passed to ``urllib.urlencode()``.
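+
+ For example (illustrative)::
+
+ >>> form = fromstring(
+ ... '<form><input name="a" value="1"><input name="b" value="2"></form>')
+ >>> form.form_values()
+ [('a', '1'), ('b', '2')]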
+ """
+ results = []
+ for el in self.inputs:
+ name = el.name
+ if not name or 'disabled' in el.attrib:
+ continue
+ tag = _nons(el.tag)
+ if tag == 'textarea':
+ results.append((name, el.value))
+ elif tag == 'select':
+ value = el.value
+ if el.multiple:
+ for v in value:
+ results.append((name, v))
+ elif value is not None:
+ results.append((name, el.value))
+ else:
+ assert tag == 'input', (
+ "Unexpected tag: %r" % el)
+ if el.checkable and not el.checked:
+ continue
+ if el.type in ('submit', 'image', 'reset', 'file'):
+ continue
+ value = el.value
+ if value is not None:
+ results.append((name, el.value))
+ return results
+
+ @property
+ def action(self):
+ """
+ Get/set the form's ``action`` attribute.
+ """
+ base_url = self.base_url
+ action = self.get('action')
+ if base_url and action is not None:
+ return urljoin(base_url, action)
+ else:
+ return action
+
+ @action.setter
+ def action(self, value):
+ self.set('action', value)
+
+ @action.deleter
+ def action(self):
+ attrib = self.attrib
+ if 'action' in attrib:
+ del attrib['action']
+
+ @property
+ def method(self):
+ """
+ Get/set the form's method. Always returns a capitalized
+ string, and defaults to ``'GET'``.
+ """
+ return self.get('method', 'GET').upper()
+
+ @method.setter
+ def method(self, value):
+ self.set('method', value.upper())
+
+
+HtmlElementClassLookup._default_element_classes['form'] = FormElement
+
+
+def submit_form(form, extra_values=None, open_http=None):
+ """
+ Helper function to submit a form. Returns a file-like object, as from
+ ``urllib.urlopen()``. This object also has a ``.geturl()`` function,
+ which returns the final URL, reflecting any redirects.
+
+ You can use this like::
+
+ form = doc.forms[0]
+ form.inputs['foo'].value = 'bar' # etc
+ response = form.submit()
+ doc = parse(response)
+ doc.make_links_absolute(response.geturl())
+
+ To change the HTTP requester, pass a function as ``open_http`` keyword
+ argument that opens the URL for you. The function must have the following
+ signature::
+
+ open_http(method, URL, values)
+
+ The method is one of 'GET' or 'POST', the URL is the target URL as a
+ string, and the values are a sequence of ``(name, value)`` tuples with the
+ form data.
+ """
+ values = form.form_values()
+ if extra_values:
+ if hasattr(extra_values, 'items'):
+ extra_values = extra_values.items()
+ values.extend(extra_values)
+ if open_http is None:
+ open_http = open_http_urllib
+ if form.action:
+ url = form.action
+ else:
+ url = form.base_url
+ return open_http(form.method, url, values)
+
+
+def open_http_urllib(method, url, values):
+ if not url:
+ raise ValueError("cannot submit, no URL provided")
+ ## FIXME: should test that it's not a relative URL or something
+ try:
+ from urllib import urlencode, urlopen
+ except ImportError: # Python 3
+ from urllib.request import urlopen
+ from urllib.parse import urlencode
+ if method == 'GET':
+ if '?' in url:
+ url += '&'
+ else:
+ url += '?'
+ url += urlencode(values)
+ data = None
+ else:
+ data = urlencode(values)
+ if not isinstance(data, bytes):
+ data = data.encode('ASCII')
+ return urlopen(url, data)
+
+
+class FieldsDict(MutableMapping):
+
+ def __init__(self, inputs):
+ self.inputs = inputs
+ def __getitem__(self, item):
+ return self.inputs[item].value
+ def __setitem__(self, item, value):
+ self.inputs[item].value = value
+ def __delitem__(self, item):
+ raise KeyError(
+ "You cannot remove keys from ElementDict")
+ def keys(self):
+ return self.inputs.keys()
+ def __contains__(self, item):
+ return item in self.inputs
+ def __iter__(self):
+ return iter(self.inputs.keys())
+ def __len__(self):
+ return len(self.inputs)
+
+ def __repr__(self):
+ return '<%s for form %s>' % (
+ self.__class__.__name__,
+ self.inputs.form._name())
+
+
+class InputGetter(object):
+
+ """
+ An accessor that represents all the input fields in a form.
+
+ You can get fields by name from this, with
+ ``form.inputs['field_name']``. If there are a set of checkboxes
+ with the same name, they are returned as a list (a `CheckboxGroup`
+ which also allows value setting). Radio inputs are handled
+ similarly. Use ``.keys()`` and ``.items()`` to process all fields
+ in this way.
+
+ You can also iterate over this to get all input elements. This
+ won't return the same thing as if you get all the names, as
+ checkboxes and radio elements are returned individually.
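+
+ A small illustrative example::
+
+ >>> form = fromstring(
+ ... '<form><input type="checkbox" name="c" value="1" checked="checked">'
+ ... '<input type="checkbox" name="c" value="2"></form>')
+ >>> form.inputs.keys()
+ ['c']
+ >>> sorted(form.inputs['c'].value)
+ ['1']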
+ """
+
+ def __init__(self, form):
+ self.form = form
+
+ def __repr__(self):
+ return '<%s for form %s>' % (
+ self.__class__.__name__,
+ self.form._name())
+
+ ## FIXME: there should be more methods, and it's unclear if this is
+ ## a dictionary-like object or list-like object
+
+ def __getitem__(self, name):
+ fields = [field for field in self if field.name == name]
+ if not fields:
+ raise KeyError("No input element with the name %r" % name)
+
+ input_type = fields[0].get('type')
+ if input_type == 'radio' and len(fields) > 1:
+ group = RadioGroup(fields)
+ group.name = name
+ return group
+ elif input_type == 'checkbox' and len(fields) > 1:
+ group = CheckboxGroup(fields)
+ group.name = name
+ return group
+ else:
+ # I don't like throwing away elements like this
+ return fields[0]
+
+ def __contains__(self, name):
+ for field in self:
+ if field.name == name:
+ return True
+ return False
+
+ def keys(self):
+ """
+ Returns all unique field names, in document order.
+
+ :return: A list of all unique field names.
+ """
+ names = []
+ seen = {None}
+ for el in self:
+ name = el.name
+ if name not in seen:
+ names.append(name)
+ seen.add(name)
+ return names
+
+ def items(self):
+ """
+ Returns all fields with their names, similar to dict.items().
+
+ :return: A list of (name, field) tuples.
+ """
+ items = []
+ seen = set()
+ for el in self:
+ name = el.name
+ if name not in seen:
+ seen.add(name)
+ items.append((name, self[name]))
+ return items
+
+ def __iter__(self):
+ return self.form.iter('select', 'input', 'textarea')
+
+ def __len__(self):
+ return sum(1 for _ in self)
+
+
+class InputMixin(object):
+ """
+ Mix-in for all input elements (input, select, and textarea)
+ """
+ @property
+ def name(self):
+ """
+ Get/set the name of the element
+ """
+ return self.get('name')
+
+ @name.setter
+ def name(self, value):
+ self.set('name', value)
+
+ @name.deleter
+ def name(self):
+ attrib = self.attrib
+ if 'name' in attrib:
+ del attrib['name']
+
+ def __repr__(self):
+ type_name = getattr(self, 'type', None)
+ if type_name:
+ type_name = ' type=%r' % type_name
+ else:
+ type_name = ''
+ return '<%s %x name=%r%s>' % (
+ self.__class__.__name__, id(self), self.name, type_name)
+
+
+class TextareaElement(InputMixin, HtmlElement):
+ """
+ ``<textarea>`` element. You can get the name with ``.name`` and
+ get/set the value with ``.value``
+ """
+ @property
+ def value(self):
+ """
+ Get/set the value (which is the contents of this element)
+ """
+ content = self.text or ''
+ if self.tag.startswith("{%s}" % XHTML_NAMESPACE):
+ serialisation_method = 'xml'
+ else:
+ serialisation_method = 'html'
+ for el in self:
+ # it's rare that we actually get here, so let's not use ''.join()
+ content += etree.tostring(
+ el, method=serialisation_method, encoding='unicode')
+ return content
+
+ @value.setter
+ def value(self, value):
+ del self[:]
+ self.text = value
+
+ @value.deleter
+ def value(self):
+ self.text = ''
+ del self[:]
+
+
+HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
+
+
+class SelectElement(InputMixin, HtmlElement):
+ """
+ ``<select>`` element. You can get the name with ``.name``.
+
+ ``.value`` will be the value of the selected option, unless this
+ is a multi-select element (``<select multiple>``), in which case
+ it will be a set-like object. In either case ``.value_options``
+ gives the possible values.
+
+ The boolean attribute ``.multiple`` shows if this is a
+ multi-select.
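+
+ For example (illustrative)::
+
+ >>> s = fromstring(
+ ... '<select name="x"><option value="a">A</option>'
+ ... '<option value="b" selected="selected">B</option></select>')
+ >>> s.value
+ 'b'
+ >>> s.value_options
+ ['a', 'b']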
+ """
+ @property
+ def value(self):
+ """
+ Get/set the value of this select (the selected option).
+
+ If this is a multi-select, this is a set-like object that
+ represents all the selected options.
+ """
+ if self.multiple:
+ return MultipleSelectOptions(self)
+ options = _options_xpath(self)
+
+ try:
+ selected_option = next(el for el in reversed(options) if el.get('selected') is not None)
+ except StopIteration:
+ try:
+ selected_option = next(el for el in options if el.get('disabled') is None)
+ except StopIteration:
+ return None
+ value = selected_option.get('value')
+ if value is None:
+ value = (selected_option.text or '').strip()
+ return value
+
+ @value.setter
+ def value(self, value):
+ if self.multiple:
+ if isinstance(value, basestring):
+ raise TypeError("You must pass in a sequence")
+ values = self.value
+ values.clear()
+ values.update(value)
+ return
+ checked_option = None
+ if value is not None:
+ for el in _options_xpath(self):
+ opt_value = el.get('value')
+ if opt_value is None:
+ opt_value = (el.text or '').strip()
+ if opt_value == value:
+ checked_option = el
+ break
+ else:
+ raise ValueError(
+ "There is no option with the value of %r" % value)
+ for el in _options_xpath(self):
+ if 'selected' in el.attrib:
+ del el.attrib['selected']
+ if checked_option is not None:
+ checked_option.set('selected', '')
+
+ @value.deleter
+ def value(self):
+ # FIXME: should del be allowed at all?
+ if self.multiple:
+ self.value.clear()
+ else:
+ self.value = None
+
+ @property
+ def value_options(self):
+ """
+ All the possible values this select can have (the ``value``
+ attribute of all the ``<option>`` elements).
+ """
+ options = []
+ for el in _options_xpath(self):
+ value = el.get('value')
+ if value is None:
+ value = (el.text or '').strip()
+ options.append(value)
+ return options
+
+ @property
+ def multiple(self):
+ """
+ Boolean attribute: is there a ``multiple`` attribute on this element.
+ """
+ return 'multiple' in self.attrib
+
+ @multiple.setter
+ def multiple(self, value):
+ if value:
+ self.set('multiple', '')
+ elif 'multiple' in self.attrib:
+ del self.attrib['multiple']
+
+
+HtmlElementClassLookup._default_element_classes['select'] = SelectElement
+
+
+class MultipleSelectOptions(SetMixin):
+ """
+ Represents all the selected options in a ``<select multiple>`` element.
+
+ You can add to this set-like object to select an option, or remove
+ to unselect the option.
+ """
+
+ def __init__(self, select):
+ self.select = select
+
+ @property
+ def options(self):
+ """
+ Iterator of all the ``<option>`` elements.
+ """
+ return iter(_options_xpath(self.select))
+
+ def __iter__(self):
+ for option in self.options:
+ if 'selected' in option.attrib:
+ opt_value = option.get('value')
+ if opt_value is None:
+ opt_value = (option.text or '').strip()
+ yield opt_value
+
+ def add(self, item):
+ for option in self.options:
+ opt_value = option.get('value')
+ if opt_value is None:
+ opt_value = (option.text or '').strip()
+ if opt_value == item:
+ option.set('selected', '')
+ break
+ else:
+ raise ValueError(
+ "There is no option with the value %r" % item)
+
+ def remove(self, item):
+ for option in self.options:
+ opt_value = option.get('value')
+ if opt_value is None:
+ opt_value = (option.text or '').strip()
+ if opt_value == item:
+ if 'selected' in option.attrib:
+ del option.attrib['selected']
+ else:
+ raise ValueError(
+ "The option %r is not currently selected" % item)
+ break
+ else:
+ raise ValueError(
+ "There is not option with the value %r" % item)
+
+ def __repr__(self):
+ return '<%s {%s} for select name=%r>' % (
+ self.__class__.__name__,
+ ', '.join([repr(v) for v in self]),
+ self.select.name)
+
+
+class RadioGroup(list):
+ """
+ This object represents several ``<input type=radio>`` elements
+ that have the same name.
+
+ You can use this like a list, but also use the property
+ ``.value`` to check/uncheck inputs. Also you can use
+ ``.value_options`` to get the possible values.
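+
+ For example (illustrative)::
+
+ >>> form = fromstring(
+ ... '<form><input type="radio" name="r" value="y" checked="checked">'
+ ... '<input type="radio" name="r" value="n"></form>')
+ >>> group = form.inputs['r']
+ >>> group.value
+ 'y'
+ >>> group.value = 'n'
+ >>> group.value
+ 'n'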
+ """
+ @property
+ def value(self):
+ """
+ Get/set the value, which checks the radio with that value (and
+ unchecks any other value).
+ """
+ for el in self:
+ if 'checked' in el.attrib:
+ return el.get('value')
+ return None
+
+ @value.setter
+ def value(self, value):
+ checked_option = None
+ if value is not None:
+ for el in self:
+ if el.get('value') == value:
+ checked_option = el
+ break
+ else:
+ raise ValueError("There is no radio input with the value %r" % value)
+ for el in self:
+ if 'checked' in el.attrib:
+ del el.attrib['checked']
+ if checked_option is not None:
+ checked_option.set('checked', '')
+
+ @value.deleter
+ def value(self):
+ self.value = None
+
+ @property
+ def value_options(self):
+ """
+ Returns a list of all the possible values.
+ """
+ return [el.get('value') for el in self]
+
+ def __repr__(self):
+ return '%s(%s)' % (
+ self.__class__.__name__,
+ list.__repr__(self))
+
+
+class CheckboxGroup(list):
+ """
+ Represents a group of checkboxes (``<input type=checkbox>``) that
+ have the same name.
+
+ In addition to using this like a list, the ``.value`` attribute
+ returns a set-like object that you can add to or remove from to
+ check and uncheck checkboxes. You can also use ``.value_options``
+ to get the possible values.
+ """
+ @property
+ def value(self):
+ """
+ Return a set-like object that can be modified to check or
+ uncheck individual checkboxes according to their value.
+ """
+ return CheckboxValues(self)
+
+ @value.setter
+ def value(self, value):
+ values = self.value
+ values.clear()
+ if not hasattr(value, '__iter__'):
+ raise ValueError(
+ "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
+ % (self[0].name, value))
+ values.update(value)
+
+ @value.deleter
+ def value(self):
+ self.value.clear()
+
+ @property
+ def value_options(self):
+ """
+ Returns a list of all the possible values.
+ """
+ return [el.get('value') for el in self]
+
+ def __repr__(self):
+ return '%s(%s)' % (
+ self.__class__.__name__, list.__repr__(self))
+
+
+class CheckboxValues(SetMixin):
+ """
+ Represents the values of the checked checkboxes in a group of
+ checkboxes with the same name.
+ """
+
+ def __init__(self, group):
+ self.group = group
+
+ def __iter__(self):
+ return iter([
+ el.get('value')
+ for el in self.group
+ if 'checked' in el.attrib])
+
+ def add(self, value):
+ for el in self.group:
+ if el.get('value') == value:
+ el.set('checked', '')
+ break
+ else:
+ raise KeyError("No checkbox with value %r" % value)
+
+ def remove(self, value):
+ for el in self.group:
+ if el.get('value') == value:
+ if 'checked' in el.attrib:
+ del el.attrib['checked']
+ else:
+ raise KeyError(
+ "The checkbox with value %r was already unchecked" % value)
+ break
+ else:
+ raise KeyError(
+ "No checkbox with value %r" % value)
+
+ def __repr__(self):
+ return '<%s {%s} for checkboxes name=%r>' % (
+ self.__class__.__name__,
+ ', '.join([repr(v) for v in self]),
+ self.group.name)
+
+
+class InputElement(InputMixin, HtmlElement):
+ """
+ Represents an ``<input>`` element.
+
+ You can get the type with ``.type`` (which is lower-cased and
+ defaults to ``'text'``).
+
+ Also you can get and set the value with ``.value``
+
+ Checkboxes and radios have the attribute ``input.checkable ==
+ True`` (for all others it is false) and a boolean attribute
+ ``.checked``.
+
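+ For example (illustrative)::
+
+ >>> el = fromstring('<input type="checkbox" name="t" checked="checked">')
+ >>> el.checkable, el.checked, el.value
+ (True, True, 'on')
+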
+ """
+
+ ## FIXME: I'm a little uncomfortable with the use of .checked
+ @property
+ def value(self):
+ """
+ Get/set the value of this element, using the ``value`` attribute.
+
+ Also, if this is a checkbox and it has no value, this defaults
+ to ``'on'``. If it is a checkbox or radio that is not
+ checked, this returns None.
+ """
+ if self.checkable:
+ if self.checked:
+ return self.get('value') or 'on'
+ else:
+ return None
+ return self.get('value')
+
+ @value.setter
+ def value(self, value):
+ if self.checkable:
+ if not value:
+ self.checked = False
+ else:
+ self.checked = True
+ if isinstance(value, basestring):
+ self.set('value', value)
+ else:
+ self.set('value', value)
+
+ @value.deleter
+ def value(self):
+ if self.checkable:
+ self.checked = False
+ else:
+ if 'value' in self.attrib:
+ del self.attrib['value']
+
+ @property
+ def type(self):
+ """
+ Return the type of this element (using the type attribute).
+ """
+ return self.get('type', 'text').lower()
+
+ @type.setter
+ def type(self, value):
+ self.set('type', value)
+
+ @property
+ def checkable(self):
+ """
+ Boolean: can this element be checked?
+ """
+ return self.type in ('checkbox', 'radio')
+
+ @property
+ def checked(self):
+ """
+ Boolean attribute to get/set the presence of the ``checked``
+ attribute.
+
+ You can only use this on checkable input types.
+ """
+ if not self.checkable:
+ raise AttributeError('Not a checkable input type')
+ return 'checked' in self.attrib
+
+ @checked.setter
+ def checked(self, value):
+ if not self.checkable:
+ raise AttributeError('Not a checkable input type')
+ if value:
+ self.set('checked', '')
+ else:
+ attrib = self.attrib
+ if 'checked' in attrib:
+ del attrib['checked']
+
+
+HtmlElementClassLookup._default_element_classes['input'] = InputElement
+
+
+class LabelElement(HtmlElement):
+ """
+ Represents a ``<label>`` element.
+
+ Label elements are linked to other elements with their ``for``
+ attribute. You can access this element with ``label.for_element``.
+ """
+ @property
+ def for_element(self):
+ """
+ Get/set the element this label points to. Return None if it
+ can't be found.
+ """
+ id = self.get('for')
+ if not id:
+ return None
+ return self.body.get_element_by_id(id)
+
+ @for_element.setter
+ def for_element(self, other):
+ id = other.get('id')
+ if not id:
+ raise TypeError(
+ "Element %r has no id attribute" % other)
+ self.set('for', id)
+
+ @for_element.deleter
+ def for_element(self):
+ attrib = self.attrib
+ if 'for' in attrib:
+ del attrib['for']
+
+
+HtmlElementClassLookup._default_element_classes['label'] = LabelElement
+
+
+############################################################
+## Serialization
+############################################################
+
+def html_to_xhtml(html):
+ """Convert all tags in an HTML tree to XHTML by moving them to the
+ XHTML namespace.
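+
+ For example (illustrative)::
+
+ >>> doc = document_fromstring('<p>Hello</p>')
+ >>> html_to_xhtml(doc)
+ >>> doc.tag
+ '{http://www.w3.org/1999/xhtml}html'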
+ """
+ try:
+ html = html.getroot()
+ except AttributeError:
+ pass
+ prefix = "{%s}" % XHTML_NAMESPACE
+ for el in html.iter(etree.Element):
+ tag = el.tag
+ if tag[0] != '{':
+ el.tag = prefix + tag
+
+
+def xhtml_to_html(xhtml):
+ """Convert all tags in an XHTML tree to HTML by removing their
+ XHTML namespace.
+ """
+ try:
+ xhtml = xhtml.getroot()
+ except AttributeError:
+ pass
+ prefix = "{%s}" % XHTML_NAMESPACE
+ prefix_len = len(prefix)
+ for el in xhtml.iter(prefix + "*"):
+ el.tag = el.tag[prefix_len:]
+
+
+# This isn't a general match, but it's a match for what libxml2
+# specifically serialises:
+__str_replace_meta_content_type = re.compile(
+ r'<meta http-equiv="Content-Type"[^>]*>').sub
+__bytes_replace_meta_content_type = re.compile(
+ r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub
+
+
+def tostring(doc, pretty_print=False, include_meta_content_type=False,
+ encoding=None, method="html", with_tail=True, doctype=None):
+ """Return an HTML string representation of the document.
+
+ Note: if ``include_meta_content_type`` is true, the ``<meta
+ http-equiv="Content-Type" ...>`` tag that libxml2 emits when
+ serialising a document is kept in the head; if it is false (the
+ default), any such tag is removed from the serialised output.
+
+ The ``encoding`` argument controls the output encoding (defaults to
+ ASCII, with &#...; character references for any characters outside
+ of ASCII). Note that you can pass the name ``'unicode'`` as
+ ``encoding`` argument to serialise to a Unicode string.
+
+ The ``method`` argument defines the output method. It defaults to
+ 'html', but can also be 'xml' for xhtml output, or 'text' to
+ serialise to plain text without markup.
+
+ To leave out the tail text of the top-level element that is being
+ serialised, pass ``with_tail=False``.
+
+ The ``doctype`` option allows passing in a plain string that will
+ be serialised before the XML tree. Note that passing in non
+ well-formed content here will make the XML output non well-formed.
+ Also, an existing doctype in the document tree will not be removed
+ when serialising an ElementTree instance.
+
+ Example::
+
+ >>> from lxml import html
+ >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')
+
+ >>> html.tostring(root)
+ b'<p>Hello<br>world!</p>'
+ >>> html.tostring(root, method='html')
+ b'<p>Hello<br>world!</p>'
+
+ >>> html.tostring(root, method='xml')
+ b'<p>Hello<br/>world!</p>'
+
+ >>> html.tostring(root, method='text')
+ b'Helloworld!'
+
+ >>> html.tostring(root, method='text', encoding='unicode')
+ u'Helloworld!'
+
+ >>> root = html.fragment_fromstring('<div><p>Hello<br>world!</p>TAIL</div>')
+ >>> html.tostring(root[0], method='text', encoding='unicode')
+ u'Helloworld!TAIL'
+
+ >>> html.tostring(root[0], method='text', encoding='unicode', with_tail=False)
+ u'Helloworld!'
+
+ >>> doc = html.document_fromstring('<p>Hello<br>world!</p>')
+ >>> html.tostring(doc, method='html', encoding='unicode')
+ u'<html><body><p>Hello<br>world!</p></body></html>'
+
+ >>> print(html.tostring(doc, method='html', encoding='unicode',
+ ... doctype='<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"'
+ ... ' "http://www.w3.org/TR/html4/strict.dtd">'))
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+ <html><body><p>Hello<br>world!</p></body></html>
+ """
+ html = etree.tostring(doc, method=method, pretty_print=pretty_print,
+ encoding=encoding, with_tail=with_tail,
+ doctype=doctype)
+ if method == 'html' and not include_meta_content_type:
+ if isinstance(html, str):
+ html = __str_replace_meta_content_type('', html)
+ else:
+ html = __bytes_replace_meta_content_type(bytes(), html)
+ return html
+
+
+tostring.__doc__ = __fix_docstring(tostring.__doc__)
+
+
+def open_in_browser(doc, encoding=None):
+ """
+ Open the HTML document in a web browser, saving it to a temporary
+ file to open it. Note that this does not delete the file after
+ use. This is mainly meant for debugging.
+ """
+ import os
+ import webbrowser
+ import tempfile
+ if not isinstance(doc, etree._ElementTree):
+ doc = etree.ElementTree(doc)
+ handle, fn = tempfile.mkstemp(suffix='.html')
+ f = os.fdopen(handle, 'wb')
+ try:
+ doc.write(f, method="html", encoding=encoding or doc.docinfo.encoding or "UTF-8")
+ finally:
+ # we leak the file itself here, but we should at least close it
+ f.close()
+ url = 'file://' + fn.replace(os.path.sep, '/')
+ print(url)
+ webbrowser.open(url)
+
+
+################################################################################
+# configure Element class lookup
+################################################################################
+
+class HTMLParser(etree.HTMLParser):
+ """An HTML parser that is configured to return lxml.html Element
+ objects.
+ """
+ def __init__(self, **kwargs):
+ super(HTMLParser, self).__init__(**kwargs)
+ self.set_element_class_lookup(HtmlElementClassLookup())
+
+
+class XHTMLParser(etree.XMLParser):
+ """An XML parser that is configured to return lxml.html Element
+ objects.
+
+ Note that this parser is not really XHTML aware unless you let it
+ load a DTD that declares the HTML entities. To do this, make sure
+ you have the XHTML DTDs installed in your catalogs, and create the
+ parser like this::
+
+ >>> parser = XHTMLParser(load_dtd=True)
+
+ If you additionally want to validate the document, use this::
+
+ >>> parser = XHTMLParser(dtd_validation=True)
+
+ For catalog support, see http://www.xmlsoft.org/catalog.html.
+ """
+ def __init__(self, **kwargs):
+ super(XHTMLParser, self).__init__(**kwargs)
+ self.set_element_class_lookup(HtmlElementClassLookup())
+
+
+def Element(*args, **kw):
+ """Create a new HTML Element.
+
+ This can also be used for XHTML documents.
+ """
+ v = html_parser.makeelement(*args, **kw)
+ return v
+
+
+html_parser = HTMLParser()
+xhtml_parser = XHTMLParser()
diff --git a/src/lxml/html/_diffcommand.py b/src/lxml/html/_diffcommand.py
new file mode 100644
index 0000000..e0502c0
--- /dev/null
+++ b/src/lxml/html/_diffcommand.py
@@ -0,0 +1,88 @@
+from __future__ import absolute_import
+
+import optparse
+import sys
+import re
+import os
+from .diff import htmldiff
+
+description = """\
+"""
+
+parser = optparse.OptionParser(
+ usage="%prog [OPTIONS] FILE1 FILE2\n"
+ "%prog --annotate [OPTIONS] INFO1 FILE1 INFO2 FILE2 ...",
+ description=description,
+ )
+
+parser.add_option(
+ '-o', '--output',
+ metavar="FILE",
+ dest="output",
+ default="-",
+ help="File to write the difference to",
+ )
+
+parser.add_option(
+ '-a', '--annotation',
+ action="store_true",
+ dest="annotation",
+ help="Do an annotation")
+
+def main(args=None):
+ if args is None:
+ args = sys.argv[1:]
+ options, args = parser.parse_args(args)
+ if options.annotation:
+ return annotate(options, args)
+ if len(args) != 2:
+ print('Error: you must give two files')
+ parser.print_help()
+ sys.exit(1)
+ file1, file2 = args
+ input1 = read_file(file1)
+ input2 = read_file(file2)
+ body1 = split_body(input1)[1]
+ pre, body2, post = split_body(input2)
+ result = htmldiff(body1, body2)
+ result = pre + result + post
+ if options.output == '-':
+ if not result.endswith('\n'):
+ result += '\n'
+ sys.stdout.write(result)
+ else:
+ with open(options.output, 'wb') as f:
+ f.write(result)
+
+def read_file(filename):
+ if filename == '-':
+ c = sys.stdin.read()
+ elif not os.path.exists(filename):
+ raise OSError(
+ "Input file %s does not exist" % filename)
+ else:
+ with open(filename, 'rb') as f:
+ c = f.read()
+ return c
+
+body_start_re = re.compile(
+ r"<body.*?>", re.I|re.S)
+body_end_re = re.compile(
+ r"</body.*?>", re.I|re.S)
+
+def split_body(html):
+ pre = post = ''
+ match = body_start_re.search(html)
+ if match:
+ pre = html[:match.end()]
+ html = html[match.end():]
+ match = body_end_re.search(html)
+ if match:
+ post = html[match.start():]
+ html = html[:match.start()]
+ return pre, html, post
+
+def annotate(options, args):
+ print("Not yet implemented")
+ sys.exit(1)
+
diff --git a/src/lxml/html/_html5builder.py b/src/lxml/html/_html5builder.py
new file mode 100644
index 0000000..3405c20
--- /dev/null
+++ b/src/lxml/html/_html5builder.py
@@ -0,0 +1,100 @@
+"""
+Legacy module - don't use in new code!
+
+html5lib now has its own proper implementation.
+
+This module implements a tree builder for html5lib that generates lxml
+html element trees. This module uses camelCase as it follows the
+html5lib style guide.
+"""
+
+from html5lib.treebuilders import _base, etree as etree_builders
+from lxml import html, etree
+
+
+class DocumentType(object):
+
+ def __init__(self, name, publicId, systemId):
+ self.name = name
+ self.publicId = publicId
+ self.systemId = systemId
+
+class Document(object):
+
+ def __init__(self):
+ self._elementTree = None
+ self.childNodes = []
+
+ def appendChild(self, element):
+ self._elementTree.getroot().addnext(element._element)
+
+
+class TreeBuilder(_base.TreeBuilder):
+ documentClass = Document
+ doctypeClass = DocumentType
+ elementClass = None
+ commentClass = None
+ fragmentClass = Document
+
+ def __init__(self, *args, **kwargs):
+ html_builder = etree_builders.getETreeModule(html, fullTree=False)
+ etree_builder = etree_builders.getETreeModule(etree, fullTree=False)
+ self.elementClass = html_builder.Element
+ self.commentClass = etree_builder.Comment
+ _base.TreeBuilder.__init__(self, *args, **kwargs)
+
+ def reset(self):
+ _base.TreeBuilder.reset(self)
+ self.rootInserted = False
+ self.initialComments = []
+ self.doctype = None
+
+ def getDocument(self):
+ return self.document._elementTree
+
+ def getFragment(self):
+ fragment = []
+ element = self.openElements[0]._element
+ if element.text:
+ fragment.append(element.text)
+ fragment.extend(element.getchildren())
+ if element.tail:
+ fragment.append(element.tail)
+ return fragment
+
+ def insertDoctype(self, name, publicId, systemId):
+ doctype = self.doctypeClass(name, publicId, systemId)
+ self.doctype = doctype
+
+ def insertComment(self, data, parent=None):
+ if not self.rootInserted:
+ self.initialComments.append(data)
+ else:
+ _base.TreeBuilder.insertComment(self, data, parent)
+
+ def insertRoot(self, name):
+ buf = []
+ if self.doctype and self.doctype.name:
+ buf.append('<!DOCTYPE %s' % self.doctype.name)
+ if self.doctype.publicId is not None or self.doctype.systemId is not None:
+ buf.append(' PUBLIC "%s" "%s"' % (self.doctype.publicId,
+ self.doctype.systemId))
+ buf.append('>')
+ buf.append('<html></html>')
+ root = html.fromstring(''.join(buf))
+
+ # Append the initial comments:
+ for comment in self.initialComments:
+ root.addprevious(etree.Comment(comment))
+
+ # Create the root document and add the ElementTree to it
+ self.document = self.documentClass()
+ self.document._elementTree = root.getroottree()
+
+ # Add the root element to the internal child/open data structures
+ root_element = self.elementClass(name)
+ root_element._element = root
+ self.document.childNodes.append(root_element)
+ self.openElements.append(root_element)
+
+ self.rootInserted = True
diff --git a/src/lxml/html/_setmixin.py b/src/lxml/html/_setmixin.py
new file mode 100644
index 0000000..c99738e
--- /dev/null
+++ b/src/lxml/html/_setmixin.py
@@ -0,0 +1,56 @@
+try:
+ from collections.abc import MutableSet
+except ImportError:
+ from collections import MutableSet
+
+
+class SetMixin(MutableSet):
+
+ """
+ Mix-in for sets. You must define __iter__, add, remove
+ """
+
+ def __len__(self):
+ length = 0
+ for item in self:
+ length += 1
+ return length
+
+ def __contains__(self, item):
+ for has_item in self:
+ if item == has_item:
+ return True
+ return False
+
+ issubset = MutableSet.__le__
+ issuperset = MutableSet.__ge__
+
+ union = MutableSet.__or__
+ intersection = MutableSet.__and__
+ difference = MutableSet.__sub__
+ symmetric_difference = MutableSet.__xor__
+
+ def copy(self):
+ return set(self)
+
+ def update(self, other):
+ self |= other
+
+ def intersection_update(self, other):
+ self &= other
+
+ def difference_update(self, other):
+ self -= other
+
+ def symmetric_difference_update(self, other):
+ self ^= other
+
+ def discard(self, item):
+ try:
+ self.remove(item)
+ except KeyError:
+ pass
+
+ @classmethod
+ def _from_iterable(cls, it):
+ return set(it)
diff --git a/src/lxml/html/builder.py b/src/lxml/html/builder.py
new file mode 100644
index 0000000..8a074ec
--- /dev/null
+++ b/src/lxml/html/builder.py
@@ -0,0 +1,133 @@
+# --------------------------------------------------------------------
+# The ElementTree toolkit is
+# Copyright (c) 1999-2004 by Fredrik Lundh
+# --------------------------------------------------------------------
+
+"""
+A set of HTML generator tags for building HTML documents.
+
+Usage::
+
+ >>> from lxml.html.builder import *
+ >>> html = HTML(
+ ... HEAD( TITLE("Hello World") ),
+ ... BODY( CLASS("main"),
+ ... H1("Hello World !")
+ ... )
+ ... )
+
+ >>> import lxml.etree
+ >>> print lxml.etree.tostring(html, pretty_print=True)
+ <html>
+ <head>
+ <title>Hello World</title>
+ </head>
+ <body class="main">
+ <h1>Hello World !</h1>
+ </body>
+ </html>
+
+"""
+
+from lxml.builder import ElementMaker
+from lxml.html import html_parser
+
+E = ElementMaker(makeelement=html_parser.makeelement)
+
+# elements
+A = E.a #: anchor
+ABBR = E.abbr #: abbreviated form (e.g., WWW, HTTP, etc.)
+ACRONYM = E.acronym #:
+ADDRESS = E.address #: information on author
+APPLET = E.applet #: Java applet (DEPRECATED)
+AREA = E.area #: client-side image map area
+B = E.b #: bold text style
+BASE = E.base #: document base URI
+BASEFONT = E.basefont #: base font size (DEPRECATED)
+BDO = E.bdo #: I18N BiDi over-ride
+BIG = E.big #: large text style
+BLOCKQUOTE = E.blockquote #: long quotation
+BODY = E.body #: document body
+BR = E.br #: forced line break
+BUTTON = E.button #: push button
+CAPTION = E.caption #: table caption
+CENTER = E.center #: shorthand for DIV align=center (DEPRECATED)
+CITE = E.cite #: citation
+CODE = E.code #: computer code fragment
+COL = E.col #: table column
+COLGROUP = E.colgroup #: table column group
+DD = E.dd #: definition description
+DEL = getattr(E, 'del') #: deleted text
+DFN = E.dfn #: instance definition
+DIR = E.dir #: directory list (DEPRECATED)
+DIV = E.div #: generic language/style container
+DL = E.dl #: definition list
+DT = E.dt #: definition term
+EM = E.em #: emphasis
+FIELDSET = E.fieldset #: form control group
+FONT = E.font #: local change to font (DEPRECATED)
+FORM = E.form #: interactive form
+FRAME = E.frame #: subwindow
+FRAMESET = E.frameset #: window subdivision
+H1 = E.h1 #: heading
+H2 = E.h2 #: heading
+H3 = E.h3 #: heading
+H4 = E.h4 #: heading
+H5 = E.h5 #: heading
+H6 = E.h6 #: heading
+HEAD = E.head #: document head
+HR = E.hr #: horizontal rule
+HTML = E.html #: document root element
+I = E.i #: italic text style
+IFRAME = E.iframe #: inline subwindow
+IMG = E.img #: Embedded image
+INPUT = E.input #: form control
+INS = E.ins #: inserted text
+ISINDEX = E.isindex #: single line prompt (DEPRECATED)
+KBD = E.kbd #: text to be entered by the user
+LABEL = E.label #: form field label text
+LEGEND = E.legend #: fieldset legend
+LI = E.li #: list item
+LINK = E.link #: a media-independent link
+MAP = E.map #: client-side image map
+MENU = E.menu #: menu list (DEPRECATED)
+META = E.meta #: generic metainformation
+NOFRAMES = E.noframes #: alternate content container for non frame-based rendering
+NOSCRIPT = E.noscript #: alternate content container for non script-based rendering
+OBJECT = E.object #: generic embedded object
+OL = E.ol #: ordered list
+OPTGROUP = E.optgroup #: option group
+OPTION = E.option #: selectable choice
+P = E.p #: paragraph
+PARAM = E.param #: named property value
+PRE = E.pre #: preformatted text
+Q = E.q #: short inline quotation
+S = E.s #: strike-through text style (DEPRECATED)
+SAMP = E.samp #: sample program output, scripts, etc.
+SCRIPT = E.script #: script statements
+SELECT = E.select #: option selector
+SMALL = E.small #: small text style
+SPAN = E.span #: generic language/style container
+STRIKE = E.strike #: strike-through text (DEPRECATED)
+STRONG = E.strong #: strong emphasis
+STYLE = E.style #: style info
+SUB = E.sub #: subscript
+SUP = E.sup #: superscript
+TABLE = E.table #:
+TBODY = E.tbody #: table body
+TD = E.td #: table data cell
+TEXTAREA = E.textarea #: multi-line text field
+TFOOT = E.tfoot #: table footer
+TH = E.th #: table header cell
+THEAD = E.thead #: table header
+TITLE = E.title #: document title
+TR = E.tr #: table row
+TT = E.tt #: teletype or monospaced text style
+U = E.u #: underlined text style (DEPRECATED)
+UL = E.ul #: unordered list
+VAR = E.var #: instance of a variable or program argument
+
+# attributes (only reserved words are included here)
+ATTR = dict
+def CLASS(v): return {'class': v}
+def FOR(v): return {'for': v}
diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py
new file mode 100644
index 0000000..0494357
--- /dev/null
+++ b/src/lxml/html/clean.py
@@ -0,0 +1,779 @@
+# cython: language_level=3str
+
+"""A cleanup tool for HTML.
+
+Removes unwanted tags and content. See the `Cleaner` class for
+details.
+"""
+
+from __future__ import absolute_import
+
+import copy
+import re
+import sys
+try:
+ from urlparse import urlsplit
+ from urllib import unquote_plus
+except ImportError:
+ # Python 3
+ from urllib.parse import urlsplit, unquote_plus
+from lxml import etree
+from lxml.html import defs
+from lxml.html import fromstring, XHTML_NAMESPACE
+from lxml.html import xhtml_to_html, _transform_result
+
+try:
+ unichr
+except NameError:
+ # Python 3
+ unichr = chr
+try:
+ unicode
+except NameError:
+ # Python 3
+ unicode = str
+try:
+ basestring
+except NameError:
+ basestring = (str, bytes)
+
+
+__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
+ 'word_break', 'word_break_html']
+
+# Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
+# Particularly the CSS cleaning; most of the tag cleaning is integrated now
+# I have multiple kinds of schemes searched; but should schemes be
+# whitelisted instead?
+# max height?
+# remove images? Also in CSS? background attribute?
+# Some way to whitelist object, iframe, etc (e.g., if you want to
+# allow *just* embedded YouTube movies)
+# Log what was deleted and why?
+# style="behavior: ..." might be bad in IE?
+# Should we have something for just <meta http-equiv>? That's the worst of the
+# metas.
+# UTF-7 detections? Example:
+# <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4-
+# you don't always have to have the charset set, if the page has no charset
+# and there's UTF7-like code in it.
+# Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php
+
+
+# This is an IE-specific construct you can have in a stylesheet to
+# run some Javascript:
+_replace_css_javascript = re.compile(
+ r'expression\s*\(.*?\)', re.S|re.I).sub
+
+# Do I have to worry about @\nimport?
+_replace_css_import = re.compile(
+ r'@\s*import', re.I).sub
+
+_looks_like_tag_content = re.compile(
+ r'</?[a-zA-Z]+|\son[a-zA-Z]+\s*=',
+ *((re.ASCII,) if sys.version_info[0] >= 3 else ())).search
+
+# All kinds of schemes besides just javascript: that can cause
+# execution:
+_is_image_dataurl = re.compile(
+ r'^data:image/.+;base64', re.I).search
+_is_possibly_malicious_scheme = re.compile(
+ r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):',
+ re.I).search
+def _is_javascript_scheme(s):
+ if _is_image_dataurl(s):
+ return None
+ return _is_possibly_malicious_scheme(s)
+
+_substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub
+# FIXME: should data: be blocked?
+
+# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
+_conditional_comment_re = re.compile(
+ r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)
+
+_find_styled_elements = etree.XPath(
+ "descendant-or-self::*[@style]")
+
+_find_external_links = etree.XPath(
+ ("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
+ "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
+ namespaces={'x':XHTML_NAMESPACE})
+
+
+class Cleaner(object):
+ """
+ Instances clean the document of each of the possible offending
+ elements. The cleaning is controlled by attributes; you can
+ override attributes in a subclass, or set them in the constructor.
+
+ ``scripts``:
+ Removes any ``<script>`` tags.
+
+ ``javascript``:
+ Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets
+ as they could contain Javascript.
+
+ ``comments``:
+ Removes any comments.
+
+ ``style``:
+ Removes any style tags.
+
+ ``inline_style``:
+ Removes any style attributes. Defaults to the value of the ``style`` option.
+
+ ``links``:
+ Removes any ``<link>`` tags
+
+ ``meta``:
+ Removes any ``<meta>`` tags
+
+ ``page_structure``:
+ Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
+
+ ``processing_instructions``:
+ Removes any processing instructions.
+
+ ``embedded``:
+ Removes any embedded objects (flash, iframes)
+
+ ``frames``:
+ Removes any frame-related tags
+
+ ``forms``:
+ Removes any form tags
+
+ ``annoying_tags``:
+ Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marquee>``
+
+ ``remove_tags``:
+ A list of tags to remove. Only the tags will be removed,
+ their content will get pulled up into the parent tag.
+
+ ``kill_tags``:
+ A list of tags to kill. Killing also removes the tag's content,
+ i.e. the whole subtree, not just the tag itself.
+
+ ``allow_tags``:
+ A list of tags to include (default include all).
+
+ ``remove_unknown_tags``:
+ Remove any tags that aren't standard parts of HTML.
+
+ ``safe_attrs_only``:
+ If true, only include 'safe' attributes (specifically the list
+ from the feedparser HTML sanitisation web site).
+
+ ``safe_attrs``:
+ A set of attribute names to override the default list of attributes
+ considered 'safe' (when safe_attrs_only=True).
+
+ ``add_nofollow``:
+ If true, then any <a> tags will have ``rel="nofollow"`` added to them.
+
+ ``host_whitelist``:
+ A list or set of hosts that you can use for embedded content
+ (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
+ You can also implement/override the method
+ ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
+ implement more complex rules for what can be embedded.
+ Anything that passes this test will be shown, regardless of
+ the value of (for instance) ``embedded``.
+
+ Note that this parameter might not work as intended if you do not
+ make the links absolute before doing the cleaning.
+
+ Note that you may also need to set ``whitelist_tags``.
+
+ ``whitelist_tags``:
+ A set of tags that can be included with ``host_whitelist``.
+ The default is ``iframe`` and ``embed``; you may wish to
+ include other tags like ``script``, or you may want to
+ implement ``allow_embedded_url`` for more control. Set to None to
+ include all tags.
+
+ This modifies the document *in place*.
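+
+ A typical use (illustrative; the input markup is made up)::
+
+ >>> cleaner = Cleaner(scripts=True, javascript=True)
+ >>> doc = fromstring('<div onclick="alert(0)"><script>evil()</script>text</div>')
+ >>> cleaner(doc)
+ >>> doc.find('script') is None
+ True
+ >>> doc.get('onclick') is None
+ True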
+ """
+
+ scripts = True
+ javascript = True
+ comments = True
+ style = False
+ inline_style = None
+ links = True
+ meta = True
+ page_structure = True
+ processing_instructions = True
+ embedded = True
+ frames = True
+ forms = True
+ annoying_tags = True
+ remove_tags = None
+ allow_tags = None
+ kill_tags = None
+ remove_unknown_tags = True
+ safe_attrs_only = True
+ safe_attrs = defs.safe_attrs
+ add_nofollow = False
+ host_whitelist = ()
+ whitelist_tags = {'iframe', 'embed'}
+
+ def __init__(self, **kw):
+ not_an_attribute = object()
+ for name, value in kw.items():
+ default = getattr(self, name, not_an_attribute)
+ if (default is not None and default is not True and default is not False
+ and not isinstance(default, (frozenset, set, tuple, list))):
+ raise TypeError(
+ "Unknown parameter: %s=%r" % (name, value))
+ setattr(self, name, value)
+ if self.inline_style is None and 'inline_style' not in kw:
+ self.inline_style = self.style
+
+ if kw.get("allow_tags"):
+ if kw.get("remove_unknown_tags"):
+ raise ValueError("It does not make sense to pass in both "
+ "allow_tags and remove_unknown_tags")
+ self.remove_unknown_tags = False
+
+ # Used to lookup the primary URL for a given tag that is up for
+ # removal:
+ _tag_link_attrs = dict(
+ script='src',
+ link='href',
+ # From: http://java.sun.com/j2se/1.4.2/docs/guide/misc/applet.html
+ # From what I can tell, both attributes can contain a link:
+ applet=['code', 'object'],
+ iframe='src',
+ embed='src',
+ layer='src',
+ # FIXME: there doesn't really seem like a general way to figure out what
+ # links an <object> tag uses; links often go in <param> tags with values
+ # that we don't really know. You'd have to have knowledge about specific
+ # kinds of plugins (probably keyed off classid), and match against those.
+ ##object=?,
+ # FIXME: not looking at the action currently, because it is more complex
+ # than that -- if you keep the form, you should keep the form controls.
+ ##form='action',
+ a='href',
+ )
+
+ def __call__(self, doc):
+ """
+ Cleans the document.
+ """
+ try:
+ getroot = doc.getroot
+ except AttributeError:
+ pass # Element instance
+ else:
+ doc = getroot() # ElementTree instance, instead of an element
+ # convert XHTML to HTML
+ xhtml_to_html(doc)
+ # Normalize a case where IE treats <image> like <img>, and that
+ # can confuse either this step or later steps.
+ for el in doc.iter('image'):
+ el.tag = 'img'
+ if not self.comments:
+ # Of course, if we were going to kill comments anyway, we don't
+ # need to worry about this
+ self.kill_conditional_comments(doc)
+
+ kill_tags = set(self.kill_tags or ())
+ remove_tags = set(self.remove_tags or ())
+ allow_tags = set(self.allow_tags or ())
+
+ if self.scripts:
+ kill_tags.add('script')
+ if self.safe_attrs_only:
+ safe_attrs = set(self.safe_attrs)
+ for el in doc.iter(etree.Element):
+ attrib = el.attrib
+ for aname in attrib.keys():
+ if aname not in safe_attrs:
+ del attrib[aname]
+ if self.javascript:
+ if not (self.safe_attrs_only and
+ self.safe_attrs == defs.safe_attrs):
+ # safe_attrs handles event attributes itself
+ for el in doc.iter(etree.Element):
+ attrib = el.attrib
+ for aname in attrib.keys():
+ if aname.startswith('on'):
+ del attrib[aname]
+ doc.rewrite_links(self._remove_javascript_link,
+ resolve_base_href=False)
+ # If we're deleting style then we don't have to remove JS links
+ # from styles, otherwise...
+ if not self.inline_style:
+ for el in _find_styled_elements(doc):
+ old = el.get('style')
+ new = _replace_css_javascript('', old)
+ new = _replace_css_import('', new)
+ if self._has_sneaky_javascript(new):
+ # Something tricky is going on...
+ del el.attrib['style']
+ elif new != old:
+ el.set('style', new)
+ if not self.style:
+ for el in list(doc.iter('style')):
+ if el.get('type', '').lower().strip() == 'text/javascript':
+ el.drop_tree()
+ continue
+ old = el.text or ''
+ new = _replace_css_javascript('', old)
+ # The imported CSS can do anything; we just can't allow:
+ new = _replace_css_import('', new)
+ if self._has_sneaky_javascript(new):
+ # Something tricky is going on...
+ el.text = '/* deleted */'
+ elif new != old:
+ el.text = new
+ if self.comments:
+ kill_tags.add(etree.Comment)
+ if self.processing_instructions:
+ kill_tags.add(etree.ProcessingInstruction)
+ if self.style:
+ kill_tags.add('style')
+ if self.inline_style:
+ etree.strip_attributes(doc, 'style')
+ if self.links:
+ kill_tags.add('link')
+ elif self.style or self.javascript:
+ # We must get rid of included stylesheets if Javascript is not
+ # allowed, as you can put Javascript in them
+ for el in list(doc.iter('link')):
+ if 'stylesheet' in el.get('rel', '').lower():
+ # Note this kills alternate stylesheets as well
+ if not self.allow_element(el):
+ el.drop_tree()
+ if self.meta:
+ kill_tags.add('meta')
+ if self.page_structure:
+ remove_tags.update(('head', 'html', 'title'))
+ if self.embedded:
+ # FIXME: is <layer> really embedded?
+ # We should get rid of any <param> tags not inside <applet>;
+ # These are not really valid anyway.
+ for el in list(doc.iter('param')):
+ parent = el.getparent()
+ while parent is not None and parent.tag not in ('applet', 'object'):
+ parent = parent.getparent()
+ if parent is None:
+ el.drop_tree()
+ kill_tags.update(('applet',))
+ # The alternate contents that are in an iframe are a good fallback:
+ remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
+ if self.frames:
+ # FIXME: ideally we should look at the frame links, but
+ # generally frames don't mix properly with an HTML
+ # fragment anyway.
+ kill_tags.update(defs.frame_tags)
+ if self.forms:
+ remove_tags.add('form')
+ kill_tags.update(('button', 'input', 'select', 'textarea'))
+ if self.annoying_tags:
+ remove_tags.update(('blink', 'marquee'))
+
+ _remove = []
+ _kill = []
+ for el in doc.iter():
+ if el.tag in kill_tags:
+ if self.allow_element(el):
+ continue
+ _kill.append(el)
+ elif el.tag in remove_tags:
+ if self.allow_element(el):
+ continue
+ _remove.append(el)
+
+ if _remove and _remove[0] == doc:
+ # We have to drop the parent-most tag, which we can't
+ # do. Instead we'll rewrite it:
+ el = _remove.pop(0)
+ el.tag = 'div'
+ el.attrib.clear()
+ elif _kill and _kill[0] == doc:
+ # We have to drop the parent-most element, which we can't
+ # do. Instead we'll clear it:
+ el = _kill.pop(0)
+ if el.tag != 'html':
+ el.tag = 'div'
+ el.clear()
+
+ _kill.reverse() # start with innermost tags
+ for el in _kill:
+ el.drop_tree()
+ for el in _remove:
+ el.drop_tag()
+
+ if self.remove_unknown_tags:
+ if allow_tags:
+ raise ValueError(
+ "It does not make sense to pass in both allow_tags and remove_unknown_tags")
+ allow_tags = set(defs.tags)
+ if allow_tags:
+ # make sure we do not remove comments/PIs if users want them (which is rare enough)
+ if not self.comments:
+ allow_tags.add(etree.Comment)
+ if not self.processing_instructions:
+ allow_tags.add(etree.ProcessingInstruction)
+
+ bad = []
+ for el in doc.iter():
+ if el.tag not in allow_tags:
+ bad.append(el)
+ if bad:
+ if bad[0] is doc:
+ el = bad.pop(0)
+ el.tag = 'div'
+ el.attrib.clear()
+ for el in bad:
+ el.drop_tag()
+ if self.add_nofollow:
+ for el in _find_external_links(doc):
+ if not self.allow_follow(el):
+ rel = el.get('rel')
+ if rel:
+ if ('nofollow' in rel
+ and ' nofollow ' in (' %s ' % rel)):
+ continue
+ rel = '%s nofollow' % rel
+ else:
+ rel = 'nofollow'
+ el.set('rel', rel)
+
+ def allow_follow(self, anchor):
+ """
+ Override to suppress rel="nofollow" on some anchors.
+ """
+ return False
+
+ def allow_element(self, el):
+ """
+ Decide whether an element is configured to be accepted or rejected.
+
+ :param el: an element.
+ :return: true to accept the element or false to reject/discard it.
+ """
+ if el.tag not in self._tag_link_attrs:
+ return False
+ attr = self._tag_link_attrs[el.tag]
+ if isinstance(attr, (list, tuple)):
+ for one_attr in attr:
+ url = el.get(one_attr)
+ if not url:
+ return False
+ if not self.allow_embedded_url(el, url):
+ return False
+ return True
+ else:
+ url = el.get(attr)
+ if not url:
+ return False
+ return self.allow_embedded_url(el, url)
+
+ def allow_embedded_url(self, el, url):
+ """
+ Decide whether a URL that was found in an element's attributes or text
+        is configured to be accepted or rejected.
+
+ :param el: an element.
+ :param url: a URL found on the element.
+ :return: true to accept the URL and false to reject it.
+ """
+ if self.whitelist_tags is not None and el.tag not in self.whitelist_tags:
+ return False
+ scheme, netloc, path, query, fragment = urlsplit(url)
+ netloc = netloc.lower().split(':', 1)[0]
+ if scheme not in ('http', 'https'):
+ return False
+ if netloc in self.host_whitelist:
+ return True
+ return False
+
+ def kill_conditional_comments(self, doc):
+ """
+ IE conditional comments basically embed HTML that the parser
+ doesn't normally see. We can't allow anything like that, so
+ we'll kill any comments that could be conditional.
+ """
+ has_conditional_comment = _conditional_comment_re.search
+ self._kill_elements(
+ doc, lambda el: has_conditional_comment(el.text),
+ etree.Comment)
+
+ def _kill_elements(self, doc, condition, iterate=None):
+ bad = []
+ for el in doc.iter(iterate):
+ if condition(el):
+ bad.append(el)
+ for el in bad:
+ el.drop_tree()
+
+ def _remove_javascript_link(self, link):
+ # links like "j a v a s c r i p t:" might be interpreted in IE
+ new = _substitute_whitespace('', unquote_plus(link))
+ if _is_javascript_scheme(new):
+ # FIXME: should this be None to delete?
+ return ''
+ return link
+
+ _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
+
+ def _has_sneaky_javascript(self, style):
+ """
+ Depending on the browser, stuff like ``e x p r e s s i o n(...)``
+ can get interpreted, or ``expre/* stuff */ssion(...)``. This
+        checks for attempts to do stuff like this.
+
+ Typically the response will be to kill the entire style; if you
+ have just a bit of Javascript in the style another rule will catch
+ that and remove only the Javascript from the style; this catches
+ more sneaky attempts.
+ """
+ style = self._substitute_comments('', style)
+ style = style.replace('\\', '')
+ style = _substitute_whitespace('', style)
+ style = style.lower()
+ if 'javascript:' in style:
+ return True
+ if 'expression(' in style:
+ return True
+ if '</noscript' in style:
+ # e.g. '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
+ return True
+ if _looks_like_tag_content(style):
+ # e.g. '<math><style><img src=x onerror=alert(1)></style></math>'
+ return True
+ return False
+
+ def clean_html(self, html):
+ result_type = type(html)
+ if isinstance(html, basestring):
+ doc = fromstring(html)
+ else:
+ doc = copy.deepcopy(html)
+ self(doc)
+ return _transform_result(result_type, doc)
+
+clean = Cleaner()
+clean_html = clean.clean_html
+
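+# A minimal usage sketch for the two entry points above (the exact output
+# markup can vary slightly between lxml versions):
+#
+#   from lxml.html.clean import Cleaner, clean_html
+#
+#   # module-level convenience function, using the default Cleaner settings:
+#   clean_html('<p onclick="alert(1)">Hi<script>evil()</script></p>')
+#   # -> roughly '<p>Hi</p>': the script tag and the on* attribute are removed
+#
+#   # a custom Cleaner that keeps embedded content (e.g. iframes) only when
+#   # the embedded URL's host is whitelisted:
+#   cleaner = Cleaner(host_whitelist=['www.youtube.com'])
+#   safe = cleaner.clean_html(untrusted_markup)   # untrusted_markup: any HTML string
+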
+############################################################
+## Autolinking
+############################################################
+
+_link_regexes = [
+ re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
+ # This is conservative, but autolinking can be a bit conservative:
+ re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I),
+ ]
+
+_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']
+
+_avoid_hosts = [
+ re.compile(r'^localhost', re.I),
+ re.compile(r'\bexample\.(?:com|org|net)$', re.I),
+ re.compile(r'^127\.0\.0\.1$'),
+ ]
+
+_avoid_classes = ['nolink']
+
+def autolink(el, link_regexes=_link_regexes,
+ avoid_elements=_avoid_elements,
+ avoid_hosts=_avoid_hosts,
+ avoid_classes=_avoid_classes):
+ """
+ Turn any URLs into links.
+
+ It will search for links identified by the given regular
+ expressions (by default mailto and http(s) links).
+
+ It won't link text in an element in avoid_elements, or an element
+ with a class in avoid_classes. It won't link to anything with a
+ host that matches one of the regular expressions in avoid_hosts
+    (by default localhost, 127.0.0.1 and example.com/org/net).
+
+ If you pass in an element, the element's tail will not be
+ substituted, only the contents of the element.
+ """
+ if el.tag in avoid_elements:
+ return
+ class_name = el.get('class')
+ if class_name:
+ class_name = class_name.split()
+ for match_class in avoid_classes:
+ if match_class in class_name:
+ return
+ for child in list(el):
+ autolink(child, link_regexes=link_regexes,
+ avoid_elements=avoid_elements,
+ avoid_hosts=avoid_hosts,
+ avoid_classes=avoid_classes)
+ if child.tail:
+ text, tail_children = _link_text(
+ child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
+ if tail_children:
+ child.tail = text
+ index = el.index(child)
+ el[index+1:index+1] = tail_children
+ if el.text:
+ text, pre_children = _link_text(
+ el.text, link_regexes, avoid_hosts, factory=el.makeelement)
+ if pre_children:
+ el.text = text
+ el[:0] = pre_children
+
+def _link_text(text, link_regexes, avoid_hosts, factory):
+ leading_text = ''
+ links = []
+ last_pos = 0
+ while 1:
+ best_match, best_pos = None, None
+ for regex in link_regexes:
+ regex_pos = last_pos
+ while 1:
+ match = regex.search(text, pos=regex_pos)
+ if match is None:
+ break
+ host = match.group('host')
+ for host_regex in avoid_hosts:
+ if host_regex.search(host):
+ regex_pos = match.end()
+ break
+ else:
+ break
+ if match is None:
+ continue
+ if best_pos is None or match.start() < best_pos:
+ best_match = match
+ best_pos = match.start()
+ if best_match is None:
+ # No more matches
+ if links:
+ assert not links[-1].tail
+ links[-1].tail = text
+ else:
+ assert not leading_text
+ leading_text = text
+ break
+ link = best_match.group(0)
+ end = best_match.end()
+ if link.endswith('.') or link.endswith(','):
+ # These punctuation marks shouldn't end a link
+ end -= 1
+ link = link[:-1]
+ prev_text = text[:best_match.start()]
+ if links:
+ assert not links[-1].tail
+ links[-1].tail = prev_text
+ else:
+ assert not leading_text
+ leading_text = prev_text
+ anchor = factory('a')
+ anchor.set('href', link)
+ body = best_match.group('body')
+ if not body:
+ body = link
+ if body.endswith('.') or body.endswith(','):
+ body = body[:-1]
+ anchor.text = body
+ links.append(anchor)
+ text = text[end:]
+ return leading_text, links
+
+def autolink_html(html, *args, **kw):
+ result_type = type(html)
+ if isinstance(html, basestring):
+ doc = fromstring(html)
+ else:
+ doc = copy.deepcopy(html)
+ autolink(doc, *args, **kw)
+ return _transform_result(result_type, doc)
+
+autolink_html.__doc__ = autolink.__doc__
+
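+# A small sketch of how autolink_html is typically used:
+#
+#   from lxml.html.clean import autolink_html
+#   autolink_html('<p>Docs at http://lxml.de/ and mailto:foo@example.com</p>')
+#   # -> the http URL gets wrapped in an <a href="..."> element, while the
+#   #    mailto address is skipped because example.com matches _avoid_hosts
+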
+############################################################
+## Word wrapping
+############################################################
+
+_avoid_word_break_elements = ['pre', 'textarea', 'code']
+_avoid_word_break_classes = ['nobreak']
+
+def word_break(el, max_width=40,
+ avoid_elements=_avoid_word_break_elements,
+ avoid_classes=_avoid_word_break_classes,
+ break_character=unichr(0x200b)):
+ """
+ Breaks any long words found in the body of the text (not attributes).
+
+    Doesn't affect any of the tags in avoid_elements, by default
+    ``<pre>``, ``<textarea>`` and ``<code>``.
+
+    Breaks words by inserting &#8203;, which is the Unicode Zero Width
+    Space character. This generally takes up no space in rendering, but
+    does copy as a space, and in monospace contexts usually takes up
+    space.
+
+    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion.
+ """
+ # Character suggestion of &#8203 comes from:
+ # http://www.cs.tut.fi/~jkorpela/html/nobr.html
+    if el.tag in avoid_elements:
+ return
+ class_name = el.get('class')
+ if class_name:
+ dont_break = False
+ class_name = class_name.split()
+ for avoid in avoid_classes:
+ if avoid in class_name:
+ dont_break = True
+ break
+ if dont_break:
+ return
+ if el.text:
+ el.text = _break_text(el.text, max_width, break_character)
+ for child in el:
+ word_break(child, max_width=max_width,
+ avoid_elements=avoid_elements,
+ avoid_classes=avoid_classes,
+ break_character=break_character)
+ if child.tail:
+ child.tail = _break_text(child.tail, max_width, break_character)
+
+def word_break_html(html, *args, **kw):
+ result_type = type(html)
+ doc = fromstring(html)
+ word_break(doc, *args, **kw)
+ return _transform_result(result_type, doc)
+
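+# Sketch: breaking an over-long token so that browsers can wrap it:
+#
+#   from lxml.html.clean import word_break_html
+#   word_break_html('<p>AVeryLongUnbreakableIdentifierThatWouldOtherwiseOverflow</p>',
+#                   max_width=10)
+#   # -> the same markup with U+200B inserted roughly every 10 characters
+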
+def _break_text(text, max_width, break_character):
+ words = text.split()
+ for word in words:
+ if len(word) > max_width:
+ replacement = _insert_break(word, max_width, break_character)
+ text = text.replace(word, replacement)
+ return text
+
+_break_prefer_re = re.compile(r'[^a-z]', re.I)
+
+def _insert_break(word, width, break_character):
+ orig_word = word
+ result = ''
+ while len(word) > width:
+ start = word[:width]
+ breaks = list(_break_prefer_re.finditer(start))
+ if breaks:
+ last_break = breaks[-1]
+ # Only walk back up to 10 characters to find a nice break:
+ if last_break.end() > width-10:
+ # FIXME: should the break character be at the end of the
+ # chunk, or the beginning of the next chunk?
+ start = word[:last_break.end()]
+ result += start + break_character
+ word = word[len(start):]
+ result += word
+ return result
+
diff --git a/src/lxml/html/defs.py b/src/lxml/html/defs.py
new file mode 100644
index 0000000..2058ea3
--- /dev/null
+++ b/src/lxml/html/defs.py
@@ -0,0 +1,135 @@
+# FIXME: this should all be confirmed against what a DTD says
+# (probably in a test; this may not match the DTD exactly, but we
+# should document just how it differs).
+
+"""
+Data taken from https://www.w3.org/TR/html401/index/elements.html
+and https://www.w3.org/community/webed/wiki/HTML/New_HTML5_Elements
+for html5_tags.
+"""
+
+empty_tags = frozenset([
+ 'area', 'base', 'basefont', 'br', 'col', 'frame', 'hr',
+ 'img', 'input', 'isindex', 'link', 'meta', 'param', 'source', 'track'])
+
+deprecated_tags = frozenset([
+ 'applet', 'basefont', 'center', 'dir', 'font', 'isindex',
+ 'menu', 's', 'strike', 'u'])
+
+# archive actually takes a space-separated list of URIs
+link_attrs = frozenset([
+ 'action', 'archive', 'background', 'cite', 'classid',
+ 'codebase', 'data', 'href', 'longdesc', 'profile', 'src',
+ 'usemap',
+ # Not standard:
+ 'dynsrc', 'lowsrc',
+ # HTML5 formaction
+ 'formaction'
+ ])
+
+# Not in the HTML 4 spec:
+# onerror, onresize
+event_attrs = frozenset([
+ 'onblur', 'onchange', 'onclick', 'ondblclick', 'onerror',
+ 'onfocus', 'onkeydown', 'onkeypress', 'onkeyup', 'onload',
+ 'onmousedown', 'onmousemove', 'onmouseout', 'onmouseover',
+ 'onmouseup', 'onreset', 'onresize', 'onselect', 'onsubmit',
+ 'onunload',
+ ])
+
+safe_attrs = frozenset([
+ 'abbr', 'accept', 'accept-charset', 'accesskey', 'action', 'align',
+ 'alt', 'axis', 'border', 'cellpadding', 'cellspacing', 'char', 'charoff',
+ 'charset', 'checked', 'cite', 'class', 'clear', 'cols', 'colspan',
+ 'color', 'compact', 'coords', 'datetime', 'dir', 'disabled', 'enctype',
+ 'for', 'frame', 'headers', 'height', 'href', 'hreflang', 'hspace', 'id',
+ 'ismap', 'label', 'lang', 'longdesc', 'maxlength', 'media', 'method',
+ 'multiple', 'name', 'nohref', 'noshade', 'nowrap', 'prompt', 'readonly',
+ 'rel', 'rev', 'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape',
+ 'size', 'span', 'src', 'start', 'summary', 'tabindex', 'target', 'title',
+ 'type', 'usemap', 'valign', 'value', 'vspace', 'width'])
+
+# From http://htmlhelp.com/reference/html40/olist.html
+top_level_tags = frozenset([
+ 'html', 'head', 'body', 'frameset',
+ ])
+
+head_tags = frozenset([
+ 'base', 'isindex', 'link', 'meta', 'script', 'style', 'title',
+ ])
+
+general_block_tags = frozenset([
+ 'address',
+ 'blockquote',
+ 'center',
+ 'del',
+ 'div',
+ 'h1',
+ 'h2',
+ 'h3',
+ 'h4',
+ 'h5',
+ 'h6',
+ 'hr',
+ 'ins',
+ 'isindex',
+ 'noscript',
+ 'p',
+ 'pre',
+ ])
+
+list_tags = frozenset([
+ 'dir', 'dl', 'dt', 'dd', 'li', 'menu', 'ol', 'ul',
+ ])
+
+table_tags = frozenset([
+ 'table', 'caption', 'colgroup', 'col',
+ 'thead', 'tfoot', 'tbody', 'tr', 'td', 'th',
+ ])
+
+# just this one from
+# http://www.georgehernandez.com/h/XComputers/HTML/2BlockLevel.htm
+block_tags = general_block_tags | list_tags | table_tags | frozenset([
+ # Partial form tags
+ 'fieldset', 'form', 'legend', 'optgroup', 'option',
+ ])
+
+form_tags = frozenset([
+ 'form', 'button', 'fieldset', 'legend', 'input', 'label',
+ 'select', 'optgroup', 'option', 'textarea',
+ ])
+
+special_inline_tags = frozenset([
+ 'a', 'applet', 'basefont', 'bdo', 'br', 'embed', 'font', 'iframe',
+ 'img', 'map', 'area', 'object', 'param', 'q', 'script',
+ 'span', 'sub', 'sup',
+ ])
+
+phrase_tags = frozenset([
+ 'abbr', 'acronym', 'cite', 'code', 'del', 'dfn', 'em',
+ 'ins', 'kbd', 'samp', 'strong', 'var',
+ ])
+
+font_style_tags = frozenset([
+ 'b', 'big', 'i', 's', 'small', 'strike', 'tt', 'u',
+ ])
+
+frame_tags = frozenset([
+ 'frameset', 'frame', 'noframes',
+ ])
+
+html5_tags = frozenset([
+ 'article', 'aside', 'audio', 'canvas', 'command', 'datalist',
+ 'details', 'embed', 'figcaption', 'figure', 'footer', 'header',
+ 'hgroup', 'keygen', 'mark', 'math', 'meter', 'nav', 'output',
+ 'progress', 'rp', 'rt', 'ruby', 'section', 'source', 'summary',
+ 'svg', 'time', 'track', 'video', 'wbr'
+ ])
+
+# These tags aren't standard
+nonstandard_tags = frozenset(['blink', 'marquee'])
+
+
+tags = (top_level_tags | head_tags | general_block_tags | list_tags
+ | table_tags | form_tags | special_inline_tags | phrase_tags
+ | font_style_tags | nonstandard_tags | html5_tags)
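+
+# Sketch: these sets are plain frozensets, so callers can combine them freely,
+# e.g. to build an allow-list for the Cleaner in lxml.html.clean (a
+# hypothetical policy, not something this module prescribes):
+#
+#   from lxml.html import defs
+#   from lxml.html.clean import Cleaner
+#   cleaner = Cleaner(allow_tags=defs.tags - defs.form_tags - {'script'})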
diff --git a/src/lxml/html/diff.py b/src/lxml/html/diff.py
new file mode 100644
index 0000000..5d143bd
--- /dev/null
+++ b/src/lxml/html/diff.py
@@ -0,0 +1,884 @@
+# cython: language_level=3
+
+from __future__ import absolute_import
+
+import difflib
+from lxml import etree
+from lxml.html import fragment_fromstring
+import re
+
+__all__ = ['html_annotate', 'htmldiff']
+
+try:
+ from html import escape as html_escape
+except ImportError:
+ from cgi import escape as html_escape
+try:
+ _unicode = unicode
+except NameError:
+ # Python 3
+ _unicode = str
+try:
+ basestring
+except NameError:
+ # Python 3
+ basestring = str
+
+############################################################
+## Annotation
+############################################################
+
+def default_markup(text, version):
+ return '<span title="%s">%s</span>' % (
+ html_escape(_unicode(version), 1), text)
+
+def html_annotate(doclist, markup=default_markup):
+ """
+ doclist should be ordered from oldest to newest, like::
+
+ >>> version1 = 'Hello World'
+ >>> version2 = 'Goodbye World'
+ >>> print(html_annotate([(version1, 'version 1'),
+ ... (version2, 'version 2')]))
+ <span title="version 2">Goodbye</span> <span title="version 1">World</span>
+
+    The documents must be *fragments* (str/UTF8 or unicode), not
+    complete documents.
+
+    The markup argument is a function to mark up the spans of words.
+ This function is called like markup('Hello', 'version 2'), and
+ returns HTML. The first argument is text and never includes any
+ markup. The default uses a span with a title:
+
+ >>> print(default_markup('Some Text', 'by Joe'))
+ <span title="by Joe">Some Text</span>
+ """
+ # The basic strategy we have is to split the documents up into
+ # logical tokens (which are words with attached markup). We then
+ # do diffs of each of the versions to track when a token first
+ # appeared in the document; the annotation attached to the token
+ # is the version where it first appeared.
+ tokenlist = [tokenize_annotated(doc, version)
+ for doc, version in doclist]
+ cur_tokens = tokenlist[0]
+ for tokens in tokenlist[1:]:
+ html_annotate_merge_annotations(cur_tokens, tokens)
+ cur_tokens = tokens
+
+ # After we've tracked all the tokens, we can combine spans of text
+ # that are adjacent and have the same annotation
+ cur_tokens = compress_tokens(cur_tokens)
+ # And finally add markup
+ result = markup_serialize_tokens(cur_tokens, markup)
+ return ''.join(result).strip()
+
+def tokenize_annotated(doc, annotation):
+ """Tokenize a document and add an annotation attribute to each token
+ """
+ tokens = tokenize(doc, include_hrefs=False)
+ for tok in tokens:
+ tok.annotation = annotation
+ return tokens
+
+def html_annotate_merge_annotations(tokens_old, tokens_new):
+ """Merge the annotations from tokens_old into tokens_new, when the
+ tokens in the new document already existed in the old document.
+ """
+ s = InsensitiveSequenceMatcher(a=tokens_old, b=tokens_new)
+ commands = s.get_opcodes()
+
+ for command, i1, i2, j1, j2 in commands:
+ if command == 'equal':
+ eq_old = tokens_old[i1:i2]
+ eq_new = tokens_new[j1:j2]
+ copy_annotations(eq_old, eq_new)
+
+def copy_annotations(src, dest):
+ """
+ Copy annotations from the tokens listed in src to the tokens in dest
+ """
+ assert len(src) == len(dest)
+ for src_tok, dest_tok in zip(src, dest):
+ dest_tok.annotation = src_tok.annotation
+
+def compress_tokens(tokens):
+ """
+ Combine adjacent tokens when there is no HTML between the tokens,
+ and they share an annotation
+ """
+ result = [tokens[0]]
+ for tok in tokens[1:]:
+ if (not result[-1].post_tags and
+ not tok.pre_tags and
+ result[-1].annotation == tok.annotation):
+ compress_merge_back(result, tok)
+ else:
+ result.append(tok)
+ return result
+
+def compress_merge_back(tokens, tok):
+ """ Merge tok into the last element of tokens (modifying the list of
+ tokens in-place). """
+ last = tokens[-1]
+ if type(last) is not token or type(tok) is not token:
+ tokens.append(tok)
+ else:
+ text = _unicode(last)
+ if last.trailing_whitespace:
+ text += last.trailing_whitespace
+ text += tok
+ merged = token(text,
+ pre_tags=last.pre_tags,
+ post_tags=tok.post_tags,
+ trailing_whitespace=tok.trailing_whitespace)
+ merged.annotation = last.annotation
+ tokens[-1] = merged
+
+def markup_serialize_tokens(tokens, markup_func):
+ """
+ Serialize the list of tokens into a list of text chunks, calling
+ markup_func around text to add annotations.
+ """
+ for token in tokens:
+ for pre in token.pre_tags:
+ yield pre
+ html = token.html()
+ html = markup_func(html, token.annotation)
+ if token.trailing_whitespace:
+ html += token.trailing_whitespace
+ yield html
+ for post in token.post_tags:
+ yield post
+
+
+############################################################
+## HTML Diffs
+############################################################
+
+def htmldiff(old_html, new_html):
+ ## FIXME: this should take parsed documents too, and use their body
+ ## or other content.
+ """ Do a diff of the old and new document. The documents are HTML
+    *fragments* (str/UTF8 or unicode); they are not complete documents
+ (i.e., no <html> tag).
+
+ Returns HTML with <ins> and <del> tags added around the
+ appropriate text.
+
+ Markup is generally ignored, with the markup from new_html
+ preserved, and possibly some markup from old_html (though it is
+ considered acceptable to lose some of the old markup). Only the
+ words in the HTML are diffed. The exception is <img> tags, which
+ are treated like words, and the href attribute of <a> tags, which
+ are noted inside the tag itself when there are changes.
+ """
+ old_html_tokens = tokenize(old_html)
+ new_html_tokens = tokenize(new_html)
+ result = htmldiff_tokens(old_html_tokens, new_html_tokens)
+ result = ''.join(result).strip()
+ return fixup_ins_del_tags(result)
+
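+# Sketch of typical htmldiff usage (exact whitespace in the output may vary):
+#
+#   from lxml.html.diff import htmldiff
+#   htmldiff('<p>Hello World</p>', '<p>Goodbye World</p>')
+#   # -> roughly '<p><ins>Goodbye</ins> <del>Hello</del> World</p>'
+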
+def htmldiff_tokens(html1_tokens, html2_tokens):
+ """ Does a diff on the tokens themselves, returning a list of text
+ chunks (not tokens).
+ """
+ # There are several passes as we do the differences. The tokens
+ # isolate the portion of the content we care to diff; difflib does
+ # all the actual hard work at that point.
+ #
+ # Then we must create a valid document from pieces of both the old
+ # document and the new document. We generally prefer to take
+ # markup from the new document, and only do a best effort attempt
+ # to keep markup from the old document; anything that we can't
+ # resolve we throw away. Also we try to put the deletes as close
+ # to the location where we think they would have been -- because
+ # we are only keeping the markup from the new document, it can be
+ # fuzzy where in the new document the old text would have gone.
+ # Again we just do a best effort attempt.
+ s = InsensitiveSequenceMatcher(a=html1_tokens, b=html2_tokens)
+ commands = s.get_opcodes()
+ result = []
+ for command, i1, i2, j1, j2 in commands:
+ if command == 'equal':
+ result.extend(expand_tokens(html2_tokens[j1:j2], equal=True))
+ continue
+ if command == 'insert' or command == 'replace':
+ ins_tokens = expand_tokens(html2_tokens[j1:j2])
+ merge_insert(ins_tokens, result)
+ if command == 'delete' or command == 'replace':
+ del_tokens = expand_tokens(html1_tokens[i1:i2])
+ merge_delete(del_tokens, result)
+ # If deletes were inserted directly as <del> then we'd have an
+ # invalid document at this point. Instead we put in special
+ # markers, and when the complete diffed document has been created
+ # we try to move the deletes around and resolve any problems.
+ result = cleanup_delete(result)
+
+ return result
+
+def expand_tokens(tokens, equal=False):
+ """Given a list of tokens, return a generator of the chunks of
+ text for the data in the tokens.
+ """
+ for token in tokens:
+ for pre in token.pre_tags:
+ yield pre
+ if not equal or not token.hide_when_equal:
+ if token.trailing_whitespace:
+ yield token.html() + token.trailing_whitespace
+ else:
+ yield token.html()
+ for post in token.post_tags:
+ yield post
+
+def merge_insert(ins_chunks, doc):
+ """ doc is the already-handled document (as a list of text chunks);
+ here we add <ins>ins_chunks</ins> to the end of that. """
+ # Though we don't throw away unbalanced_start or unbalanced_end
+ # (we assume there is accompanying markup later or earlier in the
+ # document), we only put <ins> around the balanced portion.
+ unbalanced_start, balanced, unbalanced_end = split_unbalanced(ins_chunks)
+ doc.extend(unbalanced_start)
+ if doc and not doc[-1].endswith(' '):
+ # Fix up the case where the word before the insert didn't end with
+ # a space
+ doc[-1] += ' '
+ doc.append('<ins>')
+ if balanced and balanced[-1].endswith(' '):
+ # We move space outside of </ins>
+ balanced[-1] = balanced[-1][:-1]
+ doc.extend(balanced)
+ doc.append('</ins> ')
+ doc.extend(unbalanced_end)
+
+# These are sentinels to represent the start and end of a <del>
+# segment, until we do the cleanup phase to turn them into proper
+# markup:
+class DEL_START:
+ pass
+class DEL_END:
+ pass
+
+class NoDeletes(Exception):
+ """ Raised when the document no longer contains any pending deletes
+ (DEL_START/DEL_END) """
+
+def merge_delete(del_chunks, doc):
+ """ Adds the text chunks in del_chunks to the document doc (another
+ list of text chunks) with marker to show it is a delete.
+ cleanup_delete later resolves these markers into <del> tags."""
+ doc.append(DEL_START)
+ doc.extend(del_chunks)
+ doc.append(DEL_END)
+
+def cleanup_delete(chunks):
+ """ Cleans up any DEL_START/DEL_END markers in the document, replacing
+ them with <del></del>. To do this while keeping the document
+ valid, it may need to drop some tags (either start or end tags).
+
+ It may also move the del into adjacent tags to try to move it to a
+ similar location where it was originally located (e.g., moving a
+ delete into preceding <div> tag, if the del looks like (DEL_START,
+ 'Text</div>', DEL_END)"""
+ while 1:
+ # Find a pending DEL_START/DEL_END, splitting the document
+ # into stuff-preceding-DEL_START, stuff-inside, and
+ # stuff-following-DEL_END
+ try:
+ pre_delete, delete, post_delete = split_delete(chunks)
+ except NoDeletes:
+ # Nothing found, we've cleaned up the entire doc
+ break
+ # The stuff-inside-DEL_START/END may not be well balanced
+ # markup. First we figure out what unbalanced portions there are:
+ unbalanced_start, balanced, unbalanced_end = split_unbalanced(delete)
+ # Then we move the span forward and/or backward based on these
+ # unbalanced portions:
+ locate_unbalanced_start(unbalanced_start, pre_delete, post_delete)
+ locate_unbalanced_end(unbalanced_end, pre_delete, post_delete)
+ doc = pre_delete
+ if doc and not doc[-1].endswith(' '):
+ # Fix up case where the word before us didn't have a trailing space
+ doc[-1] += ' '
+ doc.append('<del>')
+ if balanced and balanced[-1].endswith(' '):
+ # We move space outside of </del>
+ balanced[-1] = balanced[-1][:-1]
+ doc.extend(balanced)
+ doc.append('</del> ')
+ doc.extend(post_delete)
+ chunks = doc
+ return chunks
+
+def split_unbalanced(chunks):
+ """Return (unbalanced_start, balanced, unbalanced_end), where each is
+ a list of text and tag chunks.
+
+ unbalanced_start is a list of all the tags that are opened, but
+ not closed in this span. Similarly, unbalanced_end is a list of
+ tags that are closed but were not opened. Extracting these might
+ mean some reordering of the chunks."""
+ start = []
+ end = []
+ tag_stack = []
+ balanced = []
+ for chunk in chunks:
+ if not chunk.startswith('<'):
+ balanced.append(chunk)
+ continue
+ endtag = chunk[1] == '/'
+ name = chunk.split()[0].strip('<>/')
+ if name in empty_tags:
+ balanced.append(chunk)
+ continue
+ if endtag:
+ if tag_stack and tag_stack[-1][0] == name:
+ balanced.append(chunk)
+ name, pos, tag = tag_stack.pop()
+ balanced[pos] = tag
+ elif tag_stack:
+ start.extend([tag for name, pos, tag in tag_stack])
+ tag_stack = []
+ end.append(chunk)
+ else:
+ end.append(chunk)
+ else:
+ tag_stack.append((name, len(balanced), chunk))
+ balanced.append(None)
+ start.extend(
+ [chunk for name, pos, chunk in tag_stack])
+ balanced = [chunk for chunk in balanced if chunk is not None]
+ return start, balanced, end
+
+def split_delete(chunks):
+ """ Returns (stuff_before_DEL_START, stuff_inside_DEL_START_END,
+ stuff_after_DEL_END). Returns the first case found (there may be
+ more DEL_STARTs in stuff_after_DEL_END). Raises NoDeletes if
+ there's no DEL_START found. """
+ try:
+ pos = chunks.index(DEL_START)
+ except ValueError:
+ raise NoDeletes
+ pos2 = chunks.index(DEL_END)
+ return chunks[:pos], chunks[pos+1:pos2], chunks[pos2+1:]
+
+def locate_unbalanced_start(unbalanced_start, pre_delete, post_delete):
+ """ pre_delete and post_delete implicitly point to a place in the
+ document (where the two were split). This moves that point (by
+ popping items from one and pushing them onto the other). It moves
+ the point to try to find a place where unbalanced_start applies.
+
+ As an example::
+
+ >>> unbalanced_start = ['<div>']
+ >>> doc = ['<p>', 'Text', '</p>', '<div>', 'More Text', '</div>']
+ >>> pre, post = doc[:3], doc[3:]
+ >>> pre, post
+ (['<p>', 'Text', '</p>'], ['<div>', 'More Text', '</div>'])
+ >>> locate_unbalanced_start(unbalanced_start, pre, post)
+ >>> pre, post
+ (['<p>', 'Text', '</p>', '<div>'], ['More Text', '</div>'])
+
+ As you can see, we moved the point so that the dangling <div> that
+ we found will be effectively replaced by the div in the original
+ document. If this doesn't work out, we just throw away
+ unbalanced_start without doing anything.
+ """
+ while 1:
+ if not unbalanced_start:
+ # We have totally succeeded in finding the position
+ break
+ finding = unbalanced_start[0]
+ finding_name = finding.split()[0].strip('<>')
+ if not post_delete:
+ break
+ next = post_delete[0]
+ if next is DEL_START or not next.startswith('<'):
+ # Reached a word, we can't move the delete text forward
+ break
+ if next[1] == '/':
+ # Reached a closing tag, can we go further? Maybe not...
+ break
+ name = next.split()[0].strip('<>')
+ if name == 'ins':
+ # Can't move into an insert
+ break
+ assert name != 'del', (
+ "Unexpected delete tag: %r" % next)
+ if name == finding_name:
+ unbalanced_start.pop(0)
+ pre_delete.append(post_delete.pop(0))
+ else:
+ # Found a tag that doesn't match
+ break
+
+def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
+ """ like locate_unbalanced_start, except handling end tags and
+ possibly moving the point earlier in the document. """
+ while 1:
+ if not unbalanced_end:
+ # Success
+ break
+ finding = unbalanced_end[-1]
+ finding_name = finding.split()[0].strip('<>/')
+ if not pre_delete:
+ break
+ next = pre_delete[-1]
+ if next is DEL_END or not next.startswith('</'):
+ # A word or a start tag
+ break
+ name = next.split()[0].strip('<>/')
+ if name == 'ins' or name == 'del':
+ # Can't move into an insert or delete
+ break
+ if name == finding_name:
+ unbalanced_end.pop()
+ post_delete.insert(0, pre_delete.pop())
+ else:
+ # Found a tag that doesn't match
+ break
+
+class token(_unicode):
+ """ Represents a diffable token, generally a word that is displayed to
+ the user. Opening tags are attached to this token when they are
+ adjacent (pre_tags) and closing tags that follow the word
+ (post_tags). Some exceptions occur when there are empty tags
+ adjacent to a word, so there may be close tags in pre_tags, or
+ open tags in post_tags.
+
+ We also keep track of whether the word was originally followed by
+ whitespace, even though we do not want to treat the word as
+ equivalent to a similar word that does not have a trailing
+ space."""
+
+ # When this is true, the token will be eliminated from the
+ # displayed diff if no change has occurred:
+ hide_when_equal = False
+
+ def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
+ obj = _unicode.__new__(cls, text)
+
+ if pre_tags is not None:
+ obj.pre_tags = pre_tags
+ else:
+ obj.pre_tags = []
+
+ if post_tags is not None:
+ obj.post_tags = post_tags
+ else:
+ obj.post_tags = []
+
+ obj.trailing_whitespace = trailing_whitespace
+
+ return obj
+
+ def __repr__(self):
+ return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags,
+ self.post_tags, self.trailing_whitespace)
+
+ def html(self):
+ return _unicode(self)
+
+class tag_token(token):
+
+ """ Represents a token that is actually a tag. Currently this is just
+ the <img> tag, which takes up visible space just like a word but
+ is only represented in a document by a tag. """
+
+ def __new__(cls, tag, data, html_repr, pre_tags=None,
+ post_tags=None, trailing_whitespace=""):
+        obj = token.__new__(cls, "%s: %s" % (tag, data),
+ pre_tags=pre_tags,
+ post_tags=post_tags,
+ trailing_whitespace=trailing_whitespace)
+ obj.tag = tag
+ obj.data = data
+ obj.html_repr = html_repr
+ return obj
+
+ def __repr__(self):
+ return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (
+ self.tag,
+ self.data,
+ self.html_repr,
+ self.pre_tags,
+ self.post_tags,
+ self.trailing_whitespace)
+ def html(self):
+ return self.html_repr
+
+class href_token(token):
+
+ """ Represents the href in an anchor tag. Unlike other words, we only
+ show the href when it changes. """
+
+ hide_when_equal = True
+
+ def html(self):
+ return ' Link: %s' % self
+
+def tokenize(html, include_hrefs=True):
+ """
+    Parses the given HTML and returns token objects (words with attached tags).
+
+ This parses only the content of a page; anything in the head is
+ ignored, and the <head> and <body> elements are themselves
+ optional. The content is then parsed by lxml, which ensures the
+ validity of the resulting parsed document (though lxml may make
+    incorrect guesses when the markup is particularly bad).
+
+ <ins> and <del> tags are also eliminated from the document, as
+ that gets confusing.
+
+ If include_hrefs is true, then the href attribute of <a> tags is
+ included as a special kind of diffable token."""
+ if etree.iselement(html):
+ body_el = html
+ else:
+ body_el = parse_html(html, cleanup=True)
+ # Then we split the document into text chunks for each tag, word, and end tag:
+ chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
+ # Finally re-joining them into token objects:
+ return fixup_chunks(chunks)
+
+def parse_html(html, cleanup=True):
+ """
+ Parses an HTML fragment, returning an lxml element. Note that the HTML will be
+ wrapped in a <div> tag that was not in the original document.
+
+ If cleanup is true, make sure there's no <head> or <body>, and get
+ rid of any <ins> and <del> tags.
+ """
+ if cleanup:
+ # This removes any extra markup or structure like <head>:
+ html = cleanup_html(html)
+ return fragment_fromstring(html, create_parent=True)
+
+_body_re = re.compile(r'<body.*?>', re.I|re.S)
+_end_body_re = re.compile(r'</body.*?>', re.I|re.S)
+_ins_del_re = re.compile(r'</?(ins|del).*?>', re.I|re.S)
+
+def cleanup_html(html):
+ """ This 'cleans' the HTML, meaning that any page structure is removed
+    (only the contents of <body> are used, if there is any <body>).
+ Also <ins> and <del> tags are removed. """
+ match = _body_re.search(html)
+ if match:
+ html = html[match.end():]
+ match = _end_body_re.search(html)
+ if match:
+ html = html[:match.start()]
+ html = _ins_del_re.sub('', html)
+ return html
+
+
+end_whitespace_re = re.compile(r'[ \t\n\r]$')
+
+def split_trailing_whitespace(word):
+ """
+ This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
+ """
+ stripped_length = len(word.rstrip())
+ return word[0:stripped_length], word[stripped_length:]
+
+
+def fixup_chunks(chunks):
+ """
+ This function takes a list of chunks and produces a list of tokens.
+ """
+ tag_accum = []
+ cur_word = None
+ result = []
+ for chunk in chunks:
+ if isinstance(chunk, tuple):
+ if chunk[0] == 'img':
+ src = chunk[1]
+ tag, trailing_whitespace = split_trailing_whitespace(chunk[2])
+ cur_word = tag_token('img', src, html_repr=tag,
+ pre_tags=tag_accum,
+ trailing_whitespace=trailing_whitespace)
+ tag_accum = []
+ result.append(cur_word)
+
+ elif chunk[0] == 'href':
+ href = chunk[1]
+ cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ")
+ tag_accum = []
+ result.append(cur_word)
+ continue
+
+ if is_word(chunk):
+ chunk, trailing_whitespace = split_trailing_whitespace(chunk)
+ cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
+ tag_accum = []
+ result.append(cur_word)
+
+ elif is_start_tag(chunk):
+ tag_accum.append(chunk)
+
+ elif is_end_tag(chunk):
+ if tag_accum:
+ tag_accum.append(chunk)
+ else:
+ assert cur_word, (
+ "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
+ % (cur_word, result, chunk, chunks))
+ cur_word.post_tags.append(chunk)
+ else:
+ assert False
+
+ if not result:
+ return [token('', pre_tags=tag_accum)]
+ else:
+ result[-1].post_tags.extend(tag_accum)
+
+ return result
+
+
+# All the tags in HTML that don't require end tags:
+empty_tags = (
+ 'param', 'img', 'area', 'br', 'basefont', 'input',
+ 'base', 'meta', 'link', 'col')
+
+block_level_tags = (
+ 'address',
+ 'blockquote',
+ 'center',
+ 'dir',
+ 'div',
+ 'dl',
+ 'fieldset',
+ 'form',
+ 'h1',
+ 'h2',
+ 'h3',
+ 'h4',
+ 'h5',
+ 'h6',
+ 'hr',
+ 'isindex',
+ 'menu',
+ 'noframes',
+ 'noscript',
+ 'ol',
+ 'p',
+ 'pre',
+ 'table',
+ 'ul',
+ )
+
+block_level_container_tags = (
+ 'dd',
+ 'dt',
+ 'frameset',
+ 'li',
+ 'tbody',
+ 'td',
+ 'tfoot',
+ 'th',
+ 'thead',
+ 'tr',
+ )
+
+
+def flatten_el(el, include_hrefs, skip_tag=False):
+ """ Takes an lxml element el, and generates all the text chunks for
+ that tag. Each start tag is a chunk, each word is a chunk, and each
+ end tag is a chunk.
+
+ If skip_tag is true, then the outermost container tag is
+ not returned (just its contents)."""
+ if not skip_tag:
+ if el.tag == 'img':
+ yield ('img', el.get('src'), start_tag(el))
+ else:
+ yield start_tag(el)
+ if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
+ return
+ start_words = split_words(el.text)
+ for word in start_words:
+ yield html_escape(word)
+ for child in el:
+ for item in flatten_el(child, include_hrefs=include_hrefs):
+ yield item
+ if el.tag == 'a' and el.get('href') and include_hrefs:
+ yield ('href', el.get('href'))
+ if not skip_tag:
+ yield end_tag(el)
+ end_words = split_words(el.tail)
+ for word in end_words:
+ yield html_escape(word)
+
+split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)
+
+def split_words(text):
+ """ Splits some text into words. Includes trailing whitespace
+ on each word when appropriate. """
+ if not text or not text.strip():
+ return []
+
+ words = split_words_re.findall(text)
+ return words
+
+start_whitespace_re = re.compile(r'^[ \t\n\r]')
+
+def start_tag(el):
+ """
+ The text representation of the start tag for a tag.
+ """
+ return '<%s%s>' % (
+ el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))
+ for name, value in el.attrib.items()]))
+
+def end_tag(el):
+ """ The text representation of an end tag for a tag. Includes
+ trailing whitespace when appropriate. """
+ if el.tail and start_whitespace_re.search(el.tail):
+ extra = ' '
+ else:
+ extra = ''
+ return '</%s>%s' % (el.tag, extra)
+
+def is_word(tok):
+ return not tok.startswith('<')
+
+def is_end_tag(tok):
+ return tok.startswith('</')
+
+def is_start_tag(tok):
+ return tok.startswith('<') and not tok.startswith('</')
+
+def fixup_ins_del_tags(html):
+ """ Given an html string, move any <ins> or <del> tags inside of any
+ block-level elements, e.g. transform <ins><p>word</p></ins> to
+ <p><ins>word</ins></p> """
+ doc = parse_html(html, cleanup=False)
+ _fixup_ins_del_tags(doc)
+ html = serialize_html_fragment(doc, skip_outer=True)
+ return html
+
+def serialize_html_fragment(el, skip_outer=False):
+ """ Serialize a single lxml element as HTML. The serialized form
+    includes the element's tail.
+
+ If skip_outer is true, then don't serialize the outermost tag
+ """
+ assert not isinstance(el, basestring), (
+ "You should pass in an element, not a string like %r" % el)
+ html = etree.tostring(el, method="html", encoding=_unicode)
+ if skip_outer:
+ # Get rid of the extra starting tag:
+ html = html[html.find('>')+1:]
+ # Get rid of the extra end tag:
+ html = html[:html.rfind('<')]
+ return html.strip()
+ else:
+ return html
+
+def _fixup_ins_del_tags(doc):
+ """fixup_ins_del_tags that works on an lxml document in-place
+ """
+ for tag in ['ins', 'del']:
+ for el in doc.xpath('descendant-or-self::%s' % tag):
+ if not _contains_block_level_tag(el):
+ continue
+ _move_el_inside_block(el, tag=tag)
+ el.drop_tag()
+ #_merge_element_contents(el)
+
+def _contains_block_level_tag(el):
+ """True if the element contains any block-level elements, like <p>, <td>, etc.
+ """
+ if el.tag in block_level_tags or el.tag in block_level_container_tags:
+ return True
+ for child in el:
+ if _contains_block_level_tag(child):
+ return True
+ return False
+
+def _move_el_inside_block(el, tag):
+ """ helper for _fixup_ins_del_tags; actually takes the <ins> etc tags
+ and moves them inside any block-level tags. """
+ for child in el:
+ if _contains_block_level_tag(child):
+ break
+ else:
+ # No block-level tags in any child
+ children_tag = etree.Element(tag)
+ children_tag.text = el.text
+ el.text = None
+ children_tag.extend(list(el))
+ el[:] = [children_tag]
+ return
+ for child in list(el):
+ if _contains_block_level_tag(child):
+ _move_el_inside_block(child, tag)
+ if child.tail:
+ tail_tag = etree.Element(tag)
+ tail_tag.text = child.tail
+ child.tail = None
+ el.insert(el.index(child)+1, tail_tag)
+ else:
+ child_tag = etree.Element(tag)
+ el.replace(child, child_tag)
+ child_tag.append(child)
+ if el.text:
+ text_tag = etree.Element(tag)
+ text_tag.text = el.text
+ el.text = None
+ el.insert(0, text_tag)
+
+def _merge_element_contents(el):
+ """
+ Removes an element, but merges its contents into its place, e.g.,
+ given <p>Hi <i>there!</i></p>, if you remove the <i> element you get
+ <p>Hi there!</p>
+ """
+ parent = el.getparent()
+ text = el.text or ''
+ if el.tail:
+ if not len(el):
+ text += el.tail
+ else:
+ if el[-1].tail:
+ el[-1].tail += el.tail
+ else:
+ el[-1].tail = el.tail
+ index = parent.index(el)
+ if text:
+ if index == 0:
+ previous = None
+ else:
+ previous = parent[index-1]
+ if previous is None:
+ if parent.text:
+ parent.text += text
+ else:
+ parent.text = text
+ else:
+ if previous.tail:
+ previous.tail += text
+ else:
+ previous.tail = text
+ parent[index:index+1] = el.getchildren()
+
+class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
+ """
+ Acts like SequenceMatcher, but tries not to find very small equal
+ blocks amidst large spans of changes
+ """
+
+ threshold = 2
+
+ def get_matching_blocks(self):
+        size = min(len(self.a), len(self.b))
+ threshold = min(self.threshold, size / 4)
+ actual = difflib.SequenceMatcher.get_matching_blocks(self)
+ return [item for item in actual
+ if item[2] > threshold
+ or not item[2]]
+
+if __name__ == '__main__':
+ from lxml.html import _diffcommand
+ _diffcommand.main()
+
diff --git a/src/lxml/html/formfill.py b/src/lxml/html/formfill.py
new file mode 100644
index 0000000..2499a8e
--- /dev/null
+++ b/src/lxml/html/formfill.py
@@ -0,0 +1,299 @@
+from lxml.etree import XPath, ElementBase
+from lxml.html import fromstring, XHTML_NAMESPACE
+from lxml.html import _forms_xpath, _options_xpath, _nons, _transform_result
+from lxml.html import defs
+import copy
+
+try:
+ basestring
+except NameError:
+ # Python 3
+ basestring = str
+
+__all__ = ['FormNotFound', 'fill_form', 'fill_form_html',
+ 'insert_errors', 'insert_errors_html',
+ 'DefaultErrorCreator']
+
+class FormNotFound(LookupError):
+ """
+ Raised when no form can be found
+ """
+
+_form_name_xpath = XPath('descendant-or-self::form[name=$name]|descendant-or-self::x:form[name=$name]', namespaces={'x':XHTML_NAMESPACE})
+_input_xpath = XPath('|'.join(['descendant-or-self::'+_tag for _tag in ('input','select','textarea','x:input','x:select','x:textarea')]),
+ namespaces={'x':XHTML_NAMESPACE})
+_label_for_xpath = XPath('//label[@for=$for_id]|//x:label[@for=$for_id]',
+ namespaces={'x':XHTML_NAMESPACE})
+_name_xpath = XPath('descendant-or-self::*[@name=$name]')
+
+def fill_form(
+ el,
+ values,
+ form_id=None,
+ form_index=None,
+ ):
+ el = _find_form(el, form_id=form_id, form_index=form_index)
+ _fill_form(el, values)
+
+def fill_form_html(html, values, form_id=None, form_index=None):
+ result_type = type(html)
+ if isinstance(html, basestring):
+ doc = fromstring(html)
+ else:
+ doc = copy.deepcopy(html)
+ fill_form(doc, values, form_id=form_id, form_index=form_index)
+ return _transform_result(result_type, doc)
+
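+# Sketch: filling a form in an HTML string (the field names are made up):
+#
+#   from lxml.html.formfill import fill_form_html
+#   page = ('<form><input type="text" name="user">'
+#           '<textarea name="bio"></textarea></form>')
+#   filled = fill_form_html(page, {'user': 'alice', 'bio': 'Hello!'})
+#   # the input gains value="alice" and the textarea's text becomes 'Hello!'
+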
+def _fill_form(el, values):
+ counts = {}
+ if hasattr(values, 'mixed'):
+ # For Paste request parameters
+ values = values.mixed()
+ inputs = _input_xpath(el)
+ for input in inputs:
+ name = input.get('name')
+ if not name:
+ continue
+ if _takes_multiple(input):
+ value = values.get(name, [])
+ if not isinstance(value, (list, tuple)):
+ value = [value]
+ _fill_multiple(input, value)
+ elif name not in values:
+ continue
+ else:
+ index = counts.get(name, 0)
+ counts[name] = index + 1
+ value = values[name]
+ if isinstance(value, (list, tuple)):
+ try:
+ value = value[index]
+ except IndexError:
+ continue
+ elif index > 0:
+ continue
+ _fill_single(input, value)
+
+def _takes_multiple(input):
+ if _nons(input.tag) == 'select' and input.get('multiple'):
+ # FIXME: multiple="0"?
+ return True
+ type = input.get('type', '').lower()
+ if type in ('radio', 'checkbox'):
+ return True
+ return False
+
+def _fill_multiple(input, value):
+ type = input.get('type', '').lower()
+ if type == 'checkbox':
+ v = input.get('value')
+ if v is None:
+ if not value:
+ result = False
+ else:
+ result = value[0]
+ if isinstance(value, basestring):
+ # The only valid "on" value for an unnamed checkbox is 'on'
+ result = result == 'on'
+ _check(input, result)
+ else:
+ _check(input, v in value)
+ elif type == 'radio':
+ v = input.get('value')
+ _check(input, v in value)
+ else:
+ assert _nons(input.tag) == 'select'
+ for option in _options_xpath(input):
+ v = option.get('value')
+ if v is None:
+ # This seems to be the default, at least on IE
+ # FIXME: but I'm not sure
+ v = option.text_content()
+ _select(option, v in value)
+
+def _check(el, check):
+ if check:
+ el.set('checked', '')
+ else:
+ if 'checked' in el.attrib:
+ del el.attrib['checked']
+
+def _select(el, select):
+ if select:
+ el.set('selected', '')
+ else:
+ if 'selected' in el.attrib:
+ del el.attrib['selected']
+
+def _fill_single(input, value):
+ if _nons(input.tag) == 'textarea':
+ input.text = value
+ else:
+ input.set('value', value)
+
+def _find_form(el, form_id=None, form_index=None):
+ if form_id is None and form_index is None:
+ forms = _forms_xpath(el)
+ for form in forms:
+ return form
+ raise FormNotFound(
+ "No forms in page")
+ if form_id is not None:
+ form = el.get_element_by_id(form_id)
+ if form is not None:
+ return form
+ forms = _form_name_xpath(el, name=form_id)
+ if forms:
+ return forms[0]
+ else:
+ raise FormNotFound(
+ "No form with the name or id of %r (forms: %s)"
+                % (form_id, ', '.join(_find_form_ids(el))))
+ if form_index is not None:
+ forms = _forms_xpath(el)
+ try:
+ return forms[form_index]
+ except IndexError:
+ raise FormNotFound(
+ "There is no form with the index %r (%i forms found)"
+ % (form_index, len(forms)))
+
+def _find_form_ids(el):
+ forms = _forms_xpath(el)
+ if not forms:
+ yield '(no forms)'
+ return
+ for index, form in enumerate(forms):
+ if form.get('id'):
+ if form.get('name'):
+ yield '%s or %s' % (form.get('id'),
+ form.get('name'))
+ else:
+ yield form.get('id')
+ elif form.get('name'):
+ yield form.get('name')
+ else:
+ yield '(unnamed form %s)' % index
+
+############################################################
+## Error filling
+############################################################
+
+class DefaultErrorCreator(object):
+ insert_before = True
+ block_inside = True
+ error_container_tag = 'div'
+ error_message_class = 'error-message'
+ error_block_class = 'error-block'
+ default_message = "Invalid"
+
+ def __init__(self, **kw):
+ for name, value in kw.items():
+ if not hasattr(self, name):
+ raise TypeError(
+ "Unexpected keyword argument: %s" % name)
+ setattr(self, name, value)
+
+ def __call__(self, el, is_block, message):
+ error_el = el.makeelement(self.error_container_tag)
+ if self.error_message_class:
+ error_el.set('class', self.error_message_class)
+ if is_block and self.error_block_class:
+ error_el.set('class', error_el.get('class', '')+' '+self.error_block_class)
+ if message is None or message == '':
+ message = self.default_message
+ if isinstance(message, ElementBase):
+ error_el.append(message)
+ else:
+ assert isinstance(message, basestring), (
+ "Bad message; should be a string or element: %r" % message)
+ error_el.text = message or self.default_message
+ if is_block and self.block_inside:
+ if self.insert_before:
+ error_el.tail = el.text
+ el.text = None
+ el.insert(0, error_el)
+ else:
+ el.append(error_el)
+ else:
+ parent = el.getparent()
+ pos = parent.index(el)
+ if self.insert_before:
+ parent.insert(pos, error_el)
+ else:
+ error_el.tail = el.tail
+ el.tail = None
+ parent.insert(pos+1, error_el)
+
+default_error_creator = DefaultErrorCreator()
+
+
+def insert_errors(
+ el,
+ errors,
+ form_id=None,
+ form_index=None,
+ error_class="error",
+ error_creator=default_error_creator,
+ ):
+ el = _find_form(el, form_id=form_id, form_index=form_index)
+ for name, error in errors.items():
+ if error is None:
+ continue
+ for error_el, message in _find_elements_for_name(el, name, error):
+ assert isinstance(message, (basestring, type(None), ElementBase)), (
+ "Bad message: %r" % message)
+ _insert_error(error_el, message, error_class, error_creator)
+
+def insert_errors_html(html, values, **kw):
+ result_type = type(html)
+ if isinstance(html, basestring):
+ doc = fromstring(html)
+ else:
+ doc = copy.deepcopy(html)
+ insert_errors(doc, values, **kw)
+ return _transform_result(result_type, doc)
+
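+# Sketch: annotating form fields with error messages (names are made up):
+#
+#   from lxml.html.formfill import insert_errors_html
+#   page = '<form><input type="text" name="email"></form>'
+#   annotated = insert_errors_html(page, {'email': 'Not a valid address'})
+#   # the input gains class="error" and a <div class="error-message"> holding
+#   # the message text is inserted just before it
+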
+def _insert_error(el, error, error_class, error_creator):
+ if _nons(el.tag) in defs.empty_tags or _nons(el.tag) == 'textarea':
+ is_block = False
+ else:
+ is_block = True
+ if _nons(el.tag) != 'form' and error_class:
+ _add_class(el, error_class)
+ if el.get('id'):
+ labels = _label_for_xpath(el, for_id=el.get('id'))
+ if labels:
+ for label in labels:
+ _add_class(label, error_class)
+ error_creator(el, is_block, error)
+
+def _add_class(el, class_name):
+ if el.get('class'):
+ el.set('class', el.get('class')+' '+class_name)
+ else:
+ el.set('class', class_name)
+
+def _find_elements_for_name(form, name, error):
+ if name is None:
+ # An error for the entire form
+ yield form, error
+ return
+ if name.startswith('#'):
+ # By id
+ el = form.get_element_by_id(name[1:])
+ if el is not None:
+ yield el, error
+ return
+ els = _name_xpath(form, name=name)
+ if not els:
+ # FIXME: should this raise an exception?
+ return
+ if not isinstance(error, (list, tuple)):
+ yield els[0], error
+ return
+ # FIXME: if error is longer than els, should it raise an error?
+ for el, err in zip(els, error):
+ if err is None:
+ continue
+ yield el, err
diff --git a/src/lxml/html/html5parser.py b/src/lxml/html/html5parser.py
new file mode 100644
index 0000000..2f7be15
--- /dev/null
+++ b/src/lxml/html/html5parser.py
@@ -0,0 +1,260 @@
+"""
+An interface to html5lib that mimics the lxml.html interface.
+"""
+import sys
+import string
+
+from html5lib import HTMLParser as _HTMLParser
+from html5lib.treebuilders.etree_lxml import TreeBuilder
+from lxml import etree
+from lxml.html import Element, XHTML_NAMESPACE, _contains_block_level_tag
+
+# python3 compatibility
+try:
+ _strings = basestring
+except NameError:
+ _strings = (bytes, str)
+try:
+ from urllib2 import urlopen
+except ImportError:
+ from urllib.request import urlopen
+try:
+ from urlparse import urlparse
+except ImportError:
+ from urllib.parse import urlparse
+
+
+class HTMLParser(_HTMLParser):
+ """An html5lib HTML parser with lxml as tree."""
+
+ def __init__(self, strict=False, **kwargs):
+ _HTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
+
+
+try:
+ from html5lib import XHTMLParser as _XHTMLParser
+except ImportError:
+ pass
+else:
+ class XHTMLParser(_XHTMLParser):
+ """An html5lib XHTML Parser with lxml as tree."""
+
+ def __init__(self, strict=False, **kwargs):
+ _XHTMLParser.__init__(self, strict=strict, tree=TreeBuilder, **kwargs)
+
+ xhtml_parser = XHTMLParser()
+
+
+def _find_tag(tree, tag):
+ elem = tree.find(tag)
+ if elem is not None:
+ return elem
+ return tree.find('{%s}%s' % (XHTML_NAMESPACE, tag))
+
+
+def document_fromstring(html, guess_charset=None, parser=None):
+ """
+    Parse a whole document from a string, returning the root element.
+
+ If `guess_charset` is true, or if the input is not Unicode but a
+ byte string, the `chardet` library will perform charset guessing
+ on the string.
+ """
+ if not isinstance(html, _strings):
+ raise TypeError('string required')
+
+ if parser is None:
+ parser = html_parser
+
+ options = {}
+ if guess_charset is None and isinstance(html, bytes):
+        # html5lib does not accept useChardet as an argument when it
+        # detects that the html argument would produce unicode objects.
+ guess_charset = True
+ if guess_charset is not None:
+ options['useChardet'] = guess_charset
+ return parser.parse(html, **options).getroot()
+
+
+def fragments_fromstring(html, no_leading_text=False,
+ guess_charset=None, parser=None):
+ """Parses several HTML elements, returning a list of elements.
+
+ The first item in the list may be a string. If no_leading_text is true,
+ then it will be an error if there is leading text, and it will always be
+ a list of only elements.
+
+ If `guess_charset` is true, the `chardet` library will perform charset
+ guessing on the string.
+ """
+ if not isinstance(html, _strings):
+ raise TypeError('string required')
+
+ if parser is None:
+ parser = html_parser
+
+ options = {}
+ if guess_charset is None and isinstance(html, bytes):
+        # html5lib does not accept useChardet as an argument when it
+        # detects that the html argument would produce unicode objects.
+ guess_charset = False
+ if guess_charset is not None:
+ options['useChardet'] = guess_charset
+ children = parser.parseFragment(html, 'div', **options)
+ if children and isinstance(children[0], _strings):
+ if no_leading_text:
+ if children[0].strip():
+ raise etree.ParserError('There is leading text: %r' %
+ children[0])
+ del children[0]
+ return children
+
+
+def fragment_fromstring(html, create_parent=False,
+ guess_charset=None, parser=None):
+ """Parses a single HTML element; it is an error if there is more than
+ one element, or if anything but whitespace precedes or follows the
+ element.
+
+ If 'create_parent' is true (or is a tag name) then a parent node
+ will be created to encapsulate the HTML in a single element. In
+ this case, leading or trailing text is allowed.
+
+ If `guess_charset` is true, the `chardet` library will perform charset
+ guessing on the string.
+ """
+ if not isinstance(html, _strings):
+ raise TypeError('string required')
+
+ accept_leading_text = bool(create_parent)
+
+ elements = fragments_fromstring(
+ html, guess_charset=guess_charset, parser=parser,
+ no_leading_text=not accept_leading_text)
+
+ if create_parent:
+ if not isinstance(create_parent, _strings):
+ create_parent = 'div'
+ new_root = Element(create_parent)
+ if elements:
+ if isinstance(elements[0], _strings):
+ new_root.text = elements[0]
+ del elements[0]
+ new_root.extend(elements)
+ return new_root
+
+ if not elements:
+ raise etree.ParserError('No elements found')
+ if len(elements) > 1:
+ raise etree.ParserError('Multiple elements found')
+ result = elements[0]
+ if result.tail and result.tail.strip():
+ raise etree.ParserError('Element followed by text: %r' % result.tail)
+ result.tail = None
+ return result
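+
+# Illustrative usage (a sketch):
+#
+#     from lxml.html import html5parser
+#     el = html5parser.fragment_fromstring('<span>one</span>')
+#     div = html5parser.fragment_fromstring('text <b>x</b>', create_parent=True)
+#     # ``el`` is the <span> element; ``div`` is a new <div> wrapper whose .text
+#     # is the leading string and whose child is the <b> element. Without
+#     # create_parent, input with leading text raises a ParserError.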
+
+
+def fromstring(html, guess_charset=None, parser=None):
+ """Parse the html, returning a single element/document.
+
+ This tries to minimally parse the chunk of text, without knowing if it
+ is a fragment or a document.
+
+ If `guess_charset` is true, or if the input is not Unicode but a
+ byte string, the `chardet` library will perform charset guessing
+ on the string.
+ """
+ if not isinstance(html, _strings):
+ raise TypeError('string required')
+ doc = document_fromstring(html, parser=parser,
+ guess_charset=guess_charset)
+
+ # document starts with doctype or <html>, full document!
+ start = html[:50]
+ if isinstance(start, bytes):
+ # Allow text comparison in python3.
+ # Decode as ascii, that also covers latin-1 and utf-8 for the
+ # characters we need.
+ start = start.decode('ascii', 'replace')
+
+ start = start.lstrip().lower()
+ if start.startswith('<html') or start.startswith('<!doctype'):
+ return doc
+
+ head = _find_tag(doc, 'head')
+
+ # if the head is not empty we have a full document
+ if len(head):
+ return doc
+
+ body = _find_tag(doc, 'body')
+
+ # The body has just one element, so it was probably a single
+ # element passed in
+ if (len(body) == 1 and (not body.text or not body.text.strip())
+ and (not body[-1].tail or not body[-1].tail.strip())):
+ return body[0]
+
+ # The body now wraps the tags that make up the content that was passed
+ # in. Reuse the body element as a plain container, but rename it,
+ # since <body> implies too much structure.
+ if _contains_block_level_tag(body):
+ body.tag = 'div'
+ else:
+ body.tag = 'span'
+ return body
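+
+# Illustrative usage (a sketch):
+#
+#     from lxml.html import html5parser
+#     html5parser.fromstring('<!DOCTYPE html><p>x</p>')  # full document -> <html> root
+#     html5parser.fromstring('<p>x</p>')                 # single element -> the <p>
+#     html5parser.fromstring('<p>a</p><p>b</p>')         # several tags -> <div> wrapper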
+
+
+def parse(filename_url_or_file, guess_charset=None, parser=None):
+ """Parse a filename, URL, or file-like object into an HTML document
+ tree. Note: this returns a tree, not an element. Use
+ ``parse(...).getroot()`` to get the document root.
+
+ If ``guess_charset`` is true, the ``useChardet`` option is passed into
+ html5lib to enable character detection. This option is on by default
+ when parsing from URLs, off by default when parsing from file(-like)
+ objects (which tend to return Unicode more often than not), and on by
+ default when parsing from a file path (which is read in binary mode).
+ """
+ if parser is None:
+ parser = html_parser
+ if not isinstance(filename_url_or_file, _strings):
+ fp = filename_url_or_file
+ if guess_charset is None:
+ # assume that file-like objects return Unicode more often than bytes
+ guess_charset = False
+ elif _looks_like_url(filename_url_or_file):
+ fp = urlopen(filename_url_or_file)
+ if guess_charset is None:
+ # assume that URLs return bytes
+ guess_charset = True
+ else:
+ fp = open(filename_url_or_file, 'rb')
+ if guess_charset is None:
+ guess_charset = True
+
+ options = {}
+ # html5lib does not accept the useChardet argument when the input is
+ # already unicode, so only pass it on when charset guessing is enabled.
+ if guess_charset:
+ options['useChardet'] = guess_charset
+ return parser.parse(fp, **options)
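+
+# Illustrative usage (a sketch; the path and URL are placeholders):
+#
+#     from lxml.html import html5parser
+#     tree = html5parser.parse('page.html')              # file path, read as bytes
+#     root = tree.getroot()
+#     # tree = html5parser.parse('http://example.com/')  # URLs are fetched via urlopen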
+
+
+def _looks_like_url(str):
+ scheme = urlparse(str)[0]
+ if not scheme:
+ return False
+ elif (sys.platform == 'win32' and
+ scheme in string.ascii_letters
+ and len(scheme) == 1):
+ # looks like a 'normal' absolute path
+ return False
+ else:
+ return True
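+
+# For example, urlparse('http://example.com/x')[0] is 'http' (a URL), while a
+# Windows path like r'C:\docs\x.html' yields the one-letter scheme 'c' and is
+# therefore treated as a local path on win32.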
+
+
+html_parser = HTMLParser()
diff --git a/src/lxml/html/soupparser.py b/src/lxml/html/soupparser.py
new file mode 100644
index 0000000..e0cf3a0
--- /dev/null
+++ b/src/lxml/html/soupparser.py
@@ -0,0 +1,314 @@
+"""External interface to the BeautifulSoup HTML parser.
+"""
+
+__all__ = ["fromstring", "parse", "convert_tree"]
+
+import re
+from lxml import etree, html
+
+try:
+ from bs4 import (
+ BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString,
+ Declaration, Doctype)
+ _DECLARATION_OR_DOCTYPE = (Declaration, Doctype)
+except ImportError:
+ from BeautifulSoup import (
+ BeautifulSoup, Tag, Comment, ProcessingInstruction, NavigableString,
+ Declaration)
+ _DECLARATION_OR_DOCTYPE = Declaration
+
+
+def fromstring(data, beautifulsoup=None, makeelement=None, **bsargs):
+ """Parse a string of HTML data into an Element tree using the
+ BeautifulSoup parser.
+
+ Returns the root ``<html>`` Element of the tree.
+
+ You can pass a different BeautifulSoup parser through the
+ `beautifulsoup` keyword, and a different Element factory function
+ through the `makeelement` keyword. By default, the standard
+ ``BeautifulSoup`` class and the default factory of `lxml.html` are
+ used.
+ """
+ return _parse(data, beautifulsoup, makeelement, **bsargs)
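+
+# Illustrative usage (a sketch; assumes BeautifulSoup is installed):
+#
+#     from lxml.html import soupparser
+#     root = soupparser.fromstring('<p>Hello<br>world')
+#     # ``root`` is always a single <html> Element, even for fragment input,
+#     # because the converted soup is wrapped in (or reduced to) one html root.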
+
+
+def parse(file, beautifulsoup=None, makeelement=None, **bsargs):
+ """Parse a file into an ElemenTree using the BeautifulSoup parser.
+
+ You can pass a different BeautifulSoup parser through the
+ `beautifulsoup` keyword, and a different Element factory function
+ through the `makeelement` keyword. By default, the standard
+ ``BeautifulSoup`` class and the default factory of `lxml.html` are
+ used.
+ """
+ if not hasattr(file, 'read'):
+ file = open(file)
+ root = _parse(file, beautifulsoup, makeelement, **bsargs)
+ return etree.ElementTree(root)
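+
+# Illustrative usage (a sketch; 'page.html' is a placeholder filename):
+#
+#     from lxml import etree
+#     from lxml.html import soupparser
+#     tree = soupparser.parse('page.html')
+#     print(etree.tostring(tree.getroot()))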
+
+
+def convert_tree(beautiful_soup_tree, makeelement=None):
+ """Convert a BeautifulSoup tree to a list of Element trees.
+
+ Returns a list instead of a single root Element to support
+ HTML-like soup with more than one root element.
+
+ You can pass a different Element factory through the `makeelement`
+ keyword.
+ """
+ root = _convert_tree(beautiful_soup_tree, makeelement)
+ children = root.getchildren()
+ for child in children:
+ root.remove(child)
+ return children
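+
+# Illustrative usage (a sketch; assumes the bs4 package):
+#
+#     from bs4 import BeautifulSoup
+#     from lxml.html import soupparser
+#     soup = BeautifulSoup('<p>one</p><p>two</p>', 'html.parser')
+#     roots = soupparser.convert_tree(soup)
+#     # ``roots`` is a list of lxml Elements, one per top-level tag in the soup.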
+
+
+# helpers
+
+def _parse(source, beautifulsoup, makeelement, **bsargs):
+ if beautifulsoup is None:
+ beautifulsoup = BeautifulSoup
+ if hasattr(beautifulsoup, "HTML_ENTITIES"): # bs3
+ if 'convertEntities' not in bsargs:
+ bsargs['convertEntities'] = 'html'
+ if hasattr(beautifulsoup, "DEFAULT_BUILDER_FEATURES"): # bs4
+ if 'features' not in bsargs:
+ bsargs['features'] = 'html.parser' # use Python html parser
+ tree = beautifulsoup(source, **bsargs)
+ root = _convert_tree(tree, makeelement)
+ # from ET: wrap the document in an html root element, if necessary
+ if len(root) == 1 and root[0].tag == "html":
+ return root[0]
+ root.tag = "html"
+ return root
+
+
+_parse_doctype_declaration = re.compile(
+ r'(?:\s|[<!])*DOCTYPE\s*HTML'
+ r'(?:\s+PUBLIC)?(?:\s+(\'[^\']*\'|"[^"]*"))?'
+ r'(?:\s+(\'[^\']*\'|"[^"]*"))?',
+ re.IGNORECASE).match
+
+
+class _PseudoTag:
+ # Minimal imitation of BeautifulSoup.Tag
+ def __init__(self, contents):
+ self.name = 'html'
+ self.attrs = []
+ self.contents = contents
+
+ def __iter__(self):
+ return self.contents.__iter__()
+
+
+def _convert_tree(beautiful_soup_tree, makeelement):
+ if makeelement is None:
+ makeelement = html.html_parser.makeelement
+
+ # Split the tree into three parts:
+ # i) everything before the root element: document type
+ # declaration, comments, processing instructions, whitespace
+ # ii) the root(s),
+ # iii) everything after the root: comments, processing
+ # instructions, whitespace
+ first_element_idx = last_element_idx = None
+ html_root = declaration = None
+ for i, e in enumerate(beautiful_soup_tree):
+ if isinstance(e, Tag):
+ if first_element_idx is None:
+ first_element_idx = i
+ last_element_idx = i
+ if html_root is None and e.name and e.name.lower() == 'html':
+ html_root = e
+ elif declaration is None and isinstance(e, _DECLARATION_OR_DOCTYPE):
+ declaration = e
+
+ # For a nice, well-formatted document, the variable roots below is
+ # a list consisting of a single <html> element. However, the document
+ # may be a soup like '<meta><head><title>Hello</head><body>Hi
+ # all</p>'. In this example, roots is a list containing the meta, head
+ # and body elements.
+ if first_element_idx is None:
+ pre_root = post_root = []
+ roots = beautiful_soup_tree.contents
+ else:
+ pre_root = beautiful_soup_tree.contents[:first_element_idx]
+ roots = beautiful_soup_tree.contents[first_element_idx:last_element_idx+1]
+ post_root = beautiful_soup_tree.contents[last_element_idx+1:]
+
+ # Reorganize so that there is one <html> root...
+ if html_root is not None:
+ # ... use existing one if possible, ...
+ i = roots.index(html_root)
+ html_root.contents = roots[:i] + html_root.contents + roots[i+1:]
+ else:
+ # ... otherwise create a new one.
+ html_root = _PseudoTag(roots)
+
+ convert_node = _init_node_converters(makeelement)
+
+ # Process pre_root
+ res_root = convert_node(html_root)
+ prev = res_root
+ for e in reversed(pre_root):
+ converted = convert_node(e)
+ if converted is not None:
+ prev.addprevious(converted)
+ prev = converted
+
+ # ditto for post_root
+ prev = res_root
+ for e in post_root:
+ converted = convert_node(e)
+ if converted is not None:
+ prev.addnext(converted)
+ prev = converted
+
+ if declaration is not None:
+ try:
+ # bs4 provides full Doctype string
+ doctype_string = declaration.output_ready()
+ except AttributeError:
+ doctype_string = declaration.string
+
+ match = _parse_doctype_declaration(doctype_string)
+ if not match:
+ # Something is wrong if we end up in here. Since soupparser should
+ # tolerate errors, do not raise Exception, just let it pass.
+ pass
+ else:
+ external_id, sys_uri = match.groups()
+ docinfo = res_root.getroottree().docinfo
+ # strip quotes and update DOCTYPE values (any of None, '', '...')
+ docinfo.public_id = external_id and external_id[1:-1]
+ docinfo.system_url = sys_uri and sys_uri[1:-1]
+
+ return res_root
+
+
+def _init_node_converters(makeelement):
+ converters = {}
+ ordered_node_types = []
+
+ def converter(*types):
+ def add(handler):
+ for t in types:
+ converters[t] = handler
+ ordered_node_types.append(t)
+ return handler
+ return add
+
+ def find_best_converter(node):
+ for t in ordered_node_types:
+ if isinstance(node, t):
+ return converters[t]
+ return None
+
+ def convert_node(bs_node, parent=None):
+ # duplicated in convert_tag() below
+ try:
+ handler = converters[type(bs_node)]
+ except KeyError:
+ handler = converters[type(bs_node)] = find_best_converter(bs_node)
+ if handler is None:
+ return None
+ return handler(bs_node, parent)
+
+ def map_attrs(bs_attrs):
+ if isinstance(bs_attrs, dict): # bs4
+ attribs = {}
+ for k, v in bs_attrs.items():
+ if isinstance(v, list):
+ v = " ".join(v)
+ attribs[k] = unescape(v)
+ else:
+ attribs = dict((k, unescape(v)) for k, v in bs_attrs)
+ return attribs
+
+ def append_text(parent, text):
+ if len(parent) == 0:
+ parent.text = (parent.text or '') + text
+ else:
+ parent[-1].tail = (parent[-1].tail or '') + text
+
+ # converters are tried in order of their definition
+
+ @converter(Tag, _PseudoTag)
+ def convert_tag(bs_node, parent):
+ attrs = bs_node.attrs
+ if parent is not None:
+ attribs = map_attrs(attrs) if attrs else None
+ res = etree.SubElement(parent, bs_node.name, attrib=attribs)
+ else:
+ attribs = map_attrs(attrs) if attrs else {}
+ res = makeelement(bs_node.name, attrib=attribs)
+
+ for child in bs_node:
+ # avoid double recursion by inlining convert_node(), see above
+ try:
+ handler = converters[type(child)]
+ except KeyError:
+ pass
+ else:
+ if handler is not None:
+ handler(child, res)
+ continue
+ convert_node(child, res)
+ return res
+
+ @converter(Comment)
+ def convert_comment(bs_node, parent):
+ res = html.HtmlComment(bs_node)
+ if parent is not None:
+ parent.append(res)
+ return res
+
+ @converter(ProcessingInstruction)
+ def convert_pi(bs_node, parent):
+ if bs_node.endswith('?'):
+ # The PI is of XML style (<?as df?>) but BeautifulSoup
+ # interpreted it as being SGML style (<?as df>). Fix.
+ bs_node = bs_node[:-1]
+ res = etree.ProcessingInstruction(*bs_node.split(' ', 1))
+ if parent is not None:
+ parent.append(res)
+ return res
+
+ @converter(NavigableString)
+ def convert_text(bs_node, parent):
+ if parent is not None:
+ append_text(parent, unescape(bs_node))
+ return None
+
+ return convert_node
+
+
+# copied from ET's ElementSoup
+
+try:
+ from html.entities import name2codepoint # Python 3
+except ImportError:
+ from htmlentitydefs import name2codepoint
+
+
+handle_entities = re.compile(r"&(\w+);").sub
+
+
+try:
+ unichr
+except NameError:
+ # Python 3
+ unichr = chr
+
+
+def unescape(string):
+ if not string:
+ return ''
+ # work around oddities in BeautifulSoup's entity handling
+ def unescape_entity(m):
+ try:
+ return unichr(name2codepoint[m.group(1)])
+ except KeyError:
+ return m.group(0) # use as is
+ return handle_entities(unescape_entity, string)
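+
+# For example, unescape('AT&amp;T &copy; 2010') replaces the known named
+# entities and returns the plain text 'AT&T \xa9 2010'; unknown entities such
+# as '&foo;' are left untouched.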
diff --git a/src/lxml/html/tests/__init__.py b/src/lxml/html/tests/__init__.py
new file mode 100644
index 0000000..792d600
--- /dev/null
+++ b/src/lxml/html/tests/__init__.py
@@ -0,0 +1 @@
+#
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_applet.data b/src/lxml/html/tests/feedparser-data/entry_content_applet.data
new file mode 100644
index 0000000..3382bc0
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_applet.data
@@ -0,0 +1,7 @@
+Description: entry content contains applet
+Expect: not bozo and entries[0]['content'][0]['value'] == u'safe <b>description</b>'
+Options:
+
+<div>safe<applet code="foo.class" codebase="http://example.com/"></applet> <b>description</b></div>
+----------
+<div>safe <b>description</b></div> \ No newline at end of file
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_blink.data b/src/lxml/html/tests/feedparser-data/entry_content_blink.data
new file mode 100644
index 0000000..ba579f9
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_blink.data
@@ -0,0 +1,8 @@
+Description: entry content contains embed
+Expect: not bozo and entries[0]['content'][0]['value'] == u'safe description'
+Options:
+Notes: <div> wrapper
+
+<div><blink>safe</blink> description</div>
+----------
+<div>safe description</div> \ No newline at end of file
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_crazy.data b/src/lxml/html/tests/feedparser-data/entry_content_crazy.data
new file mode 100644
index 0000000..ac7dd48
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_crazy.data
@@ -0,0 +1,84 @@
+Description: entry content is crazy
+Expect: not bozo and entries[0]['content'][0]['value'] == u'Crazy HTML -' + u'- Can Your Regex Parse This?\n\n\n\n<!-' + u'- <script> -' + u'->\n\n<!-' + u'- \n\t<script> \n-' + u'->\n\n\n\nfunction executeMe()\n{\n\n\n\n\n/* \n<h1>Did The Javascript Execute?</h1>\n<div>\nI will execute here, too, if you mouse over me\n</div>'
+Options: -page_structure
+Notes: for some reason the comments in the expected field are acting weird
+
+
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<title>Crazy HTML -- Can Your Regex Parse This?</title>
+
+</head>
+<body notRealAttribute="value"onload="executeMe();"foo="bar"
+
+>
+<!-- <script> -->
+
+<!--
+ <script>
+-->
+
+</script>
+
+
+<script
+
+
+>
+
+function executeMe()
+{
+
+
+
+
+/* <script>
+function am_i_javascript()
+{
+ var str = "Some innocuously commented out stuff";
+}
+< /script>
+*/
+
+
+
+
+
+
+
+
+
+ alert("Executed");
+}
+
+ </script
+
+
+
+>
+<h1>Did The Javascript Execute?</h1>
+<div notRealAttribute="value
+"onmouseover="
+executeMe();
+"foo="bar">
+I will execute here, too, if you mouse over me
+</div>
+
+</body>
+
+</html>
+
+----------
+<html>
+ <head>
+ <title>Crazy HTML -- Can Your Regex Parse This?</title>
+ </head>
+ <body>
+<h1>Did The Javascript Execute?</h1>
+<div>
+I will execute here, too, if you mouse over me
+</div>
+ </body>
+</html> \ No newline at end of file
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_embed.data b/src/lxml/html/tests/feedparser-data/entry_content_embed.data
new file mode 100644
index 0000000..e56849e
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_embed.data
@@ -0,0 +1,8 @@
+Description: entry content contains embed
+Expect: not bozo and entries[0]['content'][0]['value'] == u'safe <b>description</b>'
+Options:
+Notes: <div> wrapper, close <embed> tag (not closing it lost the <b> tag)
+
+<div>safe<embed src="http://example.com/"></embed> <b>description</b></div>
+----------
+<div>safe <b>description</b></div>
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_frame.data b/src/lxml/html/tests/feedparser-data/entry_content_frame.data
new file mode 100644
index 0000000..a4e4afe
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_frame.data
@@ -0,0 +1,7 @@
+Description: entry content contains frameset
+Expect: not bozo and entries[0]['content'][0]['value'] == u'safe <b>description</b>'
+Options:
+
+<div>safe<frameset rows="*"><frame src="http://example.com/"></frameset> <b>description</b></div>
+----------
+<div>safe <b>description</b></div>
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_iframe.data b/src/lxml/html/tests/feedparser-data/entry_content_iframe.data
new file mode 100644
index 0000000..2473cfb
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_iframe.data
@@ -0,0 +1,8 @@
+Description: entry content contains iframe
+Expect: not bozo and entries[0]['content'][0]['value'] == u'safe <b>description</b>'
+Options:
+Notes: div wrapper, close <iframe>
+
+<div>safe<iframe src="http://example.com/"></iframe> <b>description</b></iframe></div>
+----------
+<div>safe <b>description</b></div> \ No newline at end of file
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_link.data b/src/lxml/html/tests/feedparser-data/entry_content_link.data
new file mode 100644
index 0000000..371fc6a
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_link.data
@@ -0,0 +1,7 @@
+Description: entry content contains link
+Expect: not bozo and entries[0]['content'][0]['value'] == u'safe <b>description</b>'
+Options:
+
+<div>safe<link rel="stylesheet" type="text/css" href="http://example.com/evil.css"> <b>description</b></div>
+----------
+<div>safe <b>description</b></div>
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_meta.data b/src/lxml/html/tests/feedparser-data/entry_content_meta.data
new file mode 100644
index 0000000..4385a3c
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_meta.data
@@ -0,0 +1,7 @@
+Description: entry content contains meta
+Expect: not bozo and entries[0]['content'][0]['value'] == u'safe <b>description</b>'
+Options:
+
+<div>safe<meta http-equiv="Refresh" content="0; URL=http://example.com/"> <b>description</b></div>
+----------
+<div>safe <b>description</b></div> \ No newline at end of file
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_object.data b/src/lxml/html/tests/feedparser-data/entry_content_object.data
new file mode 100644
index 0000000..88bc634
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_object.data
@@ -0,0 +1,8 @@
+Description: entry content contains object
+Expect: not bozo and entries[0]['content'][0]['value'] == u'safe <b>description</b>'
+Options:
+Notes: div wrapper, close <object>
+
+<div>safe<object classid="clsid:C932BA85-4374-101B-A56C-00AA003668DC"></object> <b>description</b></div>
+----------
+<div>safe <b>description</b></div> \ No newline at end of file
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_onabort.data b/src/lxml/html/tests/feedparser-data/entry_content_onabort.data
new file mode 100644
index 0000000..51c9b56
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_onabort.data
@@ -0,0 +1,7 @@
+Description: entry content contains onabort
+Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
+Options:
+
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onabort="location.href='http://www.ragingplatypus.com/';" />
+----------
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" /> \ No newline at end of file
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_onblur.data b/src/lxml/html/tests/feedparser-data/entry_content_onblur.data
new file mode 100644
index 0000000..7c797fd
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_onblur.data
@@ -0,0 +1,7 @@
+Description: entry content contains onblur
+Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
+Options:
+
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onblur="location.href='http://www.ragingplatypus.com/';" />
+----------
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" /> \ No newline at end of file
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_onchange.data b/src/lxml/html/tests/feedparser-data/entry_content_onchange.data
new file mode 100644
index 0000000..15582a1
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_onchange.data
@@ -0,0 +1,7 @@
+Description: entry content contains onchange
+Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
+Options:
+
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onchange="location.href='http://www.ragingplatypus.com/';" />
+----------
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" /> \ No newline at end of file
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_onclick.data b/src/lxml/html/tests/feedparser-data/entry_content_onclick.data
new file mode 100644
index 0000000..e5a28eb
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_onclick.data
@@ -0,0 +1,7 @@
+Description: entry content contains onclick
+Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
+Options:
+
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onclick="location.href='http://www.ragingplatypus.com/';" />
+----------
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" /> \ No newline at end of file
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_ondblclick.data b/src/lxml/html/tests/feedparser-data/entry_content_ondblclick.data
new file mode 100644
index 0000000..4d717ba
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_ondblclick.data
@@ -0,0 +1,7 @@
+Description: entry content contains ondblclick
+Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
+Options: javascript
+
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" ondblclick="location.href='http://www.ragingplatypus.com/';" />
+----------
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" /> \ No newline at end of file
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_onerror.data b/src/lxml/html/tests/feedparser-data/entry_content_onerror.data
new file mode 100644
index 0000000..b14d18e
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_onerror.data
@@ -0,0 +1,7 @@
+Description: entry content contains onerror
+Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
+Options:
+
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onerror="location.href='http://www.ragingplatypus.com/';" />
+----------
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" /> \ No newline at end of file
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_onfocus.data b/src/lxml/html/tests/feedparser-data/entry_content_onfocus.data
new file mode 100644
index 0000000..8b6d5cf
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_onfocus.data
@@ -0,0 +1,7 @@
+Description: entry content contains onfocus
+Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
+Options:
+
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onfocus="location.href='http://www.ragingplatypus.com/';" />
+----------
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" /> \ No newline at end of file
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_onkeydown.data b/src/lxml/html/tests/feedparser-data/entry_content_onkeydown.data
new file mode 100644
index 0000000..b4dc065
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_onkeydown.data
@@ -0,0 +1,7 @@
+Description: entry content contains onkeydown
+Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
+Options:
+
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onkeydown="location.href='http://www.ragingplatypus.com/';" />
+----------
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" /> \ No newline at end of file
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_onkeypress.data b/src/lxml/html/tests/feedparser-data/entry_content_onkeypress.data
new file mode 100644
index 0000000..dcdfc01
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_onkeypress.data
@@ -0,0 +1,7 @@
+Description: entry content contains onkeypress
+Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
+Options:
+
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onkeypress="location.href='http://www.ragingplatypus.com/';" />
+----------
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" /> \ No newline at end of file
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_onkeyup.data b/src/lxml/html/tests/feedparser-data/entry_content_onkeyup.data
new file mode 100644
index 0000000..6237293
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_onkeyup.data
@@ -0,0 +1,7 @@
+Description: entry content contains onkeyup
+Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
+Options:
+
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onkeyup="location.href='http://www.ragingplatypus.com/';" />
+----------
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" /> \ No newline at end of file
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_onload.data b/src/lxml/html/tests/feedparser-data/entry_content_onload.data
new file mode 100644
index 0000000..aeb800f
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_onload.data
@@ -0,0 +1,7 @@
+Description: entry content contains onload
+Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
+Options:
+
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onload="location.href='http://www.ragingplatypus.com/';" />
+----------
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" /> \ No newline at end of file
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_onmousedown.data b/src/lxml/html/tests/feedparser-data/entry_content_onmousedown.data
new file mode 100644
index 0000000..51869c0
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_onmousedown.data
@@ -0,0 +1,7 @@
+Description: entry content contains onmousedown
+Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
+Options:
+
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onmousedown="location.href='http://www.ragingplatypus.com/';" />
+----------
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" /> \ No newline at end of file
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_onmouseout.data b/src/lxml/html/tests/feedparser-data/entry_content_onmouseout.data
new file mode 100644
index 0000000..7105283
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_onmouseout.data
@@ -0,0 +1,7 @@
+Description: entry content contains onmouseout
+Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
+Options:
+
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onmouseout="location.href='http://www.ragingplatypus.com/';" />
+----------
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" /> \ No newline at end of file
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_onmouseover.data b/src/lxml/html/tests/feedparser-data/entry_content_onmouseover.data
new file mode 100644
index 0000000..4e77ac7
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_onmouseover.data
@@ -0,0 +1,7 @@
+Description: entry content contains onmouseover
+Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
+Options:
+
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onmouseover="location.href='http://www.ragingplatypus.com/';" />
+----------
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" /> \ No newline at end of file
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_onmouseup.data b/src/lxml/html/tests/feedparser-data/entry_content_onmouseup.data
new file mode 100644
index 0000000..b682bd9
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_onmouseup.data
@@ -0,0 +1,7 @@
+Description: entry content contains onmouseup
+Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
+Options:
+
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onmouseup="location.href='http://www.ragingplatypus.com/';" />
+----------
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" /> \ No newline at end of file
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_onreset.data b/src/lxml/html/tests/feedparser-data/entry_content_onreset.data
new file mode 100644
index 0000000..d1da5fd
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_onreset.data
@@ -0,0 +1,7 @@
+Description: entry content contains onreset
+Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
+Options:
+
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onreset="location.href='http://www.ragingplatypus.com/';" />
+----------
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" /> \ No newline at end of file
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_onresize.data b/src/lxml/html/tests/feedparser-data/entry_content_onresize.data
new file mode 100644
index 0000000..5a838e5
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_onresize.data
@@ -0,0 +1,7 @@
+Description: entry content contains onresize
+Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
+Options:
+
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onresize="location.href='http://www.ragingplatypus.com/';" />
+----------
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" /> \ No newline at end of file
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_onsubmit.data b/src/lxml/html/tests/feedparser-data/entry_content_onsubmit.data
new file mode 100644
index 0000000..4c46d42
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_onsubmit.data
@@ -0,0 +1,7 @@
+Description: entry content contains onsubmit
+Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
+Options:
+
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onsubmit="location.href='http://www.ragingplatypus.com/';" />
+----------
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" /> \ No newline at end of file
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_onunload.data b/src/lxml/html/tests/feedparser-data/entry_content_onunload.data
new file mode 100644
index 0000000..690ed95
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_onunload.data
@@ -0,0 +1,7 @@
+Description: entry content contains onunload
+Expect: not bozo and entries[0]['content'][0]['value'] == u'<img src="http://www.ragingplatypus.com/i/cam-full.jpg" />'
+Options:
+
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" onunload="location.href='http://www.ragingplatypus.com/';" />
+----------
+<img src="http://www.ragingplatypus.com/i/cam-full.jpg" /> \ No newline at end of file
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_script.data b/src/lxml/html/tests/feedparser-data/entry_content_script.data
new file mode 100644
index 0000000..70df649
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_script.data
@@ -0,0 +1,7 @@
+Description: entry content contains script
+Expect: not bozo and entries[0]['content'][0]['value'] == u'safe description'
+Options:
+
+<div>safe<script type="text/javascript">location.href='http:/'+'/example.com/';</script> description</div>
+----------
+<div>safe description</div> \ No newline at end of file
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_script_cdata.data b/src/lxml/html/tests/feedparser-data/entry_content_script_cdata.data
new file mode 100644
index 0000000..f567cca
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_script_cdata.data
@@ -0,0 +1,13 @@
+Description: entry content contains script (cdata)
+Expect: not bozo and entries[0]['content'][0]['value'] == u'safe description'
+Options:
+Notes: div wrapper. Currently not working because of how HTML() is parsing the CDATA (not in a useful way)
+ The resulting code is safe, it just includes crap from the <script> tag (but not the script tag
+ itself).
+Ignore: true
+
+<div>
+ <![CDATA[safe<script type="text/javascript">location.href='http:/'+'/example.com/';</script> description]]>
+</div>
+----------
+<div>safe description</div> \ No newline at end of file
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_script_inline.data b/src/lxml/html/tests/feedparser-data/entry_content_script_inline.data
new file mode 100644
index 0000000..9c7e69a
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_script_inline.data
@@ -0,0 +1,7 @@
+Description: entry content contains script (inline)
+Expect: not bozo and entries[0]['content'][0]['value'] == u'<div>safe description</div>'
+Options:
+
+<div xmlns="http://www.w3.org/1999/xhtml">safe<script type="text/javascript">location.href='http:/'+'/example.com/';</script> description</div>
+----------
+<div>safe description</div> \ No newline at end of file
diff --git a/src/lxml/html/tests/feedparser-data/entry_content_style.data b/src/lxml/html/tests/feedparser-data/entry_content_style.data
new file mode 100644
index 0000000..8ad9523
--- /dev/null
+++ b/src/lxml/html/tests/feedparser-data/entry_content_style.data
@@ -0,0 +1,7 @@
+Description: entry content contains style
+Expect: not bozo and entries[0]['content'][0]['value'] == u'<a href="http://www.ragingplatypus.com/">never trust your upstream platypus</a>'
+Options: style
+
+<a href="http://www.ragingplatypus.com/" style="display:block; position:absolute; left:0; top:0; width:100%; height:100%; z-index:1; background-color:black; background-image:url(http://www.ragingplatypus.com/i/cam-full.jpg); background-x:center; background-y:center; background-repeat:repeat;">never trust your upstream platypus</a>
+----------
+<a href="http://www.ragingplatypus.com/">never trust your upstream platypus</a> \ No newline at end of file
diff --git a/src/lxml/html/tests/hackers-org-data/background-image-plus.data b/src/lxml/html/tests/hackers-org-data/background-image-plus.data
new file mode 100644
index 0000000..c32a135
--- /dev/null
+++ b/src/lxml/html/tests/hackers-org-data/background-image-plus.data
@@ -0,0 +1,8 @@
+Description: I built a quick XSS fuzzer to detect any erroneous characters that are allowed after the open parenthesis but before the JavaScript directive in IE and Netscape 8.1 in secure site mode. These are in decimal but you can include hex and add padding of course. (Any of the following chars can be used: 1-32, 34, 39, 160, 8192-8.13, 12288, 65279)
+ http://ha.ckers.org/xss.html#XSS_DIV_background-image_plus
+Options: -safe_attrs_only
+Notes: As you see, the CSS gets corrupted, but I don't really care that much.
+
+<DIV STYLE="background-image: url(&#1;javascript:alert('XSS'))">text</div>
+----------
+<div style="background-image: url(">text</div>
diff --git a/src/lxml/html/tests/hackers-org-data/background-image-with-unicoded.data b/src/lxml/html/tests/hackers-org-data/background-image-with-unicoded.data
new file mode 100644
index 0000000..3ec45f9
--- /dev/null
+++ b/src/lxml/html/tests/hackers-org-data/background-image-with-unicoded.data
@@ -0,0 +1,10 @@
+Description: exploit (this has been modified slightly to obfuscate the url parameter). The original vulnerability was found by Renaud Lifchitz as a vulnerability in Hotmail.
+ http://ha.ckers.org/xss.html#XSS_DIV_background_image_unicode
+Options: -safe_attrs_only
+Ignore: true
+Notes: I don't understand how this exploit works. It seems like the description actually refers to
+ the unicode you'd import, but why that matters I don't know.
+
+<DIV STYLE="background-image:\0075\0072\006C\0028'\006a\0061\0076\0061\0073\0063\0072\0069\0070\0074\003a\0061\006c\0065\0072\0074\0028.1027\0058.1053\0053\0027\0029'\0029">text</div>
+----------
+<div style="background-image: ">text</div>
diff --git a/src/lxml/html/tests/hackers-org-data/downlevel-hidden.data b/src/lxml/html/tests/hackers-org-data/downlevel-hidden.data
new file mode 100644
index 0000000..fc6e849
--- /dev/null
+++ b/src/lxml/html/tests/hackers-org-data/downlevel-hidden.data
@@ -0,0 +1,9 @@
+Description: Downlevel-Hidden-Hidden block (only works in IE5.0 and later and Netscape 8.1 in IE rendering engine mode). Some websites consider anything inside a comment block to be safe and therefore does not need to be removed, which allows our Cross Site Scripting vector. Or the system could add comment tags around something to attempt to render it harmless. As we can see, that probably wouldn't do the job
+ http://ha.ckers.org/xss.html#XSS_Downlevel-Hidden
+Options: -comments, -processing_instructions
+
+<div><!--[if gte IE 4]>
+<SCRIPT>alert('XSS');</SCRIPT>
+<![endif]--></div>
+----------
+<div></div>
diff --git a/src/lxml/html/tests/hackers-org-data/html-plus-time.data b/src/lxml/html/tests/hackers-org-data/html-plus-time.data
new file mode 100644
index 0000000..77ccfdb
--- /dev/null
+++ b/src/lxml/html/tests/hackers-org-data/html-plus-time.data
@@ -0,0 +1,12 @@
+Description: HTML+TIME in XML. This is how Grey Magic hacked Hotmail and Yahoo!. This only works in Internet Explorer and Netscape 8.1 in IE rendering engine mode and remember that you need to be between HTML and BODY tags for this to work
+ http://ha.ckers.org/xss.html#XSS_HTML_plus_time
+Ignore: true
+Notes: I don't understand the vector here, or how this is supposed to work.
+
+<div>
+<t:set attributeName="innerHTML" to="XSS&lt;SCRIPT DEFER&gt;alert(&quot;XSS&quot;)&lt;/SCRIPT&gt;">
+</BODY></HTML></div>
+----------
+<div>
+<t:set attributeName="innerHTML" to="XSS&lt;SCRIPT DEFER&gt;alert(&quot;XSS&quot;)&lt;/SCRIPT&gt;">
+</BODY></HTML>x</div>
diff --git a/src/lxml/html/tests/hackers-org-data/javascript-link.data b/src/lxml/html/tests/hackers-org-data/javascript-link.data
new file mode 100644
index 0000000..1eefa1f
--- /dev/null
+++ b/src/lxml/html/tests/hackers-org-data/javascript-link.data
@@ -0,0 +1,15 @@
+Description: javascript: in many forms
+
+<div>
+ <a href="java
+script:alert()">x</a>
+ <a href="j a v a s c r i p t:alert()">x</a>
+ <a href="jscript
+:alert()">x</a>
+</div>
+----------
+<div>
+ <a href="">x</a>
+ <a href="">x</a>
+ <a href="">x</a>
+</div>
diff --git a/src/lxml/html/tests/hackers-org-data/style-comment.data b/src/lxml/html/tests/hackers-org-data/style-comment.data
new file mode 100644
index 0000000..f084e68
--- /dev/null
+++ b/src/lxml/html/tests/hackers-org-data/style-comment.data
@@ -0,0 +1,8 @@
+Description: to break up expression (Thanks to Roman Ivanov for this one)
+ http://ha.ckers.org/xss.html#XSS_STYLE_comment
+Options: -safe_attrs_only
+Notes: Because of the suspicious stuff in there, the style is removed entirely
+
+<IMG STYLE="xss:expr/*XSS*/ession(alert('XSS'))">
+----------
+<img>
diff --git a/src/lxml/html/tests/hackers-org-data/style-expression.data b/src/lxml/html/tests/hackers-org-data/style-expression.data
new file mode 100644
index 0000000..610eefe
--- /dev/null
+++ b/src/lxml/html/tests/hackers-org-data/style-expression.data
@@ -0,0 +1,10 @@
+Description: (this is really a hybrid of the above XSS vectors, but it really does show how hard STYLE tags can be to parse apart, like above this can send IE into a loop)
+ http://ha.ckers.org/xss.html#XSS_IMG_STYLE_expression
+Options: -safe_attrs_only
+Notes: Modified to avoid a parsing in libxml2 that ruins the XSS (the " marks).
+ Also there seemed to be an extra "p" in exppression
+
+<div><img style="xss: ex/*<A STYLE='no\xss:noxss(*//*);
+xss:&#101;x&#x2F;*XSS*//*/*/pression(alert('XSS'))"></div>
+----------
+<div><img></div>
diff --git a/src/lxml/html/tests/hackers-org-data/style-import.data b/src/lxml/html/tests/hackers-org-data/style-import.data
new file mode 100644
index 0000000..d1aac0a
--- /dev/null
+++ b/src/lxml/html/tests/hackers-org-data/style-import.data
@@ -0,0 +1,8 @@
+Description: tags with broken up JavaScript for XSS (this XSS at times sends IE into an infinite loop of alerts)
+ http://ha.ckers.org/xss.html#XSS_STYLE
+Options: -safe_attrs_only
+
+<div><STYLE>@im\port'\ja\vasc\ript:alert("XSS")';</STYLE></div>
+----------
+<div><style>/* deleted */</style></div>
+
diff --git a/src/lxml/html/tests/hackers-org-data/style-js-tag.data b/src/lxml/html/tests/hackers-org-data/style-js-tag.data
new file mode 100644
index 0000000..358adc8
--- /dev/null
+++ b/src/lxml/html/tests/hackers-org-data/style-js-tag.data
@@ -0,0 +1,7 @@
+Description: (Older versions of Netscape only)
+ http://ha.ckers.org/xss.html#XSS_STYLE_tag
+Options: -safe_attrs_only
+
+<div><STYLE TYPE="text/javascript">alert('XSS');</STYLE></div>
+----------
+<div></div>
diff --git a/src/lxml/html/tests/hackers-org-data/style-url-js.data b/src/lxml/html/tests/hackers-org-data/style-url-js.data
new file mode 100644
index 0000000..c96aea1
--- /dev/null
+++ b/src/lxml/html/tests/hackers-org-data/style-url-js.data
@@ -0,0 +1,8 @@
+Description: http://ha.ckers.org/xss.html#XSS_STYLE_background-image
+Options: -style, -safe_attrs_only
+Notes: The CSS is messed up here, but so it goes
+
+<div><STYLE>.XSS{background-image:url("javascript:alert('XSS')");}</STYLE><A CLASS=XSS></A></div>
+----------
+<div><style>.XSS{background-image:url("");}</style><a class="XSS"></a></div>
+
diff --git a/src/lxml/html/tests/hackers-org-data/xml-data-island.data b/src/lxml/html/tests/hackers-org-data/xml-data-island.data
new file mode 100644
index 0000000..a17df9e
--- /dev/null
+++ b/src/lxml/html/tests/hackers-org-data/xml-data-island.data
@@ -0,0 +1,10 @@
+Description: XML data island with comment obfuscation (this is another take on the same exploit that doesn't use CDATA fields, but rather uses comments to break up the javascript directive)
+ http://ha.ckers.org/xss.html#XSS_XML_data_island_comment
+Ignore: true
+Notes: I don't understand the vector here. Maybe datasrc should be filtered?
+
+<div><XML ID="xss"><I><B>&lt;IMG SRC="javas<!-- -->cript:alert('XSS')"&gt;</B></I></XML>
+<SPAN DATASRC="#xss" DATAFLD="B" DATAFORMATAS="HTML"></SPAN></div>
+----------
+<div><XML ID="xss"><I><B>&lt;IMG SRC="javas<!-- -->cript:alert('XSS')"&gt;</B></I></XML>
+<SPAN DATASRC="#xss" DATAFLD="B" DATAFORMATAS="HTML"></SPAN>x</div>
diff --git a/src/lxml/html/tests/hackers-org-data/xml-embedded-js.data b/src/lxml/html/tests/hackers-org-data/xml-embedded-js.data
new file mode 100644
index 0000000..8f809db
--- /dev/null
+++ b/src/lxml/html/tests/hackers-org-data/xml-embedded-js.data
@@ -0,0 +1,9 @@
+Description: Locally hosted XML with embedded JavaScript#XSS_Local_XML that is generated using an XML data island. This is the same as above but instead refers to a locally hosted (must be on the same server) XML file that contains your cross site scripting vector. You can see the result here <http://ha.ckers.org/xssxmltest.html>
+ http://ha.ckers.org/xss.html#XSS_Local_XML
+
+<div><XML SRC="xsstest.xml" ID=I></XML>
+<SPAN DATASRC=#I DATAFLD=C DATAFORMATAS=HTML></SPAN></div>
+----------
+<div>
+ <span></span>
+</div>
diff --git a/src/lxml/html/tests/hackers-org-data/xml-namespace.data.BROKEN b/src/lxml/html/tests/hackers-org-data/xml-namespace.data.BROKEN
new file mode 100644
index 0000000..2bc999f
--- /dev/null
+++ b/src/lxml/html/tests/hackers-org-data/xml-namespace.data.BROKEN
@@ -0,0 +1,16 @@
+Description: XML namespace. The htc file must be located on the same server as your XSS vector
+ http://ha.ckers.org/xss.html#XSS_XML_namespace
+Note: I don't completely understand the vector here. page_structure is what does this.
+
+<HTML xmlns:xss>
+ <body>
+ <?import namespace="xss" implementation="http://ha.ckers.org/xss.htc">
+ <xss:xss>XSS</xss:xss>
+ </body>
+</HTML>
+----------
+<HTML>
+ <body>
+ <div>XSS</div>
+ </body>
+</HTML>
diff --git a/src/lxml/html/tests/test_autolink.py b/src/lxml/html/tests/test_autolink.py
new file mode 100644
index 0000000..7a782be
--- /dev/null
+++ b/src/lxml/html/tests/test_autolink.py
@@ -0,0 +1,10 @@
+import unittest
+from lxml.tests.common_imports import make_doctest
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([make_doctest('test_autolink.txt')])
+ return suite
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/lxml/html/tests/test_autolink.txt b/src/lxml/html/tests/test_autolink.txt
new file mode 100644
index 0000000..cafc620
--- /dev/null
+++ b/src/lxml/html/tests/test_autolink.txt
@@ -0,0 +1,79 @@
+This tests autolink::
+
+ >>> from lxml.html import usedoctest
+ >>> from lxml.html.clean import autolink_html
+ >>> print(autolink_html('''
+ ... <div>Link here: http://test.com/foo.html.</div>
+ ... '''))
+ <div>Link here: <a href="http://test.com/foo.html">http://test.com/foo.html</a>.</div>
+ >>> print(autolink_html('''
+ ... <div>Mail me at mailto:ianb@test.com or http://myhome.com</div>
+ ... '''))
+ <div>Mail me at <a href="mailto:ianb@test.com">ianb@test.com</a>
+ or <a href="http://myhome.com">http://myhome.com</a></div>
+ >>> print(autolink_html('''
+ ... <div>The <b>great</b> thing is the http://link.com links <i>and</i>
+ ... the http://foobar.com links.</div>'''))
+ <div>The <b>great</b> thing is the <a href="http://link.com">http://link.com</a> links <i>and</i>
+ the <a href="http://foobar.com">http://foobar.com</a> links.</div>
+ >>> print(autolink_html('''
+ ... <div>Link: &lt;http://foobar.com&gt;</div>'''))
+ <div>Link: &lt;<a href="http://foobar.com">http://foobar.com</a>&gt;</div>
+ >>> print(autolink_html('''
+ ... <div>Link: (http://foobar.com)</div>'''))
+ <div>Link: (<a href="http://foobar.com">http://foobar.com</a>)</div>
+
+Parentheses are tricky; we'll do our best::
+
+ >>> print(autolink_html('''
+ ... <div>(Link: http://en.wikipedia.org/wiki/PC_Tools_(Central_Point_Software))</div>
+ ... '''))
+ <div>(Link: <a href="http://en.wikipedia.org/wiki/PC_Tools_(Central_Point_Software)">http://en.wikipedia.org/wiki/PC_Tools_(Central_Point_Software)</a>)</div>
+ >>> print(autolink_html('''
+ ... <div>... a link: http://foo.com)</div>
+ ... '''))
+ <div>... a link: <a href="http://foo.com">http://foo.com</a>)</div>
+
+Some cases that won't be caught (on purpose)::
+
+ >>> print(autolink_html('''
+ ... <div>A link to http://localhost/foo/bar won't, but a link to
+ ... http://test.com will</div>'''))
+ <div>A link to http://localhost/foo/bar won't, but a link to
+ <a href="http://test.com">http://test.com</a> will</div>
+ >>> print(autolink_html('''
+ ... <div>A link in <textarea>http://test.com</textarea></div>'''))
+ <div>A link in <textarea>http://test.com</textarea></div>
+ >>> print(autolink_html('''
+ ... <div>A link in <a href="http://foo.com">http://bar.com</a></div>'''))
+ <div>A link in <a href="http://foo.com">http://bar.com</a></div>
+ >>> print(autolink_html('''
+ ... <div>A link in <code>http://foo.com</code> or
+ ... <span class="nolink">http://bar.com</span></div>'''))
+ <div>A link in <code>http://foo.com</code> or
+ <span class="nolink">http://bar.com</span></div>
+
+There's also a word wrapping function, that should probably be run
+after autolink::
+
+ >>> from lxml.html.clean import word_break_html
+ >>> def pascii(s):
+ ... print(s.encode('ascii', 'xmlcharrefreplace').decode('ascii'))
+ >>> pascii(word_break_html( u'''
+ ... <div>Hey you
+ ... 12345678901234567890123456789012345678901234567890</div>'''))
+ <div>Hey you
+ 1234567890123456789012345678901234567890&#8203;1234567890</div>
+
+Not everything is broken:
+
+ >>> pascii(word_break_html('''
+ ... <div>Hey you
+ ... <code>12345678901234567890123456789012345678901234567890</code></div>'''))
+ <div>Hey you
+ <code>12345678901234567890123456789012345678901234567890</code></div>
+ >>> pascii(word_break_html('''
+ ... <a href="12345678901234567890123456789012345678901234567890">text</a>'''))
+ <a href="12345678901234567890123456789012345678901234567890">text</a>
+
+
diff --git a/src/lxml/html/tests/test_basic.py b/src/lxml/html/tests/test_basic.py
new file mode 100644
index 0000000..6e35c27
--- /dev/null
+++ b/src/lxml/html/tests/test_basic.py
@@ -0,0 +1,12 @@
+import unittest
+from lxml.tests.common_imports import make_doctest, doctest
+import lxml.html
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([make_doctest('test_basic.txt')])
+ suite.addTests([doctest.DocTestSuite(lxml.html)])
+ return suite
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/lxml/html/tests/test_basic.txt b/src/lxml/html/tests/test_basic.txt
new file mode 100644
index 0000000..1e85c1a
--- /dev/null
+++ b/src/lxml/html/tests/test_basic.txt
@@ -0,0 +1,236 @@
+lxml.html adds a find_class method to elements::
+
+ >>> from lxml.etree import Comment
+ >>> from lxml.html import document_fromstring, fragment_fromstring, tostring
+ >>> from lxml.html import fragments_fromstring, fromstring
+ >>> from lxml.html.clean import clean, clean_html
+ >>> from lxml.html import usedoctest
+ >>> try: unicode = unicode
+ ... except NameError: unicode = str
+
+ >>> h = document_fromstring('''
+ ... <html><head></head>
+ ... <body>
+ ... <a class="vcard
+ ... fn url" href="foobar">P1</a>
+ ... <a class="not-fn vcard" href="baz">P2</a>
+ ... </body></html>''')
+ >>> print(tostring(h, encoding=unicode))
+ <html>
+ <head></head>
+ <body>
+ <a class="vcard
+ fn url" href="foobar">P1</a>
+ <a class="not-fn vcard" href="baz">P2</a>
+ </body>
+ </html>
+ >>> print([e.text for e in h.find_class('fn')])
+ ['P1']
+ >>> print([e.text for e in h.find_class('vcard')])
+ ['P1', 'P2']
+
+as well as the ability to toggle classes using a set-like interface
+
+ >>> el = fragment_fromstring('<span class="foo bar"></span>')
+ >>> 'foo' in el.classes
+ True
+ >>> 'f00' in el.classes
+ False
+ >>> el.classes.update(('qux', 'quux'))
+ >>> sorted(el.get('class').split())
+ ['bar', 'foo', 'quux', 'qux']
+ >>> el.classes.clear()
+ >>> el.get('class')
+ >>> list(el.classes)
+ []
+ >>> el.classes.add('a')
+ >>> el.classes.add('b')
+ >>> el.classes.remove('a')
+ >>> el.classes.remove('c')
+ Traceback (most recent call last):
+ ...
+ KeyError: 'c'
+ >>> el.classes.discard('c')
+ >>> el.get('class')
+ 'b'
+ >>> el.classes.add('b')
+ >>> el.get('class')
+ 'b'
+ >>> el.classes |= ('a', 'b')
+ >>> el.get('class')
+ 'b a'
+ >>> el.classes -= ('b', 'c', 'd')
+ >>> el.get('class')
+ 'a'
+
+with an extra toggle method to switch the state of classes
+
+ >>> el.get('class')
+ 'a'
+ >>> el.classes.toggle('a')
+ False
+ >>> el.get('class')
+ >>> el.classes.toggle('foo')
+ True
+ >>> el.get('class')
+ 'foo'
+ >>> el.classes.toggle('foo')
+ False
+ >>> el.get('class')
+ >>> el.classes.add("foo\n")
+ Traceback (most recent call last):
+ ...
+ ValueError: Invalid class name: 'foo\n'
+ >>> el.classes.remove("foo ")
+ Traceback (most recent call last):
+ ...
+ ValueError: Invalid class name: 'foo '
+
+Also added is a get_rel_links, which you can use to search for links
+like ``<a rel="$something">``::
+
+ >>> h = document_fromstring('''
+ ... <a href="1">test 1</a>
+ ... <a href="2" rel="tag">item 2</a>
+ ... <a href="3" rel="tagging">item 3</a>
+ ... <a href="4" rel="TAG">item 4</a>''')
+ >>> print([e.attrib['href'] for e in h.find_rel_links('tag')])
+ ['2', '4']
+ >>> print([e.attrib['href'] for e in h.find_rel_links('nofollow')])
+ []
+
+Another method is ``get_element_by_id`` that does what it says::
+
+ >>> print(tostring(fragment_fromstring('''
+ ... <div>
+ ... <span id="test">stuff</span>
+ ... </div>''').get_element_by_id('test'), encoding=unicode))
+ <span id="test">stuff</span>
+
+Or to get the content of an element without the tags, use text_content()::
+
+ >>> el = fragment_fromstring('''
+ ... <div>This is <a href="foo">a <b>bold</b> link</a></div>''')
+ >>> el.text_content()
+ 'This is a bold link'
+
+Or drop an element (leaving its content) or the entire tree, like::
+
+ >>> doc = document_fromstring('''
+ ... <html>
+ ... <body>
+ ... <div id="body">
+ ... This is a <a href="foo" id="link">test</a> of stuff.
+ ... </div>
+ ... <!-- a comment -->
+ ... <div>footer</div>
+ ... </body>
+ ... </html>''')
+ >>> doc.get_element_by_id('link').drop_tag()
+ >>> print(tostring(doc, encoding=unicode))
+ <html>
+ <body>
+ <div id="body">
+ This is a test of stuff.
+ </div>
+ <!-- a comment -->
+ <div>footer</div>
+ </body>
+ </html>
+ >>> doc.get_element_by_id('body').drop_tree()
+ >>> print(tostring(doc, encoding=unicode))
+ <html>
+ <body>
+ <!-- a comment -->
+ <div>footer</div>
+ </body>
+ </html>
+
+Note, however, that comment text will not be merged into the tree when you
+drop the comment. Here, ``drop_tag()`` behaves exactly like ``drop_tree()``:
+
+ >>> for comment in doc.getiterator(Comment):
+ ... comment.drop_tag()
+ >>> print(tostring(doc, encoding=unicode))
+ <html>
+ <body>
+ <div>footer</div>
+ </body>
+ </html>
+
+In Python3 it should be possible to parse strings given as bytes objects, at
+least if an encoding is given.
+
+ >>> from lxml.html import HTMLParser
+ >>> enc = 'utf-8'
+ >>> html_parser = HTMLParser(encoding=enc)
+ >>> src = '<html><body>Test</body></html>'.encode(enc)
+
+ >>> doc = fromstring(src, parser=html_parser)
+ >>> print(tostring(doc, encoding=unicode))
+ <html><body>Test</body></html>
+
+ >>> docs = fragments_fromstring(src, parser=html_parser)
+ >>> len(docs)
+ 1
+ >>> print(docs[0])
+ Test
+
+Bug 599318: Calling fromstring with a frameset fragment should not raise an
+error; the whole document is returned.
+
+ >>> import lxml.html
+ >>> content='''
+ ... <frameset>
+ ... <frame src="main.php" name="srcpg">
+ ... </frameset>'''
+ >>> etree_document = lxml.html.fromstring(content)
+ >>> print(tostring(etree_document, encoding=unicode))
+ <html><frameset><frame src="main.php" name="srcpg"></frameset></html>
+
+Bug 599318: Calling fromstring with a div fragment should not raise an error;
+only the element is returned.
+
+ >>> import lxml.html
+ >>> content='<div></div>'
+ >>> etree_document = lxml.html.fromstring(content)
+ >>> print(tostring(etree_document, encoding=unicode))
+ <div></div>
+
+Bug 599318: Calling fromstring with a head fragment should not raise an
+error; the whole document is returned.
+
+ >>> import lxml.html
+ >>> content='<head></head>'
+ >>> etree_document = lxml.html.fromstring(content)
+ >>> print(tostring(etree_document, encoding=unicode))
+ <html><head></head></html>
+
+Bug 690319: Leading whitespace before doctype declaration should not raise an error.
+
+ >>> import lxml.html
+ >>> content='''
+ ... <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+ ... <html>
+ ... </html>'''
+ >>> etree_document = lxml.html.fromstring(content)
+ >>> print(tostring(etree_document, encoding=unicode))
+ <html></html>
+
+Feature https://github.com/lxml/lxml/pull/140: ensure_head_body option:
+
+ >>> from lxml.html import document_fromstring, tostring
+ >>> from functools import partial
+ >>> tos = partial(tostring, encoding=unicode)
+ >>> print(tos(document_fromstring('<p>test</p>')))
+ <html><body><p>test</p></body></html>
+ >>> print(tos(document_fromstring('<p>test</p>', ensure_head_body=True)))
+ <html><head></head><body><p>test</p></body></html>
+ >>> print(tos(document_fromstring('<meta>')))
+ <html><head><meta></head></html>
+ >>> print(tos(document_fromstring('<meta>', ensure_head_body=True)))
+ <html><head><meta></head><body></body></html>
+ >>> print(tos(document_fromstring('<html></html>')))
+ <html></html>
+ >>> print(tos(document_fromstring('<html></html>', ensure_head_body=True)))
+ <html><head></head><body></body></html>
diff --git a/src/lxml/html/tests/test_clean.py b/src/lxml/html/tests/test_clean.py
new file mode 100644
index 0000000..45c2e83
--- /dev/null
+++ b/src/lxml/html/tests/test_clean.py
@@ -0,0 +1,147 @@
+import unittest
+from lxml.tests.common_imports import make_doctest
+
+import lxml.html
+from lxml.html.clean import Cleaner, clean_html
+
+
+class CleanerTest(unittest.TestCase):
+ def test_allow_tags(self):
+ html = """
+ <html>
+ <head>
+ </head>
+ <body>
+ <p>some text</p>
+ <table>
+ <tr>
+ <td>hello</td><td>world</td>
+ </tr>
+ <tr>
+ <td>hello</td><td>world</td>
+ </tr>
+ </table>
+ <img>
+ </body>
+ </html>
+ """
+
+ html_root = lxml.html.document_fromstring(html)
+ cleaner = Cleaner(
+ remove_unknown_tags = False,
+ allow_tags = ['table', 'tr', 'td'])
+ result = cleaner.clean_html(html_root)
+
+ self.assertEqual(12-5+1, len(list(result.iter())))
+
+ def test_allow_and_remove(self):
+ with self.assertRaises(ValueError):
+ Cleaner(allow_tags=['a'], remove_unknown_tags=True)
+
+ def test_remove_unknown_tags(self):
+ html = """<div><bun>lettuce, tomato, veggie patty</bun></div>"""
+ clean_html = """<div>lettuce, tomato, veggie patty</div>"""
+ cleaner = Cleaner(remove_unknown_tags=True)
+ result = cleaner.clean_html(html)
+ self.assertEqual(
+ result,
+ clean_html,
+ msg="Unknown tags not removed. Got: %s" % result,
+ )
+
+ def test_safe_attrs_included(self):
+ html = """<p><span style="color: #00ffff;">Cyan</span></p>"""
+
+ safe_attrs=set(lxml.html.defs.safe_attrs)
+ safe_attrs.add('style')
+
+ cleaner = Cleaner(
+ safe_attrs_only=True,
+ safe_attrs=safe_attrs)
+ result = cleaner.clean_html(html)
+
+ self.assertEqual(html, result)
+
+ def test_safe_attrs_excluded(self):
+ html = """<p><span style="color: #00ffff;">Cyan</span></p>"""
+ expected = """<p><span>Cyan</span></p>"""
+
+ safe_attrs=set()
+
+ cleaner = Cleaner(
+ safe_attrs_only=True,
+ safe_attrs=safe_attrs)
+ result = cleaner.clean_html(html)
+
+ self.assertEqual(expected, result)
+
+ def test_clean_invalid_root_tag(self):
+ # only testing that cleaning with invalid root tags works at all
+ s = lxml.html.fromstring('parent <invalid tag>child</another>')
+ self.assertEqual('parent child', clean_html(s).text_content())
+
+ s = lxml.html.fromstring('<invalid tag>child</another>')
+ self.assertEqual('child', clean_html(s).text_content())
+
+ def test_clean_with_comments(self):
+ html = """<p><span style="color: #00ffff;">Cy<!-- xx -->an</span><!-- XXX --></p>"""
+ s = lxml.html.fragment_fromstring(html)
+
+ self.assertEqual(
+ b'<p><span>Cyan</span></p>',
+ lxml.html.tostring(clean_html(s)))
+ self.assertEqual(
+ '<p><span>Cyan</span></p>',
+ clean_html(html))
+
+ cleaner = Cleaner(comments=False)
+ result = cleaner.clean_html(s)
+ self.assertEqual(
+ b'<p><span>Cy<!-- xx -->an</span><!-- XXX --></p>',
+ lxml.html.tostring(result))
+ self.assertEqual(
+ '<p><span>Cy<!-- xx -->an</span><!-- XXX --></p>',
+ cleaner.clean_html(html))
+
+ def test_sneaky_noscript_in_style(self):
+ # This gets parsed as <noscript> -> <style>"...</noscript>..."</style>
+ # thus passing the </noscript> through into the output.
+ html = '<noscript><style><a title="</noscript><img src=x onerror=alert(1)>">'
+ s = lxml.html.fragment_fromstring(html)
+
+ self.assertEqual(
+ b'<noscript><style>/* deleted */</style></noscript>',
+ lxml.html.tostring(clean_html(s)))
+
+ def test_sneaky_js_in_math_style(self):
+ # This gets parsed as <math> -> <style>"..."</style>
+ # thus passing any tag/script/whatever content through into the output.
+ html = '<math><style><img src=x onerror=alert(1)></style></math>'
+ s = lxml.html.fragment_fromstring(html)
+
+ self.assertEqual(
+ b'<math><style>/* deleted */</style></math>',
+ lxml.html.tostring(clean_html(s)))
+
+ def test_formaction_attribute_in_button_input(self):
+ # The formaction attribute overrides the form's action and should be
+ # treated as a malicious link attribute
+ html = ('<form id="test"><input type="submit" formaction="javascript:alert(1)"></form>'
+ '<button form="test" formaction="javascript:alert(1)">X</button>')
+ expected = ('<div><form id="test"><input type="submit" formaction=""></form>'
+ '<button form="test" formaction="">X</button></div>')
+ cleaner = Cleaner(
+ forms=False,
+ safe_attrs_only=False,
+ )
+ self.assertEqual(
+ expected,
+ cleaner.clean_html(html))
+
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([make_doctest('test_clean.txt')])
+ suite.addTests([make_doctest('test_clean_embed.txt')])
+ suite.addTests(unittest.makeSuite(CleanerTest))
+ return suite
diff --git a/src/lxml/html/tests/test_clean.txt b/src/lxml/html/tests/test_clean.txt
new file mode 100644
index 0000000..18e6c7e
--- /dev/null
+++ b/src/lxml/html/tests/test_clean.txt
@@ -0,0 +1,221 @@
+>>> import re
+>>> from lxml.html import fromstring, tostring
+>>> from lxml.html.clean import clean, clean_html, Cleaner
+>>> from lxml.html import usedoctest
+
+>>> doc = '''<html>
+... <head>
+... <script type="text/javascript" src="evil-site"></script>
+... <link rel="alternate" type="text/rss" src="evil-rss">
+... <link rel="alternate" type="text/rss" href="http://example.com">
+... <link rel="stylesheet" type="text/rss" href="http://example.com">
+... <style>
+... body {background-image: url(javascript:do_evil)};
+... div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
+... div {color: expression(evil)};
+... </style>
+... </head>
+... <body onload="evil_function()">
+... <!-- I am interpreted for EVIL! -->
+... <a href="javascript:evil_function()">a link</a>
+... <a href="j\x01a\x02v\x03a\x04s\x05c\x06r\x07i\x0Ep t%20:evil_function()">a control char link</a>
+... <a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
+... <a href="#" onclick="evil_function()">another link</a>
+... <p onclick="evil_function()">a paragraph</p>
+... <div style="display: none">secret EVIL!</div>
+... <object> of EVIL! </object>
+... <iframe src="evil-site"></iframe>
+... <form action="evil-site">
+... Password: <input type="password" name="password">
+... </form>
+... <a href="evil-site">spam spam SPAM!</a>
+... <a href="http://example.com" rel="author">Author</a>
+... <a href="http://example.com" rel="nofollow">Text</a>
+... <img src="evil!">
+... </body>
+... </html>'''
+
+>>> print(re.sub('[\x00-\x07\x0E]', '', doc))
+<html>
+ <head>
+ <script type="text/javascript" src="evil-site"></script>
+ <link rel="alternate" type="text/rss" src="evil-rss">
+ <link rel="alternate" type="text/rss" href="http://example.com">
+ <link rel="stylesheet" type="text/rss" href="http://example.com">
+ <style>
+ body {background-image: url(javascript:do_evil)};
+ div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
+ div {color: expression(evil)};
+ </style>
+ </head>
+ <body onload="evil_function()">
+ <!-- I am interpreted for EVIL! -->
+ <a href="javascript:evil_function()">a link</a>
+ <a href="javascrip t%20:evil_function()">a control char link</a>
+ <a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
+ <a href="#" onclick="evil_function()">another link</a>
+ <p onclick="evil_function()">a paragraph</p>
+ <div style="display: none">secret EVIL!</div>
+ <object> of EVIL! </object>
+ <iframe src="evil-site"></iframe>
+ <form action="evil-site">
+ Password: <input type="password" name="password">
+ </form>
+ <a href="evil-site">spam spam SPAM!</a>
+ <a href="http://example.com" rel="author">Author</a>
+ <a href="http://example.com" rel="nofollow">Text</a>
+ <img src="evil!">
+ </body>
+</html>
+
+>>> print(tostring(fromstring(doc)).decode("utf-8"))
+<html>
+ <head>
+ <script type="text/javascript" src="evil-site"></script>
+ <link rel="alternate" type="text/rss" src="evil-rss">
+ <link rel="alternate" type="text/rss" href="http://example.com">
+ <link rel="stylesheet" type="text/rss" href="http://example.com">
+ <style>
+ body {background-image: url(javascript:do_evil)};
+ div {background-image: url(data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==)};
+ div {color: expression(evil)};
+ </style>
+ </head>
+ <body onload="evil_function()">
+ <!-- I am interpreted for EVIL! -->
+ <a href="javascript:evil_function()">a link</a>
+ <a href="javascrip%20t%20:evil_function()">a control char link</a>
+ <a href="data:text/html;base64,PHNjcmlwdD5hbGVydCgidGVzdCIpOzwvc2NyaXB0Pg==">data</a>
+ <a href="#" onclick="evil_function()">another link</a>
+ <p onclick="evil_function()">a paragraph</p>
+ <div style="display: none">secret EVIL!</div>
+ <object> of EVIL! </object>
+ <iframe src="evil-site"></iframe>
+ <form action="evil-site">
+ Password: <input type="password" name="password">
+ </form>
+ <a href="evil-site">spam spam SPAM!</a>
+ <a href="http://example.com" rel="author">Author</a>
+ <a href="http://example.com" rel="nofollow">Text</a>
+ <img src="evil!">
+ </body>
+</html>
+
+>>> print(Cleaner(page_structure=False, comments=False).clean_html(doc))
+<html>
+ <head>
+ <style>
+ body {background-image: url()};
+ div {background-image: url()};
+ div {color: };
+ </style>
+ </head>
+ <body>
+ <!-- I am interpreted for EVIL! -->
+ <a href="">a link</a>
+ <a href="">a control char link</a>
+ <a href="">data</a>
+ <a href="#">another link</a>
+ <p>a paragraph</p>
+ <div>secret EVIL!</div>
+ of EVIL!
+ Password:
+ <a href="evil-site">spam spam SPAM!</a>
+ <a href="http://example.com" rel="author">Author</a>
+ <a href="http://example.com" rel="nofollow">Text</a>
+ <img src="evil!">
+ </body>
+</html>
+
+>>> print(Cleaner(page_structure=False, safe_attrs_only=False).clean_html(doc))
+<html>
+ <head>
+ <style>
+ body {background-image: url()};
+ div {background-image: url()};
+ div {color: };
+ </style>
+ </head>
+ <body>
+ <a href="">a link</a>
+ <a href="">a control char link</a>
+ <a href="">data</a>
+ <a href="#">another link</a>
+ <p>a paragraph</p>
+ <div style="display: none">secret EVIL!</div>
+ of EVIL!
+ Password:
+ <a href="evil-site">spam spam SPAM!</a>
+ <a href="http://example.com" rel="author">Author</a>
+ <a href="http://example.com" rel="nofollow">Text</a>
+ <img src="evil!">
+ </body>
+</html>
+
+>>> print(Cleaner(style=True, inline_style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False).clean_html(doc))
+<html>
+ <head>
+ </head>
+ <body>
+ <a href="">a link</a>
+ <a href="">a control char link</a>
+ <a href="">data</a>
+ <a href="#">another link</a>
+ <p>a paragraph</p>
+ <div>secret EVIL!</div>
+ of EVIL!
+ Password:
+ <a href="evil-site" rel="nofollow">spam spam SPAM!</a>
+ <a href="http://example.com" rel="author nofollow">Author</a>
+ <a href="http://example.com" rel="nofollow">Text</a>
+ <img src="evil!">
+ </body>
+</html>
+
+>>> print(Cleaner(style=True, inline_style=False, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False).clean_html(doc))
+<html>
+ <head>
+ </head>
+ <body>
+ <a href="">a link</a>
+ <a href="">a control char link</a>
+ <a href="">data</a>
+ <a href="#">another link</a>
+ <p>a paragraph</p>
+ <div style="display: none">secret EVIL!</div>
+ of EVIL!
+ Password:
+ <a href="evil-site" rel="nofollow">spam spam SPAM!</a>
+ <a href="http://example.com" rel="author nofollow">Author</a>
+ <a href="http://example.com" rel="nofollow">Text</a>
+ <img src="evil!">
+ </body>
+</html>
+
+>>> print(Cleaner(links=False, page_structure=False, javascript=True, host_whitelist=['example.com'], whitelist_tags=None).clean_html(doc))
+<html>
+ <head>
+ <link rel="alternate" type="text/rss" src="evil-rss">
+ <link rel="alternate" type="text/rss" href="http://example.com">
+ <link rel="stylesheet" type="text/rss" href="http://example.com">
+ <style>
+ body {background-image: url()};
+ div {background-image: url()};
+ div {color: };
+ </style>
+ </head>
+ <body>
+ <a href="">a link</a>
+ <a href="">a control char link</a>
+ <a href="">data</a>
+ <a href="#">another link</a>
+ <p>a paragraph</p>
+ <div>secret EVIL!</div>
+ of EVIL!
+ Password:
+ <a href="evil-site">spam spam SPAM!</a>
+ <a href="http://example.com" rel="author">Author</a>
+ <a href="http://example.com" rel="nofollow">Text</a>
+ <img src="evil!">
+ </body>
+</html>
diff --git a/src/lxml/html/tests/test_clean_embed.txt b/src/lxml/html/tests/test_clean_embed.txt
new file mode 100644
index 0000000..59a4055
--- /dev/null
+++ b/src/lxml/html/tests/test_clean_embed.txt
@@ -0,0 +1,39 @@
+THIS FAILS IN libxml2 2.6.29 AND 2.6.30 !!
+
+
+>>> from lxml.html import fromstring, tostring
+>>> from lxml.html.clean import clean, clean_html, Cleaner
+>>> from lxml.html import usedoctest
+
+>>> def tostring(el): # work-around for Py3 'bytes' type
+... from lxml.html import tostring
+... s = tostring(el)
+... if not isinstance(s, str):
+... s = s.decode('UTF-8')
+... return s
+
+>>> doc_embed = '''<div>
+... <embed src="http://www.youtube.com/v/183tVH1CZpA" type="application/x-shockwave-flash"></embed>
+... <embed src="http://anothersite.com/v/another"></embed>
+... <script src="http://www.youtube.com/example.js"></script>
+... <script src="/something-else.js"></script>
+... </div>'''
+>>> print(tostring(fromstring(doc_embed)))
+<div>
+<embed src="http://www.youtube.com/v/183tVH1CZpA" type="application/x-shockwave-flash"></embed>
+<embed src="http://anothersite.com/v/another"></embed>
+<script src="http://www.youtube.com/example.js"></script>
+<script src="/something-else.js"></script>
+</div>
+>>> print(Cleaner().clean_html(doc_embed))
+<div>
+</div>
+>>> print(Cleaner(host_whitelist=['www.youtube.com']).clean_html(doc_embed))
+<div>
+<embed src="http://www.youtube.com/v/183tVH1CZpA" type="application/x-shockwave-flash"></embed>
+</div>
+>>> print(Cleaner(host_whitelist=['www.youtube.com'], whitelist_tags=None).clean_html(doc_embed))
+<div>
+<embed src="http://www.youtube.com/v/183tVH1CZpA" type="application/x-shockwave-flash"></embed>
+<script src="http://www.youtube.com/example.js"></script>
+</div>
diff --git a/src/lxml/html/tests/test_diff.py b/src/lxml/html/tests/test_diff.py
new file mode 100644
index 0000000..c1adbd6
--- /dev/null
+++ b/src/lxml/html/tests/test_diff.py
@@ -0,0 +1,13 @@
+import unittest
+from lxml.tests.common_imports import make_doctest, doctest
+
+from lxml.html import diff
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([make_doctest('test_diff.txt'),
+ doctest.DocTestSuite(diff)])
+ return suite
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/lxml/html/tests/test_diff.txt b/src/lxml/html/tests/test_diff.txt
new file mode 100644
index 0000000..9057a2b
--- /dev/null
+++ b/src/lxml/html/tests/test_diff.txt
@@ -0,0 +1,252 @@
+lxml.html.diff does HTML comparisons. These are word-based comparisons.
+
+First, a handy function for normalizing whitespace and doing word wrapping::
+
+ >>> import re, textwrap
+ >>> def pwrapped(text):
+ ... text = re.sub(r'[ \n\t\r]+', ' ', text)
+ ... text = textwrap.fill(text)
+ ... print(text)
+ >>> def pdiff(text1, text2):
+ ... pwrapped(htmldiff(text1, text2))
+
+Example::
+
+ >>> from lxml.html.diff import htmldiff, html_annotate
+ >>> html1 = '<p>This is some test text with some changes and some same stuff</p>'
+ >>> html2 = '''<p>This is some test textual writing with some changed stuff
+ ... and some same stuff</p>'''
+ >>> pdiff(html1, html2)
+ <p>This is some test <ins>textual writing with some changed stuff
+ </ins> <del>text with some changes</del> and some same stuff</p>
+
+Style tags are largely ignored in terms of differences, though markup is not eliminated::
+
+ >>> html1 = '<p>Hi <i>you guys</i></p>'
+ >>> html2 = '<p>Hi <i>you</i> guys</p>'
+ >>> pdiff(html1, html2)
+ <p>Hi <i>you</i> guys</p>
+ >>> pdiff('text', '<p>text</p>')
+ <p>text</p>
+ >>> pdiff('<i>Hi guys</i> !!', '<i>Hi guy</i> !!')
+ <i>Hi <ins>guy</ins> <del>guys</del> </i> !!
+ >>> pdiff('H<i>i</i>', 'Hi')
+ <ins>Hi</ins> <del>H<i>i</i></del>
+ >>> pdiff('<i>A B</i> C', '<i>A</i> C')
+ <i>A <del>B</del> </i> C
+ >>> pdiff('<i>A B</i> C', '<i>B</i> C')
+ <i> <del>A</del> B</i> C
+ >>> pdiff('<p></p>', '<p></p>')
+ <p></p>
+ >>> pdiff('<p>Hi</p>', '<p>Bye</p>')
+ <p><ins>Bye</ins></p> <p><del>Hi</del></p>
+ >>> pdiff('<p>Hi Guy</p>', '<p>Bye Guy</p>')
+ <p> <ins>Bye</ins> <del>Hi</del> Guy</p>
+ >>> pdiff('<p>Hey there</p>', '')
+ <ins></ins> <p><del>Hey there</del></p>
+
+Movement between paragraphs is ignored, as tag-based changes are generally ignored::
+ >>>
+ >>> pdiff('<p>Hello</p><p>World</p>', '<p>Hello World</p>')
+ <p>Hello World</p>
+
+As a special case, changing the href of a link is displayed, and
+images are treated like words:
+
+ >>> pdiff('<a href="http://yahoo.com">search</a>', '<a href="http://google.com">search</a>')
+ <a href="http://google.com">search <ins> Link: http://google.com</ins>
+ <del> Link: http://yahoo.com</del> </a>
+ >>> pdiff('<p>Print this <img src="print.gif"></p>', '<p>Print this</p>')
+ <p>Print this <del><img src="print.gif"></del> </p>
+ >>> pdiff('<a href="http://yahoo.com">search</a>', '<a href="http://yahoo.com">search</a>')
+ <a href="http://yahoo.com">search</a>
+
+Images may sometimes not have 'src' attributes:
+
+ >>> pdiff('<img src="tease"> <img> test <img src="test">', '<img> test <img src="toast">')
+ <del><img src="tease"></del> <img> test <ins><img src="toast"></ins>
+ <del><img src="test"></del>
+
+A test of empty elements:
+
+ >>> pdiff('some <br> text', 'some <br> test')
+ some <ins><br> test</ins> <del><br> text</del>
+
+Whitespace differences are generally ignored when computing the diff, but the whitespace itself is preserved in the output:
+
+ >>> print(htmldiff('<p> first\nsecond\nthird</p>', '<p> &#xA0; first\n second\nthird </p>'))
+ <p>first
+ second
+ third </p>
+ >>> print(htmldiff('<pre>first\nsecond\nthird</pre>', '<pre>first\nsecond\nthird</pre>'))
+ <pre>first
+ second
+ third</pre>
+ >>> print(htmldiff('<pre>first\nsecond</pre>', '<pre>first\nsecond\n third</pre>'))
+ <pre>first
+ second
+ <ins>third</ins> </pre>
+
+The sixteen combinations::
+
+First "insert start" (del start/middle/end/none):
+
+ >>> pdiff('<b>A B C</b>', '<b>D B C</b>')
+ <b> <ins>D</ins> <del>A</del> B C</b>
+ >>> pdiff('<b>A B C</b>', '<b>D A C</b>')
+ <b> <ins>D</ins> A <del>B</del> C</b>
+ >>> pdiff('<b>A B C</b>', '<b>D A B</b>')
+ <b> <ins>D</ins> A B <del>C</del> </b>
+ >>> pdiff('<b>A B C</b>', '<b>D A B C</b>')
+ <b> <ins>D</ins> A B C</b>
+
+Next, "insert middle" (del start/middle/end/none):
+
+ >>> pdiff('<b>A B C</b>', '<b>D B C</b>')
+ <b> <ins>D</ins> <del>A</del> B C</b>
+ >>> pdiff('<b>A B C</b>', '<b>A D C</b>')
+ <b>A <ins>D</ins> <del>B</del> C</b>
+ >>> pdiff('<b>A B C</b>', '<b>A D B</b>')
+ <b>A <ins>D</ins> B <del>C</del> </b>
+
+This one case hits the threshold of our insensitive matching:
+
+ >>> pdiff('<b>A B C</b>', '<b>A D B C</b>')
+ <b> <ins>A D</ins> <del>A</del> B C</b>
+
+
+Then "insert end" (del start/middle/end/none):
+
+ >>> pdiff('<b>A B C</b>', '<b>B C D</b>')
+ <b> <del>A</del> B C <ins>D</ins> </b>
+ >>> pdiff('<b>A B C</b>', '<b>A C D</b>')
+ <b>A <del>B</del> C <ins>D</ins> </b>
+ >>> pdiff('<b>A B C</b>', '<b>A B D</b>')
+ <b>A B <ins>D</ins> <del>C</del> </b>
+ >>> pdiff('<b>A B C</b>', '<b>A B C D</b>')
+ <b>A B C <ins>D</ins> </b>
+
+Then no insert (del start/middle/end):
+
+ >>> pdiff('<b>A B C</b>', '<b>B C</b>')
+ <b> <del>A</del> B C</b>
+ >>> pdiff('<b>A B C</b>', '<b>A C</b>')
+ <b>A <del>B</del> C</b>
+ >>> pdiff('<b>A B C</b>', '<b>A B</b>')
+ <b>A B <del>C</del> </b>
+
+ >>> pdiff('<b>A B</b> C', '<b>A B</b>')
+ <b>A B</b> <del>C</del>
+ >>> pdiff('<b>A B</b> <b>C</b>', '<b>A B</b>')
+ <b>A B</b> <del><b>C</b></del>
+ >>> pdiff('A <p><b>hey there</b> <i>how are you?</i></p>', 'A')
+ A <p><del><b>hey there</b> <i>how are you?</i></del></p>
+
+Testing a larger document, to make sure that no weird, unnecessary
+parallels are found:
+
+ >>> pdiff('''
+ ... <p>This is a test document with many words in it that goes on
+ ... for a while and doesn't have anything do to with the next
+ ... document that we match this against</p>''', '''
+ ... <p>This is another document with few similarities to the preceding
+ ... one, but enough that it may have overlap that could turn into
+ ... a confusing series of deletes and inserts.
+ ... </p>''')
+ <p><ins>This is another document with few similarities to the
+ preceding one, but enough that it may have overlap that could turn
+ into a confusing series of deletes and inserts. </ins></p>
+ <p><del>This is a test document with many words in it that goes on for
+ a while and doesn't have anything do to with the next document that we
+ match this against</del></p>
+
+
+
+Annotation of content can also be done, where every bit of content is
+marked up with information about where it came from.
+
+First, some setup; note that html_annotate is called with a sequence
+of documents and the annotation associated with that document. We'll
+just use indexes, but you could use author or timestamp information.
+
+ >>> def markup(text, annotation):
+ ... return '<span version="%s">%s</span>' % (annotation, text)
+ >>> def panno(*docs):
+ ... pwrapped(html_annotate([(doc, index) for index, doc in enumerate(docs)],
+ ... markup=markup))
+
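+The index is only used as the ``version`` value here; as noted above, the
+annotation paired with each document could instead be an author name or a
+timestamp. As a non-executed sketch (no ``>>>`` prompts, so the doctest runner
+skips it), a markup function keyed on an assumed author annotation might look
+like::
+
+    def markup_by_author(text, author):
+        # 'author' is whatever annotation was paired with the document,
+        # in place of the integer index used in the setup above.
+        return '<span class="author" title="%s">%s</span>' % (author, text)
+
+    # html_annotate([(doc_v1, 'alice'), (doc_v2, 'bob')], markup=markup_by_author)
+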
+Now, a sequence of documents:
+
+ >>> panno('Hello cruel world', 'Hi cruel world', 'Hi world')
+ <span version="1">Hi</span> <span version="0">world</span>
+ >>> panno('A similar document', 'A similar document',
+ ... 'A similar document here')
+ <span version="0">A similar document</span> <span
+ version="2">here</span>
+ >>> panno('<p>P1 para</p><p>P2 para</p>', '<p>P1 para</p><p>P3 foo</p>')
+ <p><span version="0">P1 para</span></p><p><span version="1">P3
+ foo</span></p>
+ >>> panno('Hello<p>There World</p>','Hello<p>There Town</p>')
+ <span version="0">Hello</span><p><span version="0">There</span> <span
+ version="1">Town</span></p>
+ >>> panno('<p>Hello</p>There World','<p>Hello</p>There Town')
+ <p><span version="0">Hello</span></p><span version="0">There</span>
+ <span version="1">Town</span>
+ >>> panno('<p>Hello</p><p>There World</p>','<p>Hello</p><p>There Town</p>')
+ <p><span version="0">Hello</span></p><p><span version="0">There</span>
+ <span version="1">Town</span></p>
+ >>> panno('<p>Hi <img src="/foo"> You</p>',
+ ... '<p>Hi You</p>',
+ ... '<p>Hi You <img src="/bar"></p>')
+ <p><span version="0">Hi You</span> <span version="2"><img
+ src="/bar"></span></p>
+ >>> panno('<p><a href="/foo">Hey</a></p>',
+ ... '<p><a href="/bar">Hey</a></p>')
+ <p><a href="/bar"><span version="0">Hey</span></a></p>
+ >>> panno('<p><a href="/foo">Hey You</a></p>',
+ ... '<p><a href="/foo">Hey Guy</a></p>')
+ <p><a href="/foo"><span version="0">Hey</span> <span
+ version="1">Guy</span></a></p>
+
+Internals
+---------
+
+
+Some utility functions::
+
+ >>> from lxml.html.diff import fixup_ins_del_tags, split_unbalanced, split_trailing_whitespace
+ >>> def pfixup(text):
+ ... print(fixup_ins_del_tags(text).strip())
+ >>> pfixup('<ins><p>some text <b>and more text</b> and more</p></ins>')
+ <p><ins>some text <b>and more text</b> and more</ins></p>
+ >>> pfixup('<p><ins>Hi!</ins> you</p>')
+ <p><ins>Hi!</ins> you</p>
+ >>> pfixup('<div>Some text <ins>and <p>more text</p></ins> </div>')
+ <div>Some text <ins>and </ins><p><ins>more text</ins></p> </div>
+ >>> pfixup('''
+ ... <ins><table><tr><td>One table</td><td>More stuff</td></tr></table></ins>''')
+ <table><tr><td><ins>One table</ins></td><td><ins>More stuff</ins></td></tr></table>
+
+
+Testing split_unbalanced::
+
+ >>> split_unbalanced(['<a href="blah">', 'hey', '</a>'])
+ ([], ['<a href="blah">', 'hey', '</a>'], [])
+ >>> split_unbalanced(['<a href="blah">', 'hey'])
+ (['<a href="blah">'], ['hey'], [])
+ >>> split_unbalanced(['Hey', '</i>', 'You', '</b>'])
+ ([], ['Hey', 'You'], ['</i>', '</b>'])
+ >>> split_unbalanced(['So', '</i>', 'Hi', '<b>', 'There', '</b>'])
+ ([], ['So', 'Hi', '<b>', 'There', '</b>'], ['</i>'])
+ >>> split_unbalanced(['So', '</i>', 'Hi', '<b>', 'There'])
+ (['<b>'], ['So', 'Hi', 'There'], ['</i>'])
+
+
+Testing split_trailing_whitespace::
+
+ >>> split_trailing_whitespace('test\n\n')
+ ('test', '\n\n')
+ >>> split_trailing_whitespace(' test\n ')
+ (' test', '\n ')
+ >>> split_trailing_whitespace('test')
+ ('test', '')
diff --git a/src/lxml/html/tests/test_elementsoup.py b/src/lxml/html/tests/test_elementsoup.py
new file mode 100644
index 0000000..553586b
--- /dev/null
+++ b/src/lxml/html/tests/test_elementsoup.py
@@ -0,0 +1,128 @@
+import unittest, sys
+from lxml.tests.common_imports import make_doctest, HelperTestCase
+
+try:
+ import lxml.html.soupparser
+ BS_INSTALLED = True
+except ImportError:
+ if 'bs4' in sys.modules or 'BeautifulSoup' in sys.modules:
+ raise # seems we managed to import BS but not soupparser
+ BS_INSTALLED = False
+
+from lxml.html import tostring
+
+
+if BS_INSTALLED:
+ class SoupParserTestCase(HelperTestCase):
+ soupparser = lxml.html.soupparser
+
+ def test_broken_attribute(self):
+ html = """\
+ <html><head></head><body>
+ <form><input type='text' disabled size='10'></form>
+ </body></html>
+ """
+ root = self.soupparser.fromstring(html)
+ self.assertTrue(root.find('.//input').get('disabled') is not None)
+
+ def test_empty(self):
+ tree = self.soupparser.fromstring('')
+ res = b'''<html></html>'''
+ self.assertEqual(tostring(tree), res)
+
+ def test_text(self):
+ tree = self.soupparser.fromstring('huhu')
+ res = b'''<html>huhu</html>'''
+ self.assertEqual(tostring(tree), res)
+
+ def test_body(self):
+ html = '''<body><p>test</p></body>'''
+ res = b'''<html><body><p>test</p></body></html>'''
+ tree = self.soupparser.fromstring(html)
+ self.assertEqual(tostring(tree), res)
+
+ def test_head_body(self):
+ # HTML tag missing, parser should fix that
+ html = '<head><title>test</title></head><body><p>test</p></body>'
+ res = b'<html><head><title>test</title></head><body><p>test</p></body></html>'
+ tree = self.soupparser.fromstring(html)
+ self.assertEqual(tostring(tree), res)
+
+ def test_wrap_html(self):
+ # <head> outside <html>, parser should fix that
+ html = '<head><title>title</test></head><html><body/></html>'
+ res = b'<html><head><title>title</title></head><body></body></html>'
+ tree = self.soupparser.fromstring(html)
+ self.assertEqual(tostring(tree), res)
+
+ def test_comment_hyphen(self):
+ # These are really invalid XML as per specification
+ # https://www.w3.org/TR/REC-xml/#sec-comments
+ html = b'<html><!-- comment -- with double-hyphen --></html>'
+ tree = self.soupparser.fromstring(html)
+ self.assertEqual(tostring(tree), html)
+
+ html = b'<html><!-- comment ends with hyphen ---></html>'
+ tree = self.soupparser.fromstring(html)
+ self.assertEqual(tostring(tree), html)
+
+ def test_comment_pi(self):
+ html = '''<!-- comment -->
+<?test asdf?>
+<head><title>test</title></head><body><p>test</p></body>
+<!-- another comment -->'''
+ res = b'''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
+<!-- comment --><?test asdf?><html><head><title>test</title></head><body><p>test</p></body></html><!-- another comment -->'''
+ tree = self.soupparser.fromstring(html).getroottree()
+ self.assertEqual(tostring(tree, method='html'), res)
+
+ def test_doctype1(self):
+ # Test document type declaration, comments and PI's
+ # outside the root
+ html = \
+'''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+<!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar>'''
+
+ res = \
+b'''<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
+<!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar?>'''
+
+ tree = self.soupparser.fromstring(html).getroottree()
+ self.assertEqual(tree.docinfo.public_id, "-//W3C//DTD HTML 4.01//EN")
+ self.assertEqual(tostring(tree), res)
+
+ def test_doctype2(self):
+ # Test document type declaration, comments and PI's
+ # outside the root
+ html = \
+'''<!DOCTYPE html PUBLIC "-//IETF//DTD HTML//EN">
+<!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar?>'''
+
+ res = \
+b'''<!DOCTYPE html PUBLIC "-//IETF//DTD HTML//EN">
+<!--another comment--><html><head><title>My first HTML document</title></head><body><p>Hello world!</p></body></html><?foo bar?>'''
+
+ tree = self.soupparser.fromstring(html).getroottree()
+ self.assertEqual(tree.docinfo.public_id, "-//IETF//DTD HTML//EN")
+ self.assertEqual(tostring(tree), res)
+
+ def test_doctype_html5(self):
+ # html 5 doctype declaration
+ html = b'<!DOCTYPE html>\n<html lang="en"></html>'
+
+ tree = self.soupparser.fromstring(html).getroottree()
+ self.assertTrue(tree.docinfo.public_id is None)
+ self.assertEqual(tostring(tree), html)
+
+
+def test_suite():
+ suite = unittest.TestSuite()
+ if BS_INSTALLED:
+ suite.addTests([unittest.makeSuite(SoupParserTestCase)])
+ if sys.version_info[0] < 3:
+ suite.addTests([make_doctest('../../../../doc/elementsoup.txt')])
+ return suite
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/lxml/html/tests/test_feedparser_data.py b/src/lxml/html/tests/test_feedparser_data.py
new file mode 100644
index 0000000..29a500f
--- /dev/null
+++ b/src/lxml/html/tests/test_feedparser_data.py
@@ -0,0 +1,95 @@
+import os
+import re
+try:
+ from rfc822 import Message
+except ImportError:
+ # Python 3
+ from email import message_from_file as Message
+import unittest
+from lxml.tests.common_imports import doctest
+from lxml.doctestcompare import LHTMLOutputChecker
+
+from lxml.html.clean import clean, Cleaner
+
+feed_dirs = [
+ os.path.join(os.path.dirname(__file__), 'feedparser-data'),
+ os.path.join(os.path.dirname(__file__), 'hackers-org-data'),
+ ]
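+# A bar of dashes separates the input HTML from the expected output inside each .data file.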
+bar_re = re.compile(r"-----+")
+
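+# Minimal attribute container, used below to hand the expected output to the doctest output checker.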
+class DummyInput:
+ def __init__(self, **kw):
+ for name, value in kw.items():
+ setattr(self, name, value)
+
+class FeedTestCase(unittest.TestCase):
+
+ def __init__(self, filename):
+ self.filename = filename
+ unittest.TestCase.__init__(self)
+
+ def parse(self):
+ f = open(self.filename, 'r')
+ headers = Message(f)
+ c = f.read()
+ f.close()
+ if not c.strip():
+ c = headers.get_payload()
+ if not headers.keys():
+ raise Exception(
+ "File %s has no headers" % self.filename)
+ self.description = headers['Description']
+ self.expect = headers.get('Expect', '')
+ self.ignore = headers.get('Ignore')
+ self.options = [
+ o.strip() for o in headers.get('Options', '').split(',')
+ if o.strip()]
+ parts = bar_re.split(c)
+ self.input = parts[0].rstrip() + '\n'
+ if parts[1:]:
+ self.expect = parts[1].rstrip() + '\n'
+ else:
+ self.expect = None
+
+ def runTest(self):
+ self.parse()
+ if self.ignore:
+ # We've marked this test to be ignored.
+ return
+ kw = {}
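+ # Names from the "Options" header become boolean keyword flags:
+ # a leading '-' sets the flag to False, otherwise True.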
+ for name in self.options:
+ if name.startswith('-'):
+ kw[name[1:]] = False
+ else:
+ kw[name] = True
+ if kw.get('clean', True):
+ transformed = Cleaner(**kw).clean_html(self.input)
+ else:
+ transformed = self.input
+ assert self.expect is not None, (
+ "No expected output in %s" % self.filename)
+ checker = LHTMLOutputChecker()
+ if not checker.check_output(self.expect, transformed, 0):
+ result = checker.output_difference(
+ DummyInput(want=self.expect), transformed, 0)
+ #result += '\noptions: %s %r' % (', '.join(self.options), kw)
+ #result += repr(transformed)
+ raise Exception("\n"+result)
+
+ def shortDescription(self):
+ return self.filename
+
+def test_suite():
+ suite = unittest.TestSuite()
+ for dir in feed_dirs:
+ for fn in os.listdir(dir):
+ fn = os.path.join(dir, fn)
+ if fn.endswith('.data'):
+ case = FeedTestCase(fn)
+ suite.addTests([case])
+ # This is my lazy way of stopping on first error:
+ try:
+ case.runTest()
+ except:
+ break
+ return suite
diff --git a/src/lxml/html/tests/test_formfill.py b/src/lxml/html/tests/test_formfill.py
new file mode 100644
index 0000000..0f53518
--- /dev/null
+++ b/src/lxml/html/tests/test_formfill.py
@@ -0,0 +1,7 @@
+import unittest
+from lxml.tests.common_imports import make_doctest
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([make_doctest('test_formfill.txt')])
+ return suite
diff --git a/src/lxml/html/tests/test_formfill.txt b/src/lxml/html/tests/test_formfill.txt
new file mode 100644
index 0000000..7e1ed6f
--- /dev/null
+++ b/src/lxml/html/tests/test_formfill.txt
@@ -0,0 +1,112 @@
+Some basic imports:
+
+ >>> from lxml.html import usedoctest
+ >>> from lxml.html.formfill import fill_form_html
+
+The simplest kind of filling is just filling an input with a value:
+
+ >>> print(fill_form_html('''
+ ... <form><input type="text" name="foo"></form>''', dict(foo='bar')))
+ <form><input type="text" name="foo" value="bar"></form>
+
+You can also fill multiple inputs, like:
+
+ >>> print(fill_form_html('''
+ ... <form>
+ ... <input type="text" name="foo">
+ ... <input type="text" name="foo">
+ ... </form>''', dict(foo=['bar1', 'bar2'])))
+ <form>
+ <input type="text" name="foo" value="bar1">
+ <input type="text" name="foo" value="bar2">
+ </form>
+
+Checkboxes can work either as boolean true/false, or be selected based
+on their inclusion in a set of values::
+
+ >>> print(fill_form_html('''
+ ... <form>
+ ... Would you like to be spammed?
+ ... <input type="checkbox" name="spam_me"> <br>
+ ... Spam you'd like to receive:<br>
+ ... Viagra spam:
+ ... <input type="checkbox" name="type" value="viagra"><br>
+ ... Stock spam:
+ ... <input type="checkbox" name="type" value="stock"><br>
+ ... Other spam:
+ ... <input type="checkbox" name="type" value="other"><br>
+ ... <input type="submit" value="Spam!">
+ ... </form>''', dict(spam_me=True, type=['viagra', 'other'])))
+ <form>
+ Would you like to be spammed?
+ <input type="checkbox" name="spam_me" checked> <br>
+ Spam you'd like to receive:<br>
+ Viagra spam:
+ <input type="checkbox" name="type" value="viagra" checked><br>
+ Stock spam:
+ <input type="checkbox" name="type" value="stock"><br>
+ Other spam:
+ <input type="checkbox" name="type" value="other" checked><br>
+ <input type="submit" value="Spam!">
+ </form>
+
+FIXME: I need to test more of this. But I'm lazy and want to use the
+coverage report for some of this.
+
+
+This module also allows you to add error messages to the form. The errors
+add an "error" class to the input fields, and to their labels if the field
+has a label. It also inserts an error message into the form, using a
+function you can provide (or the default function).
+
+Example::
+
+ >>> from lxml.html.formfill import insert_errors_html
+ >>> print(insert_errors_html('''
+ ... <form>
+ ... <fieldset id="fieldset">
+ ... <input name="v1"><br>
+ ... <label for="v2">label</label>
+ ... <input name="v2" id="v2"><br>
+ ... </fieldset>
+ ... <input name="v3" class="foo">
+ ... <input name="v3" class="foo">
+ ... <input name="v4">
+ ... <input name="v4">
+ ... </form>''', {
+ ... 'v1': "err1",
+ ... 'v2': "err2",
+ ... 'v3': [None, "err3-2"],
+ ... 'v4': "err4",
+ ... None: 'general error',
+ ... '#fieldset': 'area error',
+ ... }))
+ <form>
+ <div class="error-message error-block">general error</div>
+ <fieldset id="fieldset" class="error">
+ <div class="error-message error-block">area error</div>
+ <div class="error-message">err1</div>
+ <input name="v1" class="error"><br>
+ <label for="v2" class="error">label</label>
+ <div class="error-message">err2</div>
+ <input name="v2" id="v2" class="error"><br>
+ </fieldset>
+ <input name="v3" class="foo">
+ <div class="error-message">err3-2</div>
+ <input name="v3" class="foo error">
+ <div class="error-message">err4</div>
+ <input name="v4" class="error">
+ <input name="v4">
+ </form>
+
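+The markup above comes from the default error creator. A custom function can
+be supplied instead; the sketch below is illustrative only (it is not executed
+here), and the ``error_creator`` keyword and its ``(el, is_block, message)``
+signature are assumptions about ``lxml.html.formfill`` rather than something
+this test verifies::
+
+    def shouty_error(el, is_block, message):
+        # Hypothetical creator: it would insert 'message', e.g. wrapped in a
+        # <b class="error-message"> element, next to the offending field 'el'.
+        ...
+
+    # insert_errors_html(form_html, errors, error_creator=shouty_error)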
+
+REGRESSION: When filling textareas, the "name" attribute used to
+be removed. The "name" attribute should be kept::
+
+ >>> print(fill_form_html('''
+ ... <form>
+ ... <textarea name="foo">Initial value</textarea>
+ ... </form>''', dict(foo="Bar")))
+ <form>
+ <textarea name="foo">Bar</textarea>
+ </form>
diff --git a/src/lxml/html/tests/test_forms.py b/src/lxml/html/tests/test_forms.py
new file mode 100644
index 0000000..37a0327
--- /dev/null
+++ b/src/lxml/html/tests/test_forms.py
@@ -0,0 +1,10 @@
+import unittest
+from lxml.tests.common_imports import make_doctest
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([make_doctest('test_forms.txt')])
+ return suite
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/lxml/html/tests/test_forms.txt b/src/lxml/html/tests/test_forms.txt
new file mode 100644
index 0000000..5d7d513
--- /dev/null
+++ b/src/lxml/html/tests/test_forms.txt
@@ -0,0 +1,239 @@
+>>> from lxml.html import usedoctest
+>>> from lxml.html import fromstring, tostring
+>>> h = fromstring('''<html><body>
+... <form action="test">
+... <input type="hidden" name="hidden_field" value="hidden_value">
+... <input type="text" name="text_field" value="text_value">
+... <input type="checkbox" name="single_checkbox">
+... <input type="checkbox" name="single_checkbox2" value="good">
+... <input type="checkbox" name="check_group" value="1">
+... <input type="checkbox" name="check_group" value="2" checked>
+... <input type="checkbox" name="check_group" value="3" checked>
+... <input type="checkbox" name="check_group" value="4">
+... <textarea name="textarea_field">some text</textarea>
+... <label for="value1">value 1</label>
+... <input type="radio" name="radios" value="value1" id="value1">
+... <label for="value2">value 2</label>
+... <input type="radio" name="radios" value="value2" id="value2">
+... <label for="value3">value 3</label>
+... <input type="radio" name="radios" value="value3" id="value3" checked>
+... <select name="select1">
+... <option> No value </option>
+... <option value="">Empty</option>
+... <option value="1">number 1</option>
+... </select>
+... <select name="select2" multiple>
+... <option value="1">number 1</option>
+... <option value="2">number 2</option>
+... <option value="3">number 3</option>
+... <option>number 4</option>
+... </select>
+... <select name="select3">
+... <option value="01 " selected>text 1</option>
+... <option value=" 02">text 2</option>
+... </select>
+... <select name="select4" multiple>
+... <option value="01 " selected>text 1</option>
+... <option value=" 02">text 2</option>
+... </select>
+... <input type="file" name="file_field" value="nonsense_value">
+... <input type="submit" name="submit1" value="submit">
+... <input type="submit" name="submit2" value="submit">
+... <input type="reset" name="reset1">linksys
+... </form>
+... </body></html>''', base_url='http://example.org/form.html')
+>>> h.base_url
+u'http://example.org/form.html'
+>>> f = h.forms[0]
+>>> f.action
+u'http://example.org/test'
+>>> f.method
+'GET'
+
+>>> f.inputs # doctest:+NOPARSE_MARKUP
+<InputGetter for form 0>
+>>> len(f.inputs)
+20
+>>> len(list(f.inputs))
+20
+>>> len(f.inputs.keys())
+15
+>>> len(f.inputs.items())
+15
+>>> len([f.inputs[name] for name in f.inputs.keys()])
+15
+
+>>> hidden = f.inputs['hidden_field']
+>>> hidden.checkable
+False
+>>> hidden.value
+'hidden_value'
+>>> hidden.value = 'new value'
+>>> tostring(hidden, with_tail=False)
+b'<input type="hidden" name="hidden_field" value="new value">'
+>>> checkbox = f.inputs['single_checkbox']
+>>> checkbox.checkable
+True
+>>> checkbox.type
+'checkbox'
+>>> checkbox.checked
+False
+>>> print(checkbox.value)
+None
+>>> checkbox.checked = True
+>>> checkbox.value
+'on'
+>>> tostring(checkbox, with_tail=False)
+b'<input type="checkbox" name="single_checkbox" checked>'
+>>> checkbox2 = f.inputs['single_checkbox2']
+>>> checkbox2.checked = True
+>>> checkbox2.value
+'good'
+>>> group = f.inputs['check_group']
+>>> group.value # doctest:+NOPARSE_MARKUP
+<CheckboxValues {'2', '3'} for checkboxes name='check_group'>
+>>> group.value.add('1')
+>>> group.value # doctest:+NOPARSE_MARKUP
+<CheckboxValues {'1', '2', '3'} for checkboxes name='check_group'>
+>>> tostring(group[0], with_tail=False)
+b'<input type="checkbox" name="check_group" value="1" checked>'
+>>> group.value_options
+['1', '2', '3', '4']
+>>> group.value.add('doesnotexist')
+Traceback (most recent call last):
+ ...
+KeyError: "No checkbox with value 'doesnotexist'"
+>>> textarea = f.inputs['textarea_field']
+>>> textarea.value
+'some text'
+>>> radios = f.inputs['radios']
+>>> radios[0].label.text
+'value 1'
+>>> radios.value
+'value3'
+>>> radios.value = 'value1'
+>>> radios.value
+'value1'
+>>> tostring(radios[0], with_tail=False)
+b'<input type="radio" name="radios" value="value1" id="value1" checked>'
+>>> radios.value = None
+>>> tostring(radios[0], with_tail=False)
+b'<input type="radio" name="radios" value="value1" id="value1">'
+>>> radios.value_options
+['value1', 'value2', 'value3']
+>>> select = f.inputs['select1']
+>>> print(select.value)
+No value
+>>> select.value = ""
+>>> select.value
+''
+>>> select.value = 'asdf'
+Traceback (most recent call last):
+ ...
+ValueError: There is no option with the value of 'asdf'
+>>> select.value_options
+['No value', '', '1']
+>>> select.value = 'No value'
+>>> select.value
+'No value'
+>>> select = f.inputs['select2']
+>>> select.value # doctest:+NOPARSE_MARKUP
+<MultipleSelectOptions {} for select name='select2'>
+>>> select.value.update(['2', '3'])
+>>> select.value # doctest:+NOPARSE_MARKUP
+<MultipleSelectOptions {'2', '3'} for select name='select2'>
+>>> select.value.remove('3')
+>>> select.value.add('asdf')
+Traceback (most recent call last):
+ ...
+ValueError: There is no option with the value 'asdf'
+>>> select.value.add('number 4')
+>>> select.value # doctest:+NOPARSE_MARKUP
+<MultipleSelectOptions {'2', 'number 4'} for select name='select2'>
+>>> select.value.remove('number 4')
+>>> select.value_options
+['1', '2', '3', 'number 4']
+>>> select = f.inputs['select3']
+>>> select.value
+'01 '
+>>> select.value_options
+['01 ', ' 02']
+>>> select.value = " 02"
+>>> select.value
+' 02'
+>>> select = f.inputs['select4']
+>>> select.value # doctest:+NOPARSE_MARKUP
+<MultipleSelectOptions {'01 '} for select name='select4'>
+>>> select.value.add(' 02')
+>>> select.value # doctest:+NOPARSE_MARKUP
+<MultipleSelectOptions {'01 ', ' 02'} for select name='select4'>
+>>> try: from urllib import urlencode
+... except ImportError: from urllib.parse import urlencode
+>>> print(urlencode(f.form_values()))
+hidden_field=new+value&text_field=text_value&single_checkbox=on&single_checkbox2=good&check_group=1&check_group=2&check_group=3&textarea_field=some+text&select1=No+value&select2=2&select3=+02&select4=01+&select4=+02
+>>> fields = f.fields
+>>> fields # doctest:+NOPARSE_MARKUP
+<FieldsDict for form 0>
+>>> len(fields)
+20
+>>> for name, value in sorted(fields.items()):
+... print('%s: %r' % (name, value))
+check_group: <CheckboxValues {'1', '2', '3'} for checkboxes name='check_group'>
+file_field: 'nonsense_value'
+hidden_field: 'new value'
+radios: None
+reset1: None
+select1: 'No value'
+select2: <MultipleSelectOptions {'2'} for select name='select2'>
+select3: ' 02'
+select4: <MultipleSelectOptions {'01 ', ' 02'} for select name='select4'>
+single_checkbox: 'on'
+single_checkbox2: 'good'
+submit1: 'submit'
+submit2: 'submit'
+text_field: 'text_value'
+textarea_field: 'some text'
+
+>>> import lxml.html
+>>> tree = lxml.html.fromstring('''
+... <html><body>
+... <form>
+... <input name="foo" value="bar" disabled/>
+... <input type="submit" />
+... </form>
+... </body></html>
+... ''')
+>>> tree # doctest: +ELLIPSIS
+<Element html at ...>
+>>> tree.forms[0] # doctest: +ELLIPSIS
+<Element form at ...>
+>>> tree.forms[0].fields # doctest: +NOPARSE_MARKUP
+<FieldsDict for form 0>
+>>> len(tree.forms[0].fields)
+2
+>>> list(tree.forms[0].fields.keys())
+['foo']
+>>> list(tree.forms[0].fields.items())
+[('foo', 'bar')]
+>>> list(tree.forms[0].fields.values())
+['bar']
+
+>>> ('foo', 'bar') not in tree.forms[0].form_values()
+True
+>>> tree = lxml.html.fromstring('''
+... <html><body>
+... <form>
+... <textarea name="foo">some <b>text<br>content</b> with tags</textarea>
+... </form>
+... </body></html>
+... ''')
+>>> list(tree.forms[0].fields.keys())
+['foo']
+>>> ta = tree.forms[0].inputs['foo']
+>>> print(ta.value)
+some <b>text<br>content</b> with tags
+>>> ta.value = 'abc<br>def'
+>>> print(ta.value)
+abc<br>def
+>>> len(ta)
+0
diff --git a/src/lxml/html/tests/test_frames.py b/src/lxml/html/tests/test_frames.py
new file mode 100644
index 0000000..2eeb844
--- /dev/null
+++ b/src/lxml/html/tests/test_frames.py
@@ -0,0 +1,36 @@
+import unittest, sys
+from lxml.tests.common_imports import make_doctest, doctest
+import lxml.html
+from lxml.html import html_parser, XHTML_NAMESPACE
+
+class FrameTest(unittest.TestCase):
+
+ def test_parse_fragments_fromstring(self):
+ parser = lxml.html.HTMLParser(encoding='utf-8', remove_comments=True)
+ html = """<frameset>
+ <frame src="main.php" name="srcpg" id="srcpg" frameborder="0" rolling="Auto" marginwidth="" marginheight="0">
+ </frameset>"""
+ etree_document = lxml.html.fragments_fromstring(html, parser=parser)
+ self.assertEqual(len(etree_document), 1)
+ root = etree_document[0]
+ self.assertEqual(root.tag, "frameset")
+ frame_element = root[0]
+ self.assertEqual(frame_element.tag, 'frame')
+
+ def test_parse_fromstring(self):
+ parser = lxml.html.HTMLParser(encoding='utf-8', remove_comments=True)
+ html = """<html><frameset>
+ <frame src="main.php" name="srcpg" id="srcpg" frameborder="0" rolling="Auto" marginwidth="" marginheight="0">
+ </frameset></html>"""
+ etree_document = lxml.html.fromstring(html, parser=parser)
+ self.assertEqual(etree_document.tag, 'html')
+ self.assertEqual(len(etree_document), 1)
+ frameset_element = etree_document[0]
+ self.assertEqual(len(frameset_element), 1)
+ frame_element = frameset_element[0]
+ self.assertEqual(frame_element.tag, 'frame')
+
+
+def test_suite():
+ loader = unittest.TestLoader()
+ return loader.loadTestsFromModule(sys.modules[__name__])
\ No newline at end of file
diff --git a/src/lxml/html/tests/test_html5parser.py b/src/lxml/html/tests/test_html5parser.py
new file mode 100644
index 0000000..56afe98
--- /dev/null
+++ b/src/lxml/html/tests/test_html5parser.py
@@ -0,0 +1,430 @@
+import os
+import imp
+try:
+ from StringIO import StringIO
+except ImportError: # python 3
+ from io import StringIO
+import sys
+import tempfile
+import unittest
+from unittest import skipUnless
+
+from lxml.builder import ElementMaker
+from lxml.etree import Element, ElementTree, ParserError
+from lxml.html import html_parser, XHTML_NAMESPACE
+
+try:
+ import urlparse
+except ImportError:
+ import urllib.parse as urlparse
+
+try:
+ from urllib import pathname2url
+except ImportError:
+ from urllib.request import pathname2url
+
+
+def path2url(path):
+ return urlparse.urljoin(
+ 'file:', pathname2url(path))
+
+
+try:
+ import html5lib
+except ImportError:
+ html5lib = None
+
+ class BogusModules(object):
+ # See PEP 302 for details on how this works
+ def __init__(self, mocks):
+ self.mocks = mocks
+
+ def find_module(self, fullname, path=None):
+ if fullname in self.mocks:
+ return self
+ return None
+
+ def load_module(self, fullname):
+ mod = sys.modules.setdefault(fullname, imp.new_module(fullname))
+ mod.__file__, mod.__loader__, mod.__path__ = "<dummy>", self, []
+ mod.__dict__.update(self.mocks[fullname])
+ return mod
+
+ # Fake just enough of html5lib so that html5parser.py is importable
+ # without errors.
+ sys.meta_path.append(BogusModules({
+ 'html5lib': {
+ # A do-nothing HTMLParser class
+ 'HTMLParser': type('HTMLParser', (object,), {
+ '__init__': lambda self, **kw: None,
+ }),
+ },
+ 'html5lib.treebuilders': {
+ },
+ 'html5lib.treebuilders.etree_lxml': {
+ 'TreeBuilder': 'dummy treebuilder',
+ },
+ }))
+
+
+class Test_HTMLParser(unittest.TestCase):
+ def make_one(self, **kwargs):
+ from lxml.html.html5parser import HTMLParser
+ return HTMLParser(**kwargs)
+
+ @skipUnless(html5lib, 'html5lib is not installed')
+ def test_integration(self):
+ parser = self.make_one(strict=True)
+ tree = parser.parse(XHTML_TEST_DOCUMENT)
+ root = tree.getroot()
+ self.assertEqual(root.tag, xhtml_tag('html'))
+
+
+class Test_XHTMLParser(unittest.TestCase):
+ def make_one(self, **kwargs):
+ from lxml.html.html5parser import XHTMLParser
+ return XHTMLParser(**kwargs)
+
+ @skipUnless(hasattr(html5lib, 'XHTMLParser'),
+ 'html5lib does not have XHTMLParser')
+ def test_integration(self):
+ # XXX: This test is untested. (html5lib no longer has an XHTMLParser)
+ parser = self.make_one(strict=True)
+ tree = parser.parse(XHTML_TEST_DOCUMENT)
+ root = tree.getroot()
+ self.assertEqual(root.tag, xhtml_tag('html'))
+
+
+class Test_document_fromstring(unittest.TestCase):
+ def call_it(self, *args, **kwargs):
+ from lxml.html.html5parser import document_fromstring
+ return document_fromstring(*args, **kwargs)
+
+ def test_basic(self):
+ parser = DummyParser(doc=DummyElementTree(root='dummy root'))
+ elem = self.call_it(b'dummy input', parser=parser)
+ self.assertEqual(elem, 'dummy root')
+ self.assertEqual(parser.parse_args, (b'dummy input',))
+ self.assertEqual(parser.parse_kwargs, {'useChardet': True})
+
+ def test_guess_charset_not_used_for_unicode(self):
+ parser = DummyParser()
+ elem = self.call_it(b''.decode('ascii'), parser=parser)
+ self.assertEqual(parser.parse_kwargs, {})
+
+ def test_guess_charset_arg_gets_passed_to_parser(self):
+ parser = DummyParser()
+ elem = self.call_it(b'', guess_charset='gc_arg', parser=parser)
+ self.assertEqual(parser.parse_kwargs, {'useChardet': 'gc_arg'})
+
+ def test_raises_type_error_on_nonstring_input(self):
+ not_a_string = None
+ self.assertRaises(TypeError, self.call_it, not_a_string)
+
+ @skipUnless(html5lib, 'html5lib is not installed')
+ def test_integration(self):
+ elem = self.call_it(XHTML_TEST_DOCUMENT)
+ self.assertEqual(elem.tag, xhtml_tag('html'))
+
+
+class Test_fragments_fromstring(unittest.TestCase):
+ def call_it(self, *args, **kwargs):
+ from lxml.html.html5parser import fragments_fromstring
+ return fragments_fromstring(*args, **kwargs)
+
+ def test_basic(self):
+ parser = DummyParser(fragments='fragments')
+ fragments = self.call_it(b'dummy input', parser=parser)
+ self.assertEqual(fragments, 'fragments')
+ self.assertEqual(parser.parseFragment_kwargs, {'useChardet': False})
+
+ def test_guess_charset_arg_gets_passed_to_parser(self):
+ parser = DummyParser()
+ elem = self.call_it(b'', guess_charset='gc_arg', parser=parser)
+ self.assertEqual(parser.parseFragment_kwargs, {'useChardet': 'gc_arg'})
+
+ def test_guess_charset_not_used_for_unicode(self):
+ parser = DummyParser()
+ elem = self.call_it(b''.decode('ascii'), parser=parser)
+ self.assertEqual(parser.parseFragment_kwargs, {})
+
+ def test_raises_type_error_on_nonstring_input(self):
+ not_a_string = None
+ self.assertRaises(TypeError, self.call_it, not_a_string)
+
+ def test_no_leading_text_strips_empty_leading_text(self):
+ parser = DummyParser(fragments=['', 'tail'])
+ fragments = self.call_it('', parser=parser, no_leading_text=True)
+ self.assertEqual(fragments, ['tail'])
+
+ def test_no_leading_text_raises_error_if_leading_text(self):
+ parser = DummyParser(fragments=['leading text', 'tail'])
+ self.assertRaises(ParserError, self.call_it,
+ '', parser=parser, no_leading_text=True)
+
+ @skipUnless(html5lib, 'html5lib is not installed')
+ def test_integration(self):
+ fragments = self.call_it('a<b>c</b>')
+ self.assertEqual(len(fragments), 2)
+ self.assertEqual(fragments[0], 'a')
+ self.assertEqual(fragments[1].tag, xhtml_tag('b'))
+
+
+class Test_fragment_fromstring(unittest.TestCase):
+ def call_it(self, *args, **kwargs):
+ from lxml.html.html5parser import fragment_fromstring
+ return fragment_fromstring(*args, **kwargs)
+
+ def test_basic(self):
+ element = DummyElement()
+ parser = DummyParser(fragments=[element])
+ self.assertEqual(self.call_it('html', parser=parser), element)
+
+ def test_raises_type_error_on_nonstring_input(self):
+ not_a_string = None
+ self.assertRaises(TypeError, self.call_it, not_a_string)
+
+ def test_create_parent(self):
+ parser = DummyParser(fragments=['head', Element('child')])
+ elem = self.call_it('html', parser=parser, create_parent='parent')
+ self.assertEqual(elem.tag, 'parent')
+ self.assertEqual(elem.text, 'head')
+ self.assertEqual(elem[0].tag, 'child')
+
+ def test_create_parent_default_type_no_ns(self):
+ parser = DummyParser(fragments=[], namespaceHTMLElements=False)
+ elem = self.call_it('html', parser=parser, create_parent=True)
+ self.assertEqual(elem.tag, 'div')
+
+ def test_raises_error_on_leading_text(self):
+ parser = DummyParser(fragments=['leading text'])
+ self.assertRaises(ParserError, self.call_it, 'html', parser=parser)
+
+ def test_raises_error_if_no_elements_found(self):
+ parser = DummyParser(fragments=[])
+ self.assertRaises(ParserError, self.call_it, 'html', parser=parser)
+
+ def test_raises_error_if_multiple_elements_found(self):
+ parser = DummyParser(fragments=[DummyElement(), DummyElement()])
+ self.assertRaises(ParserError, self.call_it, 'html', parser=parser)
+
+ def test_raises_error_if_tail(self):
+ parser = DummyParser(fragments=[DummyElement(tail='tail')])
+ self.assertRaises(ParserError, self.call_it, 'html', parser=parser)
+
+
+class Test_fromstring(unittest.TestCase):
+ def call_it(self, *args, **kwargs):
+ from lxml.html.html5parser import fromstring
+ return fromstring(*args, **kwargs)
+
+ def test_returns_whole_doc_if_input_contains_html_tag(self):
+ parser = DummyParser(root='the doc')
+ self.assertEqual(self.call_it('<html></html>', parser=parser),
+ 'the doc')
+
+ def test_returns_whole_doc_if_input_contains_doctype(self):
+ parser = DummyParser(root='the doc')
+ self.assertEqual(self.call_it('<!DOCTYPE html>', parser=parser),
+ 'the doc')
+
+ def test_returns_whole_doc_if_input_is_encoded(self):
+ parser = DummyParser(root='the doc')
+ input = '<!DOCTYPE html>'.encode('ascii')
+ self.assertEqual(self.call_it(input, parser=parser),
+ 'the doc')
+
+ def test_returns_whole_doc_if_head_not_empty(self, use_ns=True):
+ E = HTMLElementMaker(namespaceHTMLElements=use_ns)
+ root = E.html(E.head(E.title()))
+ parser = DummyParser(root=root)
+ self.assertEqual(self.call_it('', parser=parser), root)
+
+ def test_returns_whole_doc_if_head_not_empty_no_ns(self):
+ self.test_returns_whole_doc_if_head_not_empty(use_ns=False)
+
+ def test_returns_unwraps_body_if_single_element(self):
+ E = HTMLElementMaker()
+ elem = E.p('test')
+ root = E.html(E.head(), E.body(elem))
+ parser = DummyParser(root=root)
+ self.assertEqual(self.call_it('', parser=parser), elem)
+
+ def test_returns_body_if_has_text(self):
+ E = HTMLElementMaker()
+ elem = E.p('test')
+ body = E.body('text', elem)
+ root = E.html(E.head(), body)
+ parser = DummyParser(root=root)
+ self.assertEqual(self.call_it('', parser=parser), body)
+
+ def test_returns_body_if_single_element_has_tail(self):
+ E = HTMLElementMaker()
+ elem = E.p('test')
+ elem.tail = 'tail'
+ body = E.body(elem)
+ root = E.html(E.head(), body)
+ parser = DummyParser(root=root)
+ self.assertEqual(self.call_it('', parser=parser), body)
+
+ def test_wraps_multiple_fragments_in_div_no_ns(self):
+ E = HTMLElementMaker(namespaceHTMLElements=False)
+ parser = DummyParser(root=E.html(E.head(), E.body(E.h1(), E.p())),
+ namespaceHTMLElements=False)
+ elem = self.call_it('', parser=parser)
+ self.assertEqual(elem.tag, 'div')
+
+ def test_wraps_multiple_fragments_in_span_no_ns(self):
+ E = HTMLElementMaker(namespaceHTMLElements=False)
+ parser = DummyParser(root=E.html(E.head(), E.body('foo', E.a('link'))),
+ namespaceHTMLElements=False)
+ elem = self.call_it('', parser=parser)
+ self.assertEqual(elem.tag, 'span')
+
+ def test_raises_type_error_on_nonstring_input(self):
+ not_a_string = None
+ self.assertRaises(TypeError, self.call_it, not_a_string)
+
+ @skipUnless(html5lib, 'html5lib is not installed')
+ def test_integration_whole_doc(self):
+ elem = self.call_it(XHTML_TEST_DOCUMENT)
+ self.assertEqual(elem.tag, xhtml_tag('html'))
+
+ @skipUnless(html5lib, 'html5lib is not installed')
+ def test_integration_single_fragment(self):
+ elem = self.call_it('<p></p>')
+ self.assertEqual(elem.tag, xhtml_tag('p'))
+
+
+class Test_parse(unittest.TestCase):
+ def call_it(self, *args, **kwargs):
+ from lxml.html.html5parser import parse
+ return parse(*args, **kwargs)
+
+ def make_temp_file(self, contents=''):
+ tmpfile = tempfile.NamedTemporaryFile(delete=False)
+ try:
+ tmpfile.write(contents.encode('utf8'))
+ tmpfile.flush()
+ tmpfile.seek(0)
+ return tmpfile
+ except Exception:
+ try:
+ tmpfile.close()
+ finally:
+ os.unlink(tmpfile.name)
+ raise
+
+ def test_with_file_object(self):
+ parser = DummyParser(doc='the doc')
+ fp = open(__file__)
+ try:
+ self.assertEqual(self.call_it(fp, parser=parser), 'the doc')
+ self.assertEqual(parser.parse_args, (fp,))
+ finally:
+ fp.close()
+
+ def test_with_file_name(self):
+ parser = DummyParser(doc='the doc')
+ tmpfile = self.make_temp_file('data')
+ try:
+ data = tmpfile.read()
+ finally:
+ tmpfile.close()
+ try:
+ self.assertEqual(self.call_it(tmpfile.name, parser=parser), 'the doc')
+ fp, = parser.parse_args
+ try:
+ self.assertEqual(fp.read(), data)
+ finally:
+ fp.close()
+ finally:
+ os.unlink(tmpfile.name)
+
+ def test_with_url(self):
+ parser = DummyParser(doc='the doc')
+ tmpfile = self.make_temp_file('content')
+ try:
+ data = tmpfile.read()
+ finally:
+ tmpfile.close()
+ try:
+ url = path2url(tmpfile.name)
+ self.assertEqual(self.call_it(url, parser=parser), 'the doc')
+ fp, = parser.parse_args
+ try:
+ self.assertEqual(fp.read(), data)
+ finally:
+ fp.close()
+ finally:
+ os.unlink(tmpfile.name)
+
+ @skipUnless(html5lib, 'html5lib is not installed')
+ def test_integration(self):
+ doc = self.call_it(StringIO(XHTML_TEST_DOCUMENT))
+ root = doc.getroot()
+ self.assertEqual(root.tag, xhtml_tag('html'))
+
+
+def test_suite():
+ loader = unittest.TestLoader()
+ return loader.loadTestsFromModule(sys.modules[__name__])
+
+
+class HTMLElementMaker(ElementMaker):
+ def __init__(self, namespaceHTMLElements=True):
+ initargs = dict(makeelement=html_parser.makeelement)
+ if namespaceHTMLElements:
+ initargs.update(namespace=XHTML_NAMESPACE,
+ nsmap={None: XHTML_NAMESPACE})
+ ElementMaker.__init__(self, **initargs)
+
+
+class DummyParser(object):
+ def __init__(self, doc=None, root=None,
+ fragments=None, namespaceHTMLElements=True):
+ self.doc = doc or DummyElementTree(root=root)
+ self.fragments = fragments
+ self.tree = DummyTreeBuilder(namespaceHTMLElements)
+
+ def parse(self, *args, **kwargs):
+ self.parse_args = args
+ self.parse_kwargs = kwargs
+ return self.doc
+
+ def parseFragment(self, *args, **kwargs):
+ self.parseFragment_args = args
+ self.parseFragment_kwargs = kwargs
+ return self.fragments
+
+
+class DummyTreeBuilder(object):
+ def __init__(self, namespaceHTMLElements=True):
+ self.namespaceHTMLElements = namespaceHTMLElements
+
+
+class DummyElementTree(object):
+ def __init__(self, root):
+ self.root = root
+
+ def getroot(self):
+ return self.root
+
+
+class DummyElement(object):
+ def __init__(self, tag='tag', tail=None):
+ self.tag = tag
+ self.tail = tail
+
+
+def xhtml_tag(tag):
+ return '{%s}%s' % (XHTML_NAMESPACE, tag)
+
+
+XHTML_TEST_DOCUMENT = '''
+ <!DOCTYPE html>
+ <html>
+ <head><title>TITLE</title></head>
+ <body></body>
+ </html>
+ '''
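
The tests above drive ``lxml.html.html5parser`` through a dummy parser.  For
orientation, a minimal usage sketch (assuming ``html5lib`` is installed; the
file name ``page.html`` is just a placeholder)::

    from lxml.html import html5parser

    # A complete document keeps its <html> root; elements end up in the
    # XHTML namespace by default.
    doc = html5parser.fromstring(
        '<!DOCTYPE html><html><body><p>hi</p></body></html>')
    print(doc.tag)   # '{http://www.w3.org/1999/xhtml}html'

    # A single fragment is unwrapped from the generated <body>.
    frag = html5parser.fromstring('<p>hi</p>')
    print(frag.tag)  # '{http://www.w3.org/1999/xhtml}p'

    # parse() accepts file objects, file names and URLs.
    tree = html5parser.parse('page.html')
    root = tree.getroot()
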
diff --git a/src/lxml/html/tests/test_rewritelinks.py b/src/lxml/html/tests/test_rewritelinks.py
new file mode 100644
index 0000000..100105f
--- /dev/null
+++ b/src/lxml/html/tests/test_rewritelinks.py
@@ -0,0 +1,10 @@
+import unittest
+from lxml.tests.common_imports import make_doctest
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([make_doctest('test_rewritelinks.txt')])
+ return suite
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/lxml/html/tests/test_rewritelinks.txt b/src/lxml/html/tests/test_rewritelinks.txt
new file mode 100644
index 0000000..9bd60af
--- /dev/null
+++ b/src/lxml/html/tests/test_rewritelinks.txt
@@ -0,0 +1,264 @@
+
+Setup::
+
+ >>> import lxml.html
+
+We'll define a link translation function:
+
+ >>> base_href = 'http://old/base/path.html'
+ >>> try: import urlparse
+ ... except ImportError: import urllib.parse as urlparse
+ >>> def relocate_href(link):
+ ... link = urlparse.urljoin(base_href, link)
+ ... if link.startswith('http://old'):
+ ... return 'https://new' + link[len('http://old'):]
+ ... else:
+ ... return link
+
+Now for some content.  The ``lxml.html.usedoctest`` import below makes
+the doctest comparison normalize the HTML output for us.
+
+Some basics::
+
+ >>> from lxml.html import usedoctest, tostring
+ >>> from lxml.html import rewrite_links
+ >>> print(rewrite_links(
+ ... '<a href="http://old/blah/blah.html">link</a>', relocate_href))
+ <a href="https://new/blah/blah.html">link</a>
+ >>> print(rewrite_links(
+ ... '<script src="http://old/foo.js"></script>', relocate_href))
+ <script src="https://new/foo.js"></script>
+ >>> print(rewrite_links(
+ ... '<link href="foo.css">', relocate_href))
+ <link href="https://new/base/foo.css">
+ >>> print(rewrite_links('''\
+ ... <base href="http://blah/stuff/index.html">
+ ... <link href="foo.css">
+ ... <a href="http://old/bar.html">x</a>\
+ ... ''', relocate_href))
+ <link href="http://blah/stuff/foo.css">
+ <a href="https://new/bar.html">x</a>
+
+Links in CSS are also handled::
+
+ >>> print(rewrite_links('''
+ ... <style>
+ ... body {background-image: url(http://old/image.gif)};
+ ... @import "http://old/other-style.css";
+ ... </style>''', relocate_href))
+ <html><head><style>
+ body {background-image: url(https://new/image.gif)};
+ @import "https://new/other-style.css";
+ </style></head></html>
+ >>> print(rewrite_links('''
+ ... <style>
+ ... body {background-image: url("http://old/image.gif")};
+ ... @import "http://old/other-style.css";
+ ... </style>''', relocate_href))
+ <html><head><style>
+ body {background-image: url("https://new/image.gif")};
+ @import "https://new/other-style.css";
+ </style></head></html>
+
+Those links in style attributes are also rewritten::
+
+ >>> print(rewrite_links('''
+ ... <div style="background-image: url(http://old/image.gif)">text</div>
+ ... ''', relocate_href))
+ <div style="background-image: url(https://new/image.gif)">text</div>
+
+The ``<base href>`` tag is also respected (but also removed)::
+
+ >>> print(rewrite_links('''
+ ... <html><head>
+ ... <base href="http://old/">
+ ... </head>
+ ... <body>
+ ... <a href="foo.html">link</a>
+ ... </body></html>''', relocate_href))
+ <html>
+ <head></head>
+ <body>
+ <a href="https://new/foo.html">link</a>
+ </body>
+ </html>
+
+The ``iterlinks`` method (and function) gives you all the links in
+the document, along with the element and attribute each link comes
+from. This makes it fairly easy to see what resources the document
+references or embeds (an ``<a>`` tag is a reference, an ``<img>`` tag
+is something embedded). It returns a generator of ``(element, attrib,
+link, pos)`` tuples, which is awkward to test, so we'll make a printer::
+
+ >>> from lxml.html import iterlinks, document_fromstring, tostring
+ >>> def print_iter(seq):
+ ... for element, attrib, link, pos in seq:
+ ... if pos:
+ ... extra = '@%s' % pos
+ ... else:
+ ... extra = ''
+ ... print('%s %s="%s"%s' % (element.tag, attrib, link, extra))
+ >>> print_iter(iterlinks('''
+ ... <html>
+ ... <head>
+ ... <meta http-equiv="refresh" content="0;url=/redirect">
+ ... <meta http-equiv="refresh" content="10;url='/quoted_url'">
+ ... <link rel="stylesheet" href="style.css">
+ ... <style type="text/css">
+ ... body {
+ ... background-image: url(/bg.gif);
+ ... }
+ ... @import "/other-styles.css";
+ ... </style>
+ ... <script src="/js-funcs.js"></script>
+ ... </head>
+ ... <body>
+ ... <table>
+ ... <tr><td><ul>
+ ... <li><a href="/test.html">Test stuff</a></li>
+ ... <li><a href="/other.html">Other stuff</a></li>
+ ... </td></tr>
+ ... <td style="background-image: url(/td-bg.png)">
+ ... <img src="/logo.gif">
+ ... Hi world!
+ ... </td>
+ ... <td style="background-image: url('/quoted.png')">
+ ... </td></tr>
+ ... </table>
+ ... </body></html>'''))
+ meta content="/redirect"@6
+ meta content="/quoted_url"@8
+ link href="style.css"
+ style None="/other-styles.css"@69
+ style None="/bg.gif"@40
+ script src="/js-funcs.js"
+ a href="/test.html"
+ a href="/other.html"
+ td style="/td-bg.png"@22
+ img src="/logo.gif"
+ td style="/quoted.png"@23
+
+An application of ``iterlinks()`` is ``make_links_absolute()``::
+
+ >>> from lxml.html import make_links_absolute
+ >>> print(make_links_absolute('''
+ ... <html>
+ ... <head>
+ ... <meta http-equiv="refresh" content=" broken ">
+ ... <meta http-equiv="refresh" content="0; url = ">
+ ... <meta http-equiv="refresh" content="0;url=/redirect">
+ ... <meta http-equiv="refresh" content="5; url='/quoted_url'">
+ ... <meta http-equiv="refresh" content="10;url='http://example.com/absolute'">
+ ... <meta http-equiv="refresh" content="15; url=http://example.com/">
+ ... <link rel="stylesheet" href="style.css">
+ ... <style type="text/css">
+ ... body {
+ ... background-image: url(/bg.gif);
+ ... }
+ ... @import "/other-styles.css";
+ ... </style>
+ ... <script src="/js-funcs.js"></script>
+ ... </head>
+ ... <body>
+ ... <table>
+ ... <tr><td><ul>
+ ... <li><a href=" /test.html">Test stuff</a></li>
+ ... <li><a href="/other.html ">Other stuff</a></li>
+ ... </td></tr>
+ ... <tr><td style="background-image: url( /td-bg.png )">
+ ... <img src="logo.gif">
+ ... Hi world!
+ ... </td></tr>
+ ... </table>
+ ... </body></html>''',
+ ... base_url="http://my.little.server/url/"))
+ <html>
+ <head>
+ <meta http-equiv="refresh" content=" http://my.little.server/url/broken ">
+ <meta http-equiv="refresh" content="0; url = ">
+ <meta http-equiv="refresh" content="0;url=http://my.little.server/redirect">
+ <meta http-equiv="refresh" content="5; url='http://my.little.server/quoted_url'">
+ <meta http-equiv="refresh" content="10;url='http://example.com/absolute'">
+ <meta http-equiv="refresh" content="15; url=http://example.com/">
+ <link rel="stylesheet" href="http://my.little.server/url/style.css">
+ <style type="text/css">
+ body {
+ background-image: url(http://my.little.server/bg.gif);
+ }
+ @import "http://my.little.server/other-styles.css";
+ </style>
+ <script src="http://my.little.server/js-funcs.js"></script>
+ </head>
+ <body>
+ <table>
+ <tr><td><ul>
+ <li><a href="http://my.little.server/test.html">Test stuff</a></li>
+ <li><a href="http://my.little.server/other.html">Other stuff</a></li>
+ </ul></td></tr>
+ <tr>
+ <td style="background-image: url(http://my.little.server/td-bg.png)">
+ <img src="http://my.little.server/url/logo.gif">
+ Hi world!
+ </td></tr>
+ </table>
+ </body>
+ </html>
+
+### Test disabled to support Py2.6 and earlier
+#If the document contains invalid links, you may choose to "discard" or "ignore"
+#them by passing the respective option into the ``handle_failures`` argument::
+#
+# >>> html = lxml.html.fromstring ('''\
+# ... <html><body><div>
+# ... <a href="http://fancybase.com]Buy">test2</a>
+# ... </div></body></html>''')
+#
+# >>> html.make_links_absolute(base_url="http://my.little.server/url/",
+# ... handle_failures="discard")
+#
+# >>> print(lxml.html.tostring (html, pretty_print=True, encoding='unicode'))
+# <html><body><div>
+# <a>test2</a>
+# </div></body></html>
+
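
(For reference, on lxml versions that support the ``handle_failures``
argument, the disabled example above boils down to the following sketch;
it is not executed as part of this doctest.)::

    import lxml.html

    html = lxml.html.fromstring(
        '<html><body><div>'
        '<a href="http://fancybase.com]Buy">test2</a>'
        '</div></body></html>')
    html.make_links_absolute(base_url="http://my.little.server/url/",
                             handle_failures="discard")
    # the unparsable link is dropped, leaving a bare <a> element
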
+Check that we can replace multiple links inside the same text string::
+
+ >>> html = lxml.html.fromstring ("""\
+ ... <html>
+ ... <head>
+ ... <title>Test</title>
+ ... <style type='text/css'>
+ ... .bg1 {
+ ... background: url(images/bg1.png);
+ ... }
+ ... .bg2 {
+ ... background: url(images/bg2.png);
+ ... }
+ ... </style>
+ ... </head>
+ ... <body>
+ ... <p>Hi</p>
+ ... </body>
+ ... </html>
+ ... """,
+ ... base_url = 'http://www.example.com/')
+
+ >>> html.make_links_absolute ()
+
+ >>> print(lxml.html.tostring (html, pretty_print=True, encoding='unicode'))
+ <html>
+ <head>
+ <title>Test</title>
+ <style type="text/css">
+ .bg1 {
+ background: url(http://www.example.com/images/bg1.png);
+ }
+ .bg2 {
+ background: url(http://www.example.com/images/bg2.png);
+ }
+ </style>
+ </head>
+ <body>
+ <p>Hi</p>
+ </body>
+ </html>
diff --git a/src/lxml/html/tests/test_select.py b/src/lxml/html/tests/test_select.py
new file mode 100644
index 0000000..499ff7d
--- /dev/null
+++ b/src/lxml/html/tests/test_select.py
@@ -0,0 +1,47 @@
+import sys
+import unittest
+
+import lxml.html
+
+
+class SelectTest(unittest.TestCase):
+ @staticmethod
+ def _evaluate_select(options, multiple=False):
+ options = ''.join('<option' + (' selected="selected"' if selected else '') + '>' + option + '</option>'
+ for option, selected in options)
+ string = '<title>test</title><form><select%s>%s</select></form>' % \
+ (' multiple="multiple"' if multiple else '', options)
+ return lxml.html.fromstring(string).find('.//select').value
+
+ def test_single_select_value_no_options(self):
+ self.assertEqual(
+ self._evaluate_select([]),
+ None)
+
+ def test_single_select_value_no_selected_option(self):
+ # If no option is selected, the HTML5 specification requires the first option to get selected.
+ self.assertEqual(
+ self._evaluate_select([('a', False), ('b', False)]),
+ 'a')
+
+ def test_single_select_value_multiple_selected_options(self):
+ # If multiple options are selected, the proposed HTML 5.1 specification
+ # requires all but the last selected options to get deselected.
+ self.assertEqual(
+ self._evaluate_select([('a', True), ('b', True)]),
+ 'b')
+
+ def test_multiple_select_value_no_selected_option(self):
+ self.assertEqual(
+ self._evaluate_select([('a', False), ('b', False)], multiple=True),
+ set())
+
+ def test_multiple_select_value_multiple_selected_options(self):
+ self.assertEqual(
+ self._evaluate_select([('a', True), ('b', True)], multiple=True),
+ {'a', 'b'})
+
+
+def test_suite():
+ loader = unittest.TestLoader()
+ return loader.loadTestsFromModule(sys.modules[__name__])
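
For orientation, the behaviour tested above can also be seen directly on the
``.value`` property of a parsed ``<select>``; a small sketch (the markup is
invented for illustration)::

    from lxml import html

    page = html.fromstring(
        '<form><select multiple="multiple">'
        '<option selected="selected">a</option>'
        '<option>b</option>'
        '</select></form>')
    select = page.find('.//select')

    print(sorted(select.value))   # ['a'] -- only the selected options
    print(select.value_options)   # ['a', 'b'] -- all available options
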
diff --git a/src/lxml/html/tests/test_xhtml.py b/src/lxml/html/tests/test_xhtml.py
new file mode 100644
index 0000000..cc66170
--- /dev/null
+++ b/src/lxml/html/tests/test_xhtml.py
@@ -0,0 +1,10 @@
+import unittest
+from lxml.tests.common_imports import make_doctest
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([make_doctest('test_xhtml.txt')])
+ return suite
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/lxml/html/tests/test_xhtml.txt b/src/lxml/html/tests/test_xhtml.txt
new file mode 100644
index 0000000..db02210
--- /dev/null
+++ b/src/lxml/html/tests/test_xhtml.txt
@@ -0,0 +1,30 @@
+ >>> from lxml.html import document_fromstring, fragment_fromstring, tostring
+
+lxml.html has two parsers, one for HTML, one for XHTML:
+
+ >>> from lxml.html import HTMLParser, XHTMLParser
+ >>> html = "<html><body><p>Hi!</p></body></html>"
+
+ >>> root = document_fromstring(html, parser=HTMLParser())
+ >>> print(root.tag)
+ html
+
+ >>> root = document_fromstring(html, parser=XHTMLParser())
+ >>> print(root.tag)
+ html
+
+There are two functions for converting between HTML and XHTML:
+
+ >>> from lxml.html import xhtml_to_html, html_to_xhtml
+
+ >>> doc = document_fromstring(html, parser=HTMLParser())
+ >>> tostring(doc)
+ b'<html><body><p>Hi!</p></body></html>'
+
+ >>> html_to_xhtml(doc)
+ >>> tostring(doc)
+ b'<html:html xmlns:html="http://www.w3.org/1999/xhtml"><html:body><html:p>Hi!</html:p></html:body></html:html>'
+
+ >>> xhtml_to_html(doc)
+ >>> tostring(doc)
+ b'<html xmlns:html="http://www.w3.org/1999/xhtml"><body><p>Hi!</p></body></html>'
diff --git a/src/lxml/html/tests/transform_feedparser_data.py b/src/lxml/html/tests/transform_feedparser_data.py
new file mode 100644
index 0000000..38ced24
--- /dev/null
+++ b/src/lxml/html/tests/transform_feedparser_data.py
@@ -0,0 +1,109 @@
+"""
+This takes the feedparser tests from here:
+
+ http://feedparser.org/tests/wellformed/sanitize/
+
+and rewrites them to be easier to handle (not using the internal model
+of feedparser). The input format is::
+
+ <!--
+ Description: {description}
+ Expect: {expression}
+ -->
+ ...
+ <content ...>{content}</content>
+ ...
+
+The Expect expression is checked for
+``entries[0]['content'][0]['value'] == {data}``.
+
+The output format is::
+
+ Description: {description}
+ Expect: {expression} (if data couldn't be parsed)
+ Options:
+
+ {content, unescaped}
+ ----------
+ {data, unescaped, if found}
+
+"""
+
+import re
+import os
+import traceback
+
+_desc_re = re.compile(r'\s*Description:\s*(.*)')
+_expect_re = re.compile(r'\s*Expect:\s*(.*)')
+_data_expect_re = re.compile(r"entries\[0\]\['[^']+'\](?:\[0\]\['value'\])?\s*==\s*(.*)")
+_feed_data_expect_re = re.compile(r"feed\['[^']+'\]\s*==\s*(.*)")
+
+def parse_content(content):
+ match = _desc_re.search(content)
+ desc = match.group(1)
+ match = _expect_re.search(content)
+ expect = match.group(1)
+ data = None
+ for regex in [_data_expect_re, _feed_data_expect_re]:
+ match = regex.search(expect)
+ if match:
+ # Icky, but I'll trust it
+ data = eval(match.group(1).strip())
+ break
+ c = None
+ for tag in ['content', 'summary', 'title', 'copyright', 'tagline', 'info', 'subtitle', 'fullitem', 'body', 'description', 'content:encoded']:
+ regex = re.compile(r"<%s.*?>(.*)</%s>" % (tag, tag), re.S)
+ match = regex.search(content)
+ if match:
+ c = match.group(1)
+ break
+ assert c is not None
+ # Seems like body isn't quoted
+ if tag != 'body':
+ c = c.replace('&lt;', '<')
+ c = c.replace('&amp;', '&')
+ # FIXME: I should really do more unescaping...
+ return {
+ 'Description': desc,
+ 'Expect': expect,
+ 'data': data,
+ 'content': c}
+
+def serialize_content(d):
+ s = '''\
+Description: %(Description)s
+Expect: %(Expect)s
+Options:
+
+%(content)s
+''' % d
+ if d.get('data') is not None:
+ s += '----------\n%s' % d['data']
+ return s
+
+def translate_file(filename):
+ f = open(filename, 'rb')
+ c = f.read()
+ f.close()
+ try:
+ output = serialize_content(parse_content(c))
+ except:
+ print('Bad data in %s:' % filename)
+ print(c)
+ traceback.print_exc()
+ print('-'*60)
+ return
+ new = os.path.splitext(filename)[0] + '.data'
+ f = open(new, 'wb')
+ f.write(output)
+ f.close()
+
+def translate_all(dir):
+ for fn in os.listdir(dir):
+ fn = os.path.join(dir, fn)
+ if fn.endswith('.xml'):
+ translate_file(fn)
+
+if __name__ == '__main__':
+ translate_all(os.path.join(os.path.dirname(__file__), 'feedparser-data'))
+
diff --git a/src/lxml/html/usedoctest.py b/src/lxml/html/usedoctest.py
new file mode 100644
index 0000000..f352a1c
--- /dev/null
+++ b/src/lxml/html/usedoctest.py
@@ -0,0 +1,13 @@
+"""Doctest module for HTML comparison.
+
+Usage::
+
+ >>> import lxml.html.usedoctest
+ >>> # now do your HTML doctests ...
+
+See `lxml.doctestcompare`.
+"""
+
+from lxml import doctestcompare
+
+doctestcompare.temp_install(html=True, del_module=__name__)
diff --git a/src/lxml/includes/__init__.pxd b/src/lxml/includes/__init__.pxd
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/lxml/includes/__init__.pxd
diff --git a/src/lxml/includes/__init__.py b/src/lxml/includes/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/lxml/includes/__init__.py
diff --git a/src/lxml/includes/c14n.pxd b/src/lxml/includes/c14n.pxd
new file mode 100644
index 0000000..d075e90
--- /dev/null
+++ b/src/lxml/includes/c14n.pxd
@@ -0,0 +1,26 @@
+from lxml.includes.tree cimport xmlDoc, xmlOutputBuffer, xmlChar
+from lxml.includes.xpath cimport xmlNodeSet
+
+cdef extern from "libxml/c14n.h":
+ cdef int xmlC14NDocDumpMemory(xmlDoc* doc,
+ xmlNodeSet* nodes,
+ int exclusive,
+ xmlChar** inclusive_ns_prefixes,
+ int with_comments,
+ xmlChar** doc_txt_ptr) nogil
+
+ cdef int xmlC14NDocSave(xmlDoc* doc,
+ xmlNodeSet* nodes,
+ int exclusive,
+ xmlChar** inclusive_ns_prefixes,
+ int with_comments,
+ char* filename,
+ int compression) nogil
+
+ cdef int xmlC14NDocSaveTo(xmlDoc* doc,
+ xmlNodeSet* nodes,
+ int exclusive,
+ xmlChar** inclusive_ns_prefixes,
+ int with_comments,
+ xmlOutputBuffer* buffer) nogil
+
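
These declarations back lxml's C14N (canonical XML serialisation) support.
At the Python level the same functionality is reachable through the public
API; a rough sketch (the parameter values are illustrative only)::

    from io import BytesIO
    from lxml import etree

    tree = etree.parse(BytesIO(b'<root xmlns="urn:x"><a>text</a></root>'))

    # Canonicalise into a byte string ...
    c14n = etree.tostring(tree, method='c14n', exclusive=True,
                          with_comments=False)

    # ... or write the canonical form to a file-like object.
    out = BytesIO()
    tree.write_c14n(out, exclusive=True, with_comments=False)
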
diff --git a/src/lxml/includes/config.pxd b/src/lxml/includes/config.pxd
new file mode 100644
index 0000000..9c04438
--- /dev/null
+++ b/src/lxml/includes/config.pxd
@@ -0,0 +1,3 @@
+cdef extern from "etree_defs.h":
+ cdef bint ENABLE_THREADING
+ cdef bint ENABLE_SCHEMATRON
diff --git a/src/lxml/includes/dtdvalid.pxd b/src/lxml/includes/dtdvalid.pxd
new file mode 100644
index 0000000..ae94dc6
--- /dev/null
+++ b/src/lxml/includes/dtdvalid.pxd
@@ -0,0 +1,18 @@
+from lxml.includes cimport tree
+from lxml.includes.tree cimport xmlDoc, xmlDtd
+
+cdef extern from "libxml/valid.h" nogil:
+ ctypedef void (*xmlValidityErrorFunc)(void * ctx, const char * msg, ...)
+ ctypedef void (*xmlValidityWarningFunc)(void * ctx, const char * msg, ...)
+
+ ctypedef struct xmlValidCtxt:
+ void *userData
+ xmlValidityErrorFunc error
+ xmlValidityWarningFunc warning
+
+ cdef xmlValidCtxt* xmlNewValidCtxt()
+ cdef void xmlFreeValidCtxt(xmlValidCtxt* cur)
+
+ cdef int xmlValidateDtd(xmlValidCtxt* ctxt, xmlDoc* doc, xmlDtd* dtd)
+ cdef tree.xmlElement* xmlGetDtdElementDesc(
+ xmlDtd* dtd, tree.const_xmlChar* name)
diff --git a/src/lxml/includes/etree_defs.h b/src/lxml/includes/etree_defs.h
new file mode 100644
index 0000000..20d4b9d
--- /dev/null
+++ b/src/lxml/includes/etree_defs.h
@@ -0,0 +1,418 @@
+#ifndef HAS_ETREE_DEFS_H
+#define HAS_ETREE_DEFS_H
+
+/* quick check for Python/libxml2/libxslt devel setup */
+#include "Python.h"
+#ifndef PY_VERSION_HEX
+# error the development package of Python (header files etc.) is not installed correctly
+#else
+# if PY_VERSION_HEX < 0x02070000 || PY_MAJOR_VERSION >= 3 && PY_VERSION_HEX < 0x03050000
+# error this version of lxml requires Python 2.7, 3.5 or later
+# endif
+#endif
+
+#include "libxml/xmlversion.h"
+#ifndef LIBXML_VERSION
+# error the development package of libxml2 (header files etc.) is not installed correctly
+#else
+#if LIBXML_VERSION < 20700
+# error minimum required version of libxml2 is 2.7.0
+#endif
+#endif
+
+#include "libxslt/xsltconfig.h"
+#ifndef LIBXSLT_VERSION
+# error the development package of libxslt (header files etc.) is not installed correctly
+#else
+#if LIBXSLT_VERSION < 10123
+# error minimum required version of libxslt is 1.1.23
+#endif
+#endif
+
+
+/* v_arg functions */
+#define va_int(ap) va_arg(ap, int)
+#define va_charptr(ap) va_arg(ap, char *)
+
+#ifdef PYPY_VERSION
+# define IS_PYPY 1
+#else
+# define IS_PYPY 0
+#endif
+
+#if PY_MAJOR_VERSION >= 3
+# define IS_PYTHON2 0 /* prefer for special casing Python 2.x */
+# define IS_PYTHON3 1 /* avoid */
+#else
+# define IS_PYTHON2 1
+# define IS_PYTHON3 0
+#endif
+
+#if IS_PYTHON2
+#ifndef LXML_UNICODE_STRINGS
+#define LXML_UNICODE_STRINGS 0
+#endif
+#else
+#undef LXML_UNICODE_STRINGS
+#define LXML_UNICODE_STRINGS 1
+#endif
+
+#if !IS_PYPY
+# define PyWeakref_LockObject(obj) (NULL)
+#endif
+
+/* Threading is not currently supported by PyPy */
+#if IS_PYPY
+# ifndef WITHOUT_THREADING
+# define WITHOUT_THREADING
+# endif
+#endif
+
+#if IS_PYPY
+# undef PyFile_AsFile
+# define PyFile_AsFile(o) (NULL)
+# undef PyByteArray_Check
+# define PyByteArray_Check(o) (0)
+#elif !IS_PYTHON2
+ /* Python 3+ doesn't have PyFile_*() anymore */
+# define PyFile_AsFile(o) (NULL)
+#endif
+
+#if PY_VERSION_HEX <= 0x03030000 && !(defined(CYTHON_PEP393_ENABLED) && CYTHON_PEP393_ENABLED)
+ #define PyUnicode_IS_READY(op) (0)
+ #define PyUnicode_GET_LENGTH(u) PyUnicode_GET_SIZE(u)
+ #define PyUnicode_KIND(u) (sizeof(Py_UNICODE))
+ #define PyUnicode_DATA(u) ((void*)PyUnicode_AS_UNICODE(u))
+#endif
+
+#if IS_PYPY
+# ifndef PyUnicode_FromFormat
+# define PyUnicode_FromFormat PyString_FromFormat
+# endif
+# if !IS_PYTHON2 && !defined(PyBytes_FromFormat)
+# ifdef PyString_FromFormat
+# define PyBytes_FromFormat PyString_FromFormat
+# else
+#include <stdarg.h>
+static PyObject* PyBytes_FromFormat(const char* format, ...) {
+ PyObject *string;
+ va_list vargs;
+#ifdef HAVE_STDARG_PROTOTYPES
+ va_start(vargs, format);
+#else
+ va_start(vargs);
+#endif
+ string = PyUnicode_FromFormatV(format, vargs);
+ va_end(vargs);
+ if (string && PyUnicode_Check(string)) {
+ PyObject *bstring = PyUnicode_AsUTF8String(string);
+ Py_DECREF(string);
+ string = bstring;
+ }
+ if (string && !PyBytes_CheckExact(string)) {
+ Py_DECREF(string);
+ string = NULL;
+ PyErr_SetString(PyExc_TypeError, "String formatting and encoding failed to return bytes object");
+ }
+ return string;
+}
+# endif
+# endif
+#endif
+
+/* PySlice_GetIndicesEx() has wrong signature in Py<=3.1 */
+#if PY_VERSION_HEX >= 0x03020000
+# define _lx_PySlice_GetIndicesEx(o, l, b, e, s, sl) PySlice_GetIndicesEx(o, l, b, e, s, sl)
+#else
+# define _lx_PySlice_GetIndicesEx(o, l, b, e, s, sl) PySlice_GetIndicesEx(((PySliceObject*)o), l, b, e, s, sl)
+#endif
+
+#ifdef WITHOUT_THREADING
+# undef PyEval_SaveThread
+# define PyEval_SaveThread() (NULL)
+# undef PyEval_RestoreThread
+# define PyEval_RestoreThread(state) if (state); else {}
+# undef PyGILState_Ensure
+# define PyGILState_Ensure() (PyGILState_UNLOCKED)
+# undef PyGILState_Release
+# define PyGILState_Release(state) if (state); else {}
+# undef Py_UNBLOCK_THREADS
+# define Py_UNBLOCK_THREADS _save = NULL;
+# undef Py_BLOCK_THREADS
+# define Py_BLOCK_THREADS if (_save); else {}
+#endif
+
+#ifdef WITHOUT_THREADING
+# define ENABLE_THREADING 0
+#else
+# define ENABLE_THREADING 1
+#endif
+
+#if LIBXML_VERSION < 20704
+/* FIXME: hack to make new error reporting compile in old libxml2 versions */
+# define xmlStructuredErrorContext NULL
+# define xmlXIncludeProcessTreeFlagsData(n,o,d) xmlXIncludeProcessTreeFlags(n,o)
+#endif
+
+/* schematron was added in libxml2 2.6.21 */
+#ifdef LIBXML_SCHEMATRON_ENABLED
+# define ENABLE_SCHEMATRON 1
+#else
+# define ENABLE_SCHEMATRON 0
+# define XML_SCHEMATRON_OUT_QUIET 0
+# define XML_SCHEMATRON_OUT_XML 0
+# define XML_SCHEMATRON_OUT_ERROR 0
+ typedef void xmlSchematron;
+ typedef void xmlSchematronParserCtxt;
+ typedef void xmlSchematronValidCtxt;
+# define xmlSchematronNewDocParserCtxt(doc) NULL
+# define xmlSchematronNewParserCtxt(file) NULL
+# define xmlSchematronParse(ctxt) NULL
+# define xmlSchematronFreeParserCtxt(ctxt)
+# define xmlSchematronFree(schema)
+# define xmlSchematronNewValidCtxt(schema, options) NULL
+# define xmlSchematronValidateDoc(ctxt, doc) 0
+# define xmlSchematronFreeValidCtxt(ctxt)
+# define xmlSchematronSetValidStructuredErrors(ctxt, errorfunc, data)
+#endif
+
+#if LIBXML_VERSION < 20708
+# define HTML_PARSE_NODEFDTD 4
+#endif
+#if LIBXML_VERSION < 20900
+# define XML_PARSE_BIG_LINES 4194304
+#endif
+
+#include "libxml/tree.h"
+#ifndef LIBXML2_NEW_BUFFER
+ typedef xmlBuffer xmlBuf;
+# define xmlBufContent(buf) xmlBufferContent(buf)
+# define xmlBufUse(buf) xmlBufferLength(buf)
+#endif
+
+/* libexslt 1.1.25+ supports EXSLT functions in XPath */
+#if LIBXSLT_VERSION < 10125
+#define exsltDateXpathCtxtRegister(ctxt, prefix)
+#define exsltSetsXpathCtxtRegister(ctxt, prefix)
+#define exsltMathXpathCtxtRegister(ctxt, prefix)
+#define exsltStrXpathCtxtRegister(ctxt, prefix)
+#endif
+
+#define LXML_GET_XSLT_ENCODING(result_var, style) XSLT_GET_IMPORT_PTR(result_var, style, encoding)
+
+/* work around MSDEV 6.0 */
+#if (_MSC_VER == 1200) && (WINVER < 0x0500)
+long _ftol( double ); //defined by VC6 C libs
+long _ftol2( double dblSource ) { return _ftol( dblSource ); }
+#endif
+
+#ifdef __GNUC__
+/* Test for GCC > 2.95 */
+#if __GNUC__ > 2 || (__GNUC__ == 2 && (__GNUC_MINOR__ > 95))
+#define unlikely_condition(x) __builtin_expect((x), 0)
+#else /* __GNUC__ > 2 ... */
+#define unlikely_condition(x) (x)
+#endif /* __GNUC__ > 2 ... */
+#else /* __GNUC__ */
+#define unlikely_condition(x) (x)
+#endif /* __GNUC__ */
+
+#ifndef Py_TYPE
+ #define Py_TYPE(ob) (((PyObject*)(ob))->ob_type)
+#endif
+
+#define PY_NEW(T) \
+ (((PyTypeObject*)(T))->tp_new( \
+ (PyTypeObject*)(T), __pyx_empty_tuple, NULL))
+
+#define _fqtypename(o) ((Py_TYPE(o))->tp_name)
+
+#define lxml_malloc(count, item_size) \
+ (unlikely_condition((size_t)(count) > (size_t) (PY_SSIZE_T_MAX / item_size)) ? NULL : \
+ (PyMem_Malloc((count) * item_size)))
+
+#define lxml_realloc(mem, count, item_size) \
+ (unlikely_condition((size_t)(count) > (size_t) (PY_SSIZE_T_MAX / item_size)) ? NULL : \
+ (PyMem_Realloc(mem, (count) * item_size)))
+
+#define lxml_free(mem) PyMem_Free(mem)
+
+#if PY_MAJOR_VERSION < 3
+#define _isString(obj) (PyString_CheckExact(obj) || \
+ PyUnicode_CheckExact(obj) || \
+ PyType_IsSubtype(Py_TYPE(obj), &PyBaseString_Type))
+#else
+/* builtin subtype type checks are almost as fast as exact checks in Py2.7+
+ * and Unicode is more common in Py3 */
+#define _isString(obj) (PyUnicode_Check(obj) || PyBytes_Check(obj))
+#endif
+
+#define _isElement(c_node) \
+ (((c_node)->type == XML_ELEMENT_NODE) || \
+ ((c_node)->type == XML_COMMENT_NODE) || \
+ ((c_node)->type == XML_ENTITY_REF_NODE) || \
+ ((c_node)->type == XML_PI_NODE))
+
+#define _isElementOrXInclude(c_node) \
+ (_isElement(c_node) || \
+ ((c_node)->type == XML_XINCLUDE_START) || \
+ ((c_node)->type == XML_XINCLUDE_END))
+
+#define _getNs(c_node) \
+ (((c_node)->ns == 0) ? 0 : ((c_node)->ns->href))
+
+
+#include "string.h"
+static void* lxml_unpack_xmldoc_capsule(PyObject* capsule, int* is_owned) {
+ xmlDoc *c_doc;
+ void *context;
+ *is_owned = 0;
+ if (unlikely_condition(!PyCapsule_IsValid(capsule, (const char*)"libxml2:xmlDoc"))) {
+ PyErr_SetString(
+ PyExc_TypeError,
+ "Not a valid capsule. The capsule argument must be a capsule object with name libxml2:xmlDoc");
+ return NULL;
+ }
+ c_doc = (xmlDoc*) PyCapsule_GetPointer(capsule, (const char*)"libxml2:xmlDoc");
+ if (unlikely_condition(!c_doc)) return NULL;
+
+ if (unlikely_condition(c_doc->type != XML_DOCUMENT_NODE && c_doc->type != XML_HTML_DOCUMENT_NODE)) {
+ PyErr_Format(
+ PyExc_ValueError,
+ "Illegal document provided: expected XML or HTML, found %d", (int)c_doc->type);
+ return NULL;
+ }
+
+ context = PyCapsule_GetContext(capsule);
+ if (unlikely_condition(!context && PyErr_Occurred())) return NULL;
+ if (context && strcmp((const char*) context, "destructor:xmlFreeDoc") == 0) {
+ /* take ownership by setting destructor to NULL */
+ if (PyCapsule_SetDestructor(capsule, NULL) == 0) {
+ /* ownership transferred => invalidate capsule by clearing its name */
+ if (unlikely_condition(PyCapsule_SetName(capsule, NULL))) {
+ /* this should never happen since everything above succeeded */
+ xmlFreeDoc(c_doc);
+ return NULL;
+ }
+ *is_owned = 1;
+ }
+ }
+ return c_doc;
+}
+
+/* Macro pair implementation of a depth first tree walker
+ *
+ * Calls the code block between the BEGIN and END macros for all elements
+ * below c_tree_top (exclusively), starting at c_node (inclusively iff
+ * 'inclusive' is 1). The _ELEMENT_ variants will only stop on nodes
+ * that match _isElement(), the normal variant will stop on every node
+ * except text nodes.
+ *
+ * To traverse the node and all of its children and siblings in Pyrex, call
+ * cdef xmlNode* some_node
+ * BEGIN_FOR_EACH_ELEMENT_FROM(some_node.parent, some_node, 1)
+ * # do something with some_node
+ * END_FOR_EACH_ELEMENT_FROM(some_node)
+ *
+ * To traverse only the children and siblings of a node, call
+ * cdef xmlNode* some_node
+ * BEGIN_FOR_EACH_ELEMENT_FROM(some_node.parent, some_node, 0)
+ * # do something with some_node
+ * END_FOR_EACH_ELEMENT_FROM(some_node)
+ *
+ * To traverse only the children, do:
+ * cdef xmlNode* some_node
+ * some_node = parent_node.children
+ * BEGIN_FOR_EACH_ELEMENT_FROM(parent_node, some_node, 1)
+ * # do something with some_node
+ * END_FOR_EACH_ELEMENT_FROM(some_node)
+ *
+ * NOTE: 'some_node' MUST be a plain 'xmlNode*' !
+ *
+ * NOTE: parent modification during the walk can divert the iterator, but
+ * should not segfault !
+ */
+
+#define _LX__ELEMENT_MATCH(c_node, only_elements) \
+ ((only_elements) ? (_isElement(c_node)) : 1)
+
+#define _LX__ADVANCE_TO_NEXT(c_node, only_elements) \
+ while ((c_node != 0) && (!_LX__ELEMENT_MATCH(c_node, only_elements))) \
+ c_node = c_node->next;
+
+#define _LX__TRAVERSE_TO_NEXT(c_stop_node, c_node, only_elements) \
+{ \
+ /* walk through children first */ \
+ xmlNode* _lx__next = c_node->children; \
+ if (_lx__next != 0) { \
+ if (c_node->type == XML_ENTITY_REF_NODE || c_node->type == XML_DTD_NODE) { \
+ _lx__next = 0; \
+ } else { \
+ _LX__ADVANCE_TO_NEXT(_lx__next, only_elements) \
+ } \
+ } \
+ if ((_lx__next == 0) && (c_node != c_stop_node)) { \
+ /* try siblings */ \
+ _lx__next = c_node->next; \
+ _LX__ADVANCE_TO_NEXT(_lx__next, only_elements) \
+ /* back off through parents */ \
+ while (_lx__next == 0) { \
+ c_node = c_node->parent; \
+ if (c_node == 0) \
+ break; \
+ if (c_node == c_stop_node) \
+ break; \
+ if ((only_elements) && !_isElement(c_node)) \
+ break; \
+ /* we already traversed the parents -> siblings */ \
+ _lx__next = c_node->next; \
+ _LX__ADVANCE_TO_NEXT(_lx__next, only_elements) \
+ } \
+ } \
+ c_node = _lx__next; \
+}
+
+#define _LX__BEGIN_FOR_EACH_FROM(c_tree_top, c_node, inclusive, only_elements) \
+{ \
+ if (c_node != 0) { \
+ const xmlNode* _lx__tree_top = (c_tree_top); \
+ const int _lx__only_elements = (only_elements); \
+ /* make sure we start at an element */ \
+ if (!_LX__ELEMENT_MATCH(c_node, _lx__only_elements)) { \
+ /* we skip the node, so 'inclusive' is irrelevant */ \
+ if (c_node == _lx__tree_top) \
+ c_node = 0; /* nothing to traverse */ \
+ else { \
+ c_node = c_node->next; \
+ _LX__ADVANCE_TO_NEXT(c_node, _lx__only_elements) \
+ } \
+ } else if (! (inclusive)) { \
+ /* skip the first node */ \
+ _LX__TRAVERSE_TO_NEXT(_lx__tree_top, c_node, _lx__only_elements) \
+ } \
+ \
+ /* now run the user code on the elements we find */ \
+ while (c_node != 0) { \
+ /* here goes the code to be run for each element */
+
+#define _LX__END_FOR_EACH_FROM(c_node) \
+ _LX__TRAVERSE_TO_NEXT(_lx__tree_top, c_node, _lx__only_elements) \
+ } \
+ } \
+}
+
+
+#define BEGIN_FOR_EACH_ELEMENT_FROM(c_tree_top, c_node, inclusive) \
+ _LX__BEGIN_FOR_EACH_FROM(c_tree_top, c_node, inclusive, 1)
+
+#define END_FOR_EACH_ELEMENT_FROM(c_node) \
+ _LX__END_FOR_EACH_FROM(c_node)
+
+#define BEGIN_FOR_EACH_FROM(c_tree_top, c_node, inclusive) \
+ _LX__BEGIN_FOR_EACH_FROM(c_tree_top, c_node, inclusive, 0)
+
+#define END_FOR_EACH_FROM(c_node) \
+ _LX__END_FOR_EACH_FROM(c_node)
+
+
+#endif /* HAS_ETREE_DEFS_H */
diff --git a/src/lxml/includes/etreepublic.pxd b/src/lxml/includes/etreepublic.pxd
new file mode 100644
index 0000000..94fe2e8
--- /dev/null
+++ b/src/lxml/includes/etreepublic.pxd
@@ -0,0 +1,237 @@
+# public Cython/C interface to lxml.etree
+
+from lxml.includes cimport tree
+from lxml.includes.tree cimport const_xmlChar
+
+cdef extern from "lxml-version.h":
+ cdef char* LXML_VERSION_STRING
+
+cdef extern from "etree_defs.h":
+ # test if c_node is considered an Element (i.e. Element, Comment, etc.)
+ cdef bint _isElement(tree.xmlNode* c_node) nogil
+
+ # return the namespace URI of the node or NULL
+ cdef const_xmlChar* _getNs(tree.xmlNode* node) nogil
+
+ # pair of macros for tree traversal
+ cdef void BEGIN_FOR_EACH_ELEMENT_FROM(tree.xmlNode* tree_top,
+ tree.xmlNode* start_node,
+ int start_node_inclusive) nogil
+ cdef void END_FOR_EACH_ELEMENT_FROM(tree.xmlNode* start_node) nogil
+
+cdef extern from "etree_api.h":
+
+ # first function to call!
+ cdef int import_lxml__etree() except -1
+
+ ##########################################################################
+ # public ElementTree API classes
+
+ cdef class lxml.etree._Document [ object LxmlDocument ]:
+ cdef tree.xmlDoc* _c_doc
+
+ cdef class lxml.etree._Element [ object LxmlElement ]:
+ cdef _Document _doc
+ cdef tree.xmlNode* _c_node
+
+ cdef class lxml.etree.ElementBase(_Element) [ object LxmlElementBase ]:
+ pass
+
+ cdef class lxml.etree._ElementTree [ object LxmlElementTree ]:
+ cdef _Document _doc
+ cdef _Element _context_node
+
+ cdef class lxml.etree.ElementClassLookup [ object LxmlElementClassLookup ]:
+ cdef object (*_lookup_function)(object, _Document, tree.xmlNode*)
+
+ cdef class lxml.etree.FallbackElementClassLookup(ElementClassLookup) \
+ [ object LxmlFallbackElementClassLookup ]:
+ cdef ElementClassLookup fallback
+ cdef object (*_fallback_function)(object, _Document, tree.xmlNode*)
+
+ ##########################################################################
+ # creating Element objects
+
+ # create an Element for a C-node in the Document
+ cdef _Element elementFactory(_Document doc, tree.xmlNode* c_node)
+
+ # create an ElementTree for an Element
+ cdef _ElementTree elementTreeFactory(_Element context_node)
+
+ # create an ElementTree subclass for an Element
+ cdef _ElementTree newElementTree(_Element context_node, object subclass)
+
+ # create an ElementTree from an external document
+ cdef _ElementTree adoptExternalDocument(tree.xmlDoc* c_doc, parser, bint is_owned)
+
+ # create a new Element for an existing or new document (doc = None)
+ # builds Python object after setting text, tail, namespaces and attributes
+ cdef _Element makeElement(tag, _Document doc, parser,
+ text, tail, attrib, nsmap)
+
+ # create a new SubElement for an existing parent
+ # builds Python object after setting text, tail, namespaces and attributes
+ cdef _Element makeSubElement(_Element parent, tag, text, tail,
+ attrib, nsmap)
+
+ # deep copy a node to include it in the Document
+ cdef _Element deepcopyNodeToDocument(_Document doc, tree.xmlNode* c_root)
+
+ # set the internal lookup function for Element/Comment/PI classes
+ # use setElementClassLookupFunction(NULL, None) to reset it
+ # note that the lookup function *must always* return an _Element subclass!
+ cdef void setElementClassLookupFunction(
+ object (*function)(object, _Document, tree.xmlNode*), object state)
+
+ # lookup function that always returns the default Element class
+ # note that the first argument is expected to be None!
+ cdef object lookupDefaultElementClass(_1, _Document _2,
+ tree.xmlNode* c_node)
+
+ # lookup function for namespace/tag specific Element classes
+ # note that the first argument is expected to be None!
+ cdef object lookupNamespaceElementClass(_1, _Document _2,
+ tree.xmlNode* c_node)
+
+ # call the fallback lookup function of a FallbackElementClassLookup
+ cdef object callLookupFallback(FallbackElementClassLookup lookup,
+ _Document doc, tree.xmlNode* c_node)
+
+ ##########################################################################
+ # XML attribute access
+
+ # return an attribute value for a C attribute on a C element node
+ cdef object attributeValue(tree.xmlNode* c_element,
+ tree.xmlAttr* c_attrib_node)
+
+ # return the value of the attribute with 'ns' and 'name' (or None)
+ cdef object attributeValueFromNsName(tree.xmlNode* c_element,
+ const_xmlChar* c_ns, const_xmlChar* c_name)
+
+ # return the value of attribute "{ns}name", or the default value
+ cdef object getAttributeValue(_Element element, key, default)
+
+ # return an iterator over attribute names (1), values (2) or items (3)
+ # attributes must not be removed during iteration!
+ cdef object iterattributes(_Element element, int keysvalues)
+
+ # return the list of all attribute names (1), values (2) or items (3)
+ cdef list collectAttributes(tree.xmlNode* c_element, int keysvalues)
+
+ # set an attribute value on an element
+ # on failure, sets an exception and returns -1
+ cdef int setAttributeValue(_Element element, key, value) except -1
+
+ # delete an attribute
+ # on failure, sets an exception and returns -1
+ cdef int delAttribute(_Element element, key) except -1
+
+ # delete an attribute based on name and namespace URI
+ # returns -1 if the attribute was not found (no exception)
+ cdef int delAttributeFromNsName(tree.xmlNode* c_element,
+ const_xmlChar* c_href, const_xmlChar* c_name)
+
+ ##########################################################################
+ # XML node helper functions
+
+ # check if the element has at least one child
+ cdef bint hasChild(tree.xmlNode* c_node) nogil
+
+ # find child element number 'index' (supports negative indexes)
+ cdef tree.xmlNode* findChild(tree.xmlNode* c_node,
+ Py_ssize_t index) nogil
+
+ # find child element number 'index' starting at first one
+ cdef tree.xmlNode* findChildForwards(tree.xmlNode* c_node,
+ Py_ssize_t index) nogil
+
+ # find child element number 'index' starting at last one
+ cdef tree.xmlNode* findChildBackwards(tree.xmlNode* c_node,
+ Py_ssize_t index) nogil
+
+ # return next/previous sibling element of the node
+ cdef tree.xmlNode* nextElement(tree.xmlNode* c_node) nogil
+ cdef tree.xmlNode* previousElement(tree.xmlNode* c_node) nogil
+
+ ##########################################################################
+ # iterators (DEPRECATED API, don't use in new code!)
+
+ cdef class lxml.etree._ElementTagMatcher [ object LxmlElementTagMatcher ]:
+ cdef char* _href
+ cdef char* _name
+
+ # store "{ns}tag" (or None) filter for this matcher or element iterator
+ # ** unless _href *and* _name are set up 'by hand', this function *must*
+ # ** be called when subclassing the iterator below!
+ cdef void initTagMatch(_ElementTagMatcher matcher, tag)
+
+ cdef class lxml.etree._ElementIterator(_ElementTagMatcher) [
+ object LxmlElementIterator ]:
+ cdef _Element _node
+ cdef tree.xmlNode* (*_next_element)(tree.xmlNode*)
+
+ # store the initial node of the iterator if it matches the required tag
+ # or its next matching sibling if not
+ cdef void iteratorStoreNext(_ElementIterator iterator, _Element node)
+
+ ##########################################################################
+ # other helper functions
+
+ # check if a C node matches a tag name and namespace
+ # (NULL allowed for each => always matches)
+ cdef int tagMatches(tree.xmlNode* c_node, const_xmlChar* c_href, const_xmlChar* c_name)
+
+ # convert a UTF-8 char* to a Python string or unicode string
+ cdef object pyunicode(const_xmlChar* s)
+
+ # convert the string to UTF-8 using the normal lxml.etree semantics
+ cdef bytes utf8(object s)
+
+ # split a tag into a (URI, name) tuple, return None as URI for '{}tag'
+ cdef tuple getNsTag(object tag)
+
+ # split a tag into a (URI, name) tuple, return b'' as URI for '{}tag'
+ cdef tuple getNsTagWithEmptyNs(object tag)
+
+ # get the "{ns}tag" string for a C node
+ cdef object namespacedName(tree.xmlNode* c_node)
+
+ # get the "{ns}tag" string for a href/tagname pair (c_ns may be NULL)
+ cdef object namespacedNameFromNsName(const_xmlChar* c_ns, const_xmlChar* c_tag)
+
+ # check if the node has a text value (which may be '')
+ cdef bint hasText(tree.xmlNode* c_node) nogil
+
+ # check if the node has a tail value (which may be '')
+ cdef bint hasTail(tree.xmlNode* c_node) nogil
+
+ # get the text content of an element (or None)
+ cdef object textOf(tree.xmlNode* c_node)
+
+ # get the tail content of an element (or None)
+ cdef object tailOf(tree.xmlNode* c_node)
+
+ # set the text value of an element
+ cdef int setNodeText(tree.xmlNode* c_node, text) except -1
+
+ # set the tail text value of an element
+ cdef int setTailText(tree.xmlNode* c_node, text) except -1
+
+ # append an element to the children of a parent element
+ # deprecated: don't use, does not propagate exceptions!
+ # use appendChildToElement() instead
+ cdef void appendChild(_Element parent, _Element child)
+
+ # added in lxml 3.3 as a safe replacement for appendChild()
+ # return -1 for exception, 0 for ok
+ cdef int appendChildToElement(_Element parent, _Element child) except -1
+
+ # recursively lookup a namespace in element or ancestors, or create it
+ cdef tree.xmlNs* findOrBuildNodeNsPrefix(
+ _Document doc, tree.xmlNode* c_node, const_xmlChar* href, const_xmlChar* prefix)
+
+ # find the Document of an Element, ElementTree or Document (itself!)
+ cdef _Document documentOrRaise(object input)
+
+ # find the root Element of an Element (itself!), ElementTree or Document
+ cdef _Element rootNodeOrRaise(object input)
diff --git a/src/lxml/includes/htmlparser.pxd b/src/lxml/includes/htmlparser.pxd
new file mode 100644
index 0000000..145a69a
--- /dev/null
+++ b/src/lxml/includes/htmlparser.pxd
@@ -0,0 +1,56 @@
+from libc.string cimport const_char
+
+from lxml.includes.tree cimport xmlDoc
+from lxml.includes.tree cimport xmlInputReadCallback, xmlInputCloseCallback
+from lxml.includes.xmlparser cimport xmlParserCtxt, xmlSAXHandler, xmlSAXHandlerV1
+
+cdef extern from "libxml/HTMLparser.h":
+ ctypedef enum htmlParserOption:
+ HTML_PARSE_NOERROR # suppress error reports
+ HTML_PARSE_NOWARNING # suppress warning reports
+ HTML_PARSE_PEDANTIC # pedantic error reporting
+ HTML_PARSE_NOBLANKS # remove blank nodes
+ HTML_PARSE_NONET # Forbid network access
+ # libxml2 2.6.21+ only:
+ HTML_PARSE_RECOVER # Relaxed parsing
+ HTML_PARSE_COMPACT # compact small text nodes
+ # libxml2 2.7.7+ only:
+ HTML_PARSE_NOIMPLIED # Do not add implied html/body... elements
+ # libxml2 2.7.8+ only:
+ HTML_PARSE_NODEFDTD # do not default a doctype if not found
+ # libxml2 2.8.0+ only:
+ XML_PARSE_IGNORE_ENC # ignore internal document encoding hint
+
+ xmlSAXHandlerV1 htmlDefaultSAXHandler
+
+ cdef xmlParserCtxt* htmlCreateMemoryParserCtxt(
+ char* buffer, int size) nogil
+ cdef xmlParserCtxt* htmlCreateFileParserCtxt(
+ char* filename, char* encoding) nogil
+ cdef xmlParserCtxt* htmlCreatePushParserCtxt(xmlSAXHandler* sax,
+ void* user_data,
+ char* chunk, int size,
+ char* filename, int enc) nogil
+ cdef void htmlFreeParserCtxt(xmlParserCtxt* ctxt) nogil
+ cdef void htmlCtxtReset(xmlParserCtxt* ctxt) nogil
+ cdef int htmlCtxtUseOptions(xmlParserCtxt* ctxt, int options) nogil
+ cdef int htmlParseDocument(xmlParserCtxt* ctxt) nogil
+ cdef int htmlParseChunk(xmlParserCtxt* ctxt,
+ char* chunk, int size, int terminate) nogil
+
+ cdef xmlDoc* htmlCtxtReadFile(xmlParserCtxt* ctxt,
+ char* filename, const_char* encoding,
+ int options) nogil
+ cdef xmlDoc* htmlCtxtReadDoc(xmlParserCtxt* ctxt,
+ char* buffer, char* URL, const_char* encoding,
+ int options) nogil
+ cdef xmlDoc* htmlCtxtReadIO(xmlParserCtxt* ctxt,
+ xmlInputReadCallback ioread,
+ xmlInputCloseCallback ioclose,
+ void* ioctx,
+ char* URL, const_char* encoding,
+ int options) nogil
+ cdef xmlDoc* htmlCtxtReadMemory(xmlParserCtxt* ctxt,
+ char* buffer, int size,
+ char* filename, const_char* encoding,
+ int options) nogil
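
Several of these libxml2 parser options surface as keyword arguments of
``lxml.etree.HTMLParser``.  A small sketch (the option values are arbitrary
choices, not recommendations)::

    from lxml import etree

    parser = etree.HTMLParser(
        recover=True,            # relaxed parsing (HTML_PARSE_RECOVER)
        no_network=True,         # HTML_PARSE_NONET: forbid network access
        remove_blank_text=True,  # HTML_PARSE_NOBLANKS: drop blank text nodes
        default_doctype=False,   # HTML_PARSE_NODEFDTD: no default doctype
    )
    root = etree.fromstring('<p>broken<p>html', parser)
    print(etree.tostring(root))
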
diff --git a/src/lxml/includes/relaxng.pxd b/src/lxml/includes/relaxng.pxd
new file mode 100644
index 0000000..28e9212
--- /dev/null
+++ b/src/lxml/includes/relaxng.pxd
@@ -0,0 +1,64 @@
+from lxml.includes.tree cimport xmlDoc
+from lxml.includes.xmlerror cimport xmlStructuredErrorFunc
+
+cdef extern from "libxml/relaxng.h":
+ ctypedef struct xmlRelaxNG
+ ctypedef struct xmlRelaxNGParserCtxt
+
+ ctypedef struct xmlRelaxNGValidCtxt
+
+ ctypedef enum xmlRelaxNGValidErr:
+ XML_RELAXNG_OK = 0
+ XML_RELAXNG_ERR_MEMORY = 1
+ XML_RELAXNG_ERR_TYPE = 2
+ XML_RELAXNG_ERR_TYPEVAL = 3
+ XML_RELAXNG_ERR_DUPID = 4
+ XML_RELAXNG_ERR_TYPECMP = 5
+ XML_RELAXNG_ERR_NOSTATE = 6
+ XML_RELAXNG_ERR_NODEFINE = 7
+ XML_RELAXNG_ERR_LISTEXTRA = 8
+ XML_RELAXNG_ERR_LISTEMPTY = 9
+ XML_RELAXNG_ERR_INTERNODATA = 10
+ XML_RELAXNG_ERR_INTERSEQ = 11
+ XML_RELAXNG_ERR_INTEREXTRA = 12
+ XML_RELAXNG_ERR_ELEMNAME = 13
+ XML_RELAXNG_ERR_ATTRNAME = 14
+ XML_RELAXNG_ERR_ELEMNONS = 15
+ XML_RELAXNG_ERR_ATTRNONS = 16
+ XML_RELAXNG_ERR_ELEMWRONGNS = 17
+ XML_RELAXNG_ERR_ATTRWRONGNS = 18
+ XML_RELAXNG_ERR_ELEMEXTRANS = 19
+ XML_RELAXNG_ERR_ATTREXTRANS = 20
+ XML_RELAXNG_ERR_ELEMNOTEMPTY = 21
+ XML_RELAXNG_ERR_NOELEM = 22
+ XML_RELAXNG_ERR_NOTELEM = 23
+ XML_RELAXNG_ERR_ATTRVALID = 24
+ XML_RELAXNG_ERR_CONTENTVALID = 25
+ XML_RELAXNG_ERR_EXTRACONTENT = 26
+ XML_RELAXNG_ERR_INVALIDATTR = 27
+ XML_RELAXNG_ERR_DATAELEM = 28
+ XML_RELAXNG_ERR_VALELEM = 29
+ XML_RELAXNG_ERR_LISTELEM = 30
+ XML_RELAXNG_ERR_DATATYPE = 31
+ XML_RELAXNG_ERR_VALUE = 32
+ XML_RELAXNG_ERR_LIST = 33
+ XML_RELAXNG_ERR_NOGRAMMAR = 34
+ XML_RELAXNG_ERR_EXTRADATA = 35
+ XML_RELAXNG_ERR_LACKDATA = 36
+ XML_RELAXNG_ERR_INTERNAL = 37
+ XML_RELAXNG_ERR_ELEMWRONG = 38
+ XML_RELAXNG_ERR_TEXTWRONG = 39
+
+ cdef xmlRelaxNGValidCtxt* xmlRelaxNGNewValidCtxt(xmlRelaxNG* schema) nogil
+ cdef int xmlRelaxNGValidateDoc(xmlRelaxNGValidCtxt* ctxt, xmlDoc* doc) nogil
+ cdef xmlRelaxNG* xmlRelaxNGParse(xmlRelaxNGParserCtxt* ctxt) nogil
+ cdef xmlRelaxNGParserCtxt* xmlRelaxNGNewParserCtxt(char* URL) nogil
+ cdef xmlRelaxNGParserCtxt* xmlRelaxNGNewDocParserCtxt(xmlDoc* doc) nogil
+ cdef void xmlRelaxNGFree(xmlRelaxNG* schema) nogil
+ cdef void xmlRelaxNGFreeParserCtxt(xmlRelaxNGParserCtxt* ctxt) nogil
+ cdef void xmlRelaxNGFreeValidCtxt(xmlRelaxNGValidCtxt* ctxt) nogil
+
+ cdef void xmlRelaxNGSetValidStructuredErrors(
+ xmlRelaxNGValidCtxt* ctxt, xmlStructuredErrorFunc serror, void *ctx) nogil
+ cdef void xmlRelaxNGSetParserStructuredErrors(
+ xmlRelaxNGParserCtxt* ctxt, xmlStructuredErrorFunc serror, void *ctx) nogil
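
These are the libxml2 hooks behind ``lxml.etree.RelaxNG``.  A minimal
validation sketch using the public API (schema and documents invented for
illustration)::

    from lxml import etree

    relaxng = etree.RelaxNG(etree.XML(
        b'<element name="root" xmlns="http://relaxng.org/ns/structure/1.0">'
        b'<zeroOrMore><element name="item"><text/></element></zeroOrMore>'
        b'</element>'))

    print(relaxng.validate(etree.XML(b'<root><item>x</item></root>')))  # True
    print(relaxng.validate(etree.XML(b'<wrong/>')))                     # False
    print(relaxng.error_log)  # errors collected during the last validation
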
diff --git a/src/lxml/includes/schematron.pxd b/src/lxml/includes/schematron.pxd
new file mode 100644
index 0000000..f8e3252
--- /dev/null
+++ b/src/lxml/includes/schematron.pxd
@@ -0,0 +1,34 @@
+from lxml.includes cimport xmlerror
+from lxml.includes.tree cimport xmlDoc
+
+cdef extern from "libxml/schematron.h":
+ ctypedef struct xmlSchematron
+ ctypedef struct xmlSchematronParserCtxt
+ ctypedef struct xmlSchematronValidCtxt
+
+ ctypedef enum xmlSchematronValidOptions:
+ XML_SCHEMATRON_OUT_QUIET = 1 # quiet no report
+ XML_SCHEMATRON_OUT_TEXT = 2 # build a textual report
+ XML_SCHEMATRON_OUT_XML = 4 # output SVRL
+ XML_SCHEMATRON_OUT_ERROR = 8 # output via xmlStructuredErrorFunc
+ XML_SCHEMATRON_OUT_FILE = 256 # output to a file descriptor
+ XML_SCHEMATRON_OUT_BUFFER = 512 # output to a buffer
+ XML_SCHEMATRON_OUT_IO = 1024 # output to I/O mechanism
+
+ cdef xmlSchematronParserCtxt* xmlSchematronNewDocParserCtxt(
+ xmlDoc* doc) nogil
+ cdef xmlSchematronParserCtxt* xmlSchematronNewParserCtxt(
+ char* filename) nogil
+ cdef xmlSchematronValidCtxt* xmlSchematronNewValidCtxt(
+ xmlSchematron* schema, int options) nogil
+
+ cdef xmlSchematron* xmlSchematronParse(xmlSchematronParserCtxt* ctxt) nogil
+ cdef int xmlSchematronValidateDoc(xmlSchematronValidCtxt* ctxt,
+ xmlDoc* instance) nogil
+
+ cdef void xmlSchematronFreeParserCtxt(xmlSchematronParserCtxt* ctxt) nogil
+ cdef void xmlSchematronFreeValidCtxt(xmlSchematronValidCtxt* ctxt) nogil
+ cdef void xmlSchematronFree(xmlSchematron* schema) nogil
+ cdef void xmlSchematronSetValidStructuredErrors(
+ xmlSchematronValidCtxt* ctxt,
+ xmlerror.xmlStructuredErrorFunc error_func, void *data)
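
These declarations back ``lxml.etree.Schematron``, which is only available
when libxml2 was built with Schematron support.  A minimal, purely
illustrative sketch (the schema and documents are invented)::

    from lxml import etree

    schematron = etree.Schematron(etree.XML(b'''
    <schema xmlns="http://purl.oclc.org/dsdl/schematron">
      <pattern id="price-check">
        <rule context="price">
          <assert test="number(.) &gt; 0">price must be positive</assert>
        </rule>
      </pattern>
    </schema>'''))

    print(schematron.validate(etree.XML(b'<order><price>5</price></order>')))   # True
    print(schematron.validate(etree.XML(b'<order><price>-1</price></order>')))  # False
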
diff --git a/src/lxml/includes/tree.pxd b/src/lxml/includes/tree.pxd
new file mode 100644
index 0000000..010af80
--- /dev/null
+++ b/src/lxml/includes/tree.pxd
@@ -0,0 +1,480 @@
+from libc cimport stdio
+from libc.string cimport const_char, const_uchar
+
+cdef extern from "lxml-version.h":
+ # deprecated declaration, use etreepublic.pxd instead
+ cdef char* LXML_VERSION_STRING
+
+cdef extern from "libxml/xmlversion.h":
+ cdef const_char* xmlParserVersion
+ cdef int LIBXML_VERSION
+
+cdef extern from "libxml/xmlstring.h":
+ ctypedef unsigned char xmlChar
+ ctypedef const xmlChar const_xmlChar "const xmlChar"
+ cdef int xmlStrlen(const_xmlChar* str) nogil
+ cdef xmlChar* xmlStrdup(const_xmlChar* cur) nogil
+ cdef int xmlStrncmp(const_xmlChar* str1, const_xmlChar* str2, int length) nogil
+ cdef int xmlStrcmp(const_xmlChar* str1, const_xmlChar* str2) nogil
+ cdef int xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) nogil
+ cdef const_xmlChar* xmlStrstr(const_xmlChar* str1, const_xmlChar* str2) nogil
+ cdef const_xmlChar* xmlStrchr(const_xmlChar* str1, xmlChar ch) nogil
+ cdef const_xmlChar* _xcstr "(const xmlChar*)PyBytes_AS_STRING" (object s)
+
+cdef extern from "libxml/encoding.h":
+ ctypedef enum xmlCharEncoding:
+ XML_CHAR_ENCODING_ERROR = -1 # No char encoding detected
+ XML_CHAR_ENCODING_NONE = 0 # No char encoding detected
+ XML_CHAR_ENCODING_UTF8 = 1 # UTF-8
+ XML_CHAR_ENCODING_UTF16LE = 2 # UTF-16 little endian
+ XML_CHAR_ENCODING_UTF16BE = 3 # UTF-16 big endian
+ XML_CHAR_ENCODING_UCS4LE = 4 # UCS-4 little endian
+ XML_CHAR_ENCODING_UCS4BE = 5 # UCS-4 big endian
+ XML_CHAR_ENCODING_EBCDIC = 6 # EBCDIC uh!
+ XML_CHAR_ENCODING_UCS4_2143 = 7 # UCS-4 unusual ordering
+ XML_CHAR_ENCODING_UCS4_3412 = 8 # UCS-4 unusual ordering
+ XML_CHAR_ENCODING_UCS2 = 9 # UCS-2
+ XML_CHAR_ENCODING_8859_1 = 10 # ISO-8859-1 ISO Latin 1
+ XML_CHAR_ENCODING_8859_2 = 11 # ISO-8859-2 ISO Latin 2
+ XML_CHAR_ENCODING_8859_3 = 12 # ISO-8859-3
+ XML_CHAR_ENCODING_8859_4 = 13 # ISO-8859-4
+ XML_CHAR_ENCODING_8859_5 = 14 # ISO-8859-5
+ XML_CHAR_ENCODING_8859_6 = 15 # ISO-8859-6
+ XML_CHAR_ENCODING_8859_7 = 16 # ISO-8859-7
+ XML_CHAR_ENCODING_8859_8 = 17 # ISO-8859-8
+ XML_CHAR_ENCODING_8859_9 = 18 # ISO-8859-9
+ XML_CHAR_ENCODING_2022_JP = 19 # ISO-2022-JP
+ XML_CHAR_ENCODING_SHIFT_JIS = 20 # Shift_JIS
+ XML_CHAR_ENCODING_EUC_JP = 21 # EUC-JP
+ XML_CHAR_ENCODING_ASCII = 22 # pure ASCII
+
+ ctypedef struct xmlCharEncodingHandler
+ cdef xmlCharEncodingHandler* xmlFindCharEncodingHandler(char* name) nogil
+ cdef xmlCharEncodingHandler* xmlGetCharEncodingHandler(
+ xmlCharEncoding enc) nogil
+ cdef int xmlCharEncCloseFunc(xmlCharEncodingHandler* handler) nogil
+ cdef xmlCharEncoding xmlDetectCharEncoding(const_xmlChar* text, int len) nogil
+ cdef const_char* xmlGetCharEncodingName(xmlCharEncoding enc) nogil
+ cdef xmlCharEncoding xmlParseCharEncoding(char* name) nogil
+ ctypedef int (*xmlCharEncodingOutputFunc)(
+ unsigned char *out_buf, int *outlen, const_uchar *in_buf, int *inlen)
+
+cdef extern from "libxml/chvalid.h":
+ cdef int xmlIsChar_ch(char c) nogil
+ cdef int xmlIsCharQ(int ch) nogil
+
+cdef extern from "libxml/hash.h":
+ ctypedef struct xmlHashTable
+ ctypedef void (*xmlHashScanner)(void* payload, void* data, const_xmlChar* name) # may require GIL!
+ void xmlHashScan(xmlHashTable* table, xmlHashScanner f, void* data) nogil
+ void* xmlHashLookup(xmlHashTable* table, const_xmlChar* name) nogil
+ ctypedef void (*xmlHashDeallocator)(void *payload, xmlChar *name)
+ cdef xmlHashTable* xmlHashCreate(int size)
+ cdef xmlHashTable* xmlHashCreateDict(int size, xmlDict *dict)
+ cdef int xmlHashSize(xmlHashTable* table)
+ cdef void xmlHashFree(xmlHashTable* table, xmlHashDeallocator f)
+
+cdef extern from *: # actually "libxml/dict.h"
+    # libxml/dict.h appears to be broken when included from C code
+ ctypedef struct xmlDict
+ cdef const_xmlChar* xmlDictLookup(xmlDict* dict, const_xmlChar* name, int len) nogil
+ cdef const_xmlChar* xmlDictExists(xmlDict* dict, const_xmlChar* name, int len) nogil
+ cdef int xmlDictOwns(xmlDict* dict, const_xmlChar* name) nogil
+ cdef size_t xmlDictSize(xmlDict* dict) nogil
+
+cdef extern from "libxml/tree.h":
+ ctypedef struct xmlDoc
+ ctypedef struct xmlAttr
+ ctypedef struct xmlNotationTable
+
+ ctypedef enum xmlElementType:
+ XML_ELEMENT_NODE= 1
+ XML_ATTRIBUTE_NODE= 2
+ XML_TEXT_NODE= 3
+ XML_CDATA_SECTION_NODE= 4
+ XML_ENTITY_REF_NODE= 5
+ XML_ENTITY_NODE= 6
+ XML_PI_NODE= 7
+ XML_COMMENT_NODE= 8
+ XML_DOCUMENT_NODE= 9
+ XML_DOCUMENT_TYPE_NODE= 10
+ XML_DOCUMENT_FRAG_NODE= 11
+ XML_NOTATION_NODE= 12
+ XML_HTML_DOCUMENT_NODE= 13
+ XML_DTD_NODE= 14
+ XML_ELEMENT_DECL= 15
+ XML_ATTRIBUTE_DECL= 16
+ XML_ENTITY_DECL= 17
+ XML_NAMESPACE_DECL= 18
+ XML_XINCLUDE_START= 19
+ XML_XINCLUDE_END= 20
+
+ ctypedef enum xmlElementTypeVal:
+ XML_ELEMENT_TYPE_UNDEFINED= 0
+ XML_ELEMENT_TYPE_EMPTY= 1
+ XML_ELEMENT_TYPE_ANY= 2
+ XML_ELEMENT_TYPE_MIXED= 3
+ XML_ELEMENT_TYPE_ELEMENT= 4
+
+ ctypedef enum xmlElementContentType:
+ XML_ELEMENT_CONTENT_PCDATA= 1
+ XML_ELEMENT_CONTENT_ELEMENT= 2
+ XML_ELEMENT_CONTENT_SEQ= 3
+ XML_ELEMENT_CONTENT_OR= 4
+
+ ctypedef enum xmlElementContentOccur:
+ XML_ELEMENT_CONTENT_ONCE= 1
+ XML_ELEMENT_CONTENT_OPT= 2
+ XML_ELEMENT_CONTENT_MULT= 3
+ XML_ELEMENT_CONTENT_PLUS= 4
+
+ ctypedef enum xmlAttributeType:
+ XML_ATTRIBUTE_CDATA = 1
+ XML_ATTRIBUTE_ID= 2
+ XML_ATTRIBUTE_IDREF= 3
+ XML_ATTRIBUTE_IDREFS= 4
+ XML_ATTRIBUTE_ENTITY= 5
+ XML_ATTRIBUTE_ENTITIES= 6
+ XML_ATTRIBUTE_NMTOKEN= 7
+ XML_ATTRIBUTE_NMTOKENS= 8
+ XML_ATTRIBUTE_ENUMERATION= 9
+ XML_ATTRIBUTE_NOTATION= 10
+
+ ctypedef enum xmlAttributeDefault:
+ XML_ATTRIBUTE_NONE= 1
+ XML_ATTRIBUTE_REQUIRED= 2
+ XML_ATTRIBUTE_IMPLIED= 3
+ XML_ATTRIBUTE_FIXED= 4
+
+ ctypedef enum xmlEntityType:
+ XML_INTERNAL_GENERAL_ENTITY= 1
+ XML_EXTERNAL_GENERAL_PARSED_ENTITY= 2
+ XML_EXTERNAL_GENERAL_UNPARSED_ENTITY= 3
+ XML_INTERNAL_PARAMETER_ENTITY= 4
+ XML_EXTERNAL_PARAMETER_ENTITY= 5
+ XML_INTERNAL_PREDEFINED_ENTITY= 6
+
+ ctypedef struct xmlNs:
+ const_xmlChar* href
+ const_xmlChar* prefix
+ xmlNs* next
+
+ ctypedef struct xmlNode:
+ void* _private
+ xmlElementType type
+ const_xmlChar* name
+ xmlNode* children
+ xmlNode* last
+ xmlNode* parent
+ xmlNode* next
+ xmlNode* prev
+ xmlDoc* doc
+ xmlChar* content
+ xmlAttr* properties
+ xmlNs* ns
+ xmlNs* nsDef
+ unsigned short line
+
+ ctypedef struct xmlElementContent:
+ xmlElementContentType type
+ xmlElementContentOccur ocur
+ const_xmlChar *name
+ xmlElementContent *c1
+ xmlElementContent *c2
+ xmlElementContent *parent
+ const_xmlChar *prefix
+
+ ctypedef struct xmlEnumeration:
+ xmlEnumeration *next
+ const_xmlChar *name
+
+ ctypedef struct xmlAttribute:
+ void* _private
+ xmlElementType type
+ const_xmlChar* name
+ xmlNode* children
+ xmlNode* last
+ xmlDtd* parent
+ xmlNode* next
+ xmlNode* prev
+ xmlDoc* doc
+ xmlAttribute* nexth
+ xmlAttributeType atype
+ xmlAttributeDefault def_ "def"
+ const_xmlChar* defaultValue
+ xmlEnumeration* tree
+ const_xmlChar* prefix
+ const_xmlChar* elem
+
+ ctypedef struct xmlElement:
+ void* _private
+ xmlElementType type
+ const_xmlChar* name
+ xmlNode* children
+ xmlNode* last
+ xmlNode* parent
+ xmlNode* next
+ xmlNode* prev
+ xmlDoc* doc
+ xmlElementTypeVal etype
+ xmlElementContent* content
+ xmlAttribute* attributes
+ const_xmlChar* prefix
+ void *contModel
+
+ ctypedef struct xmlEntity:
+ void* _private
+ xmlElementType type
+ const_xmlChar* name
+ xmlNode* children
+ xmlNode* last
+ xmlDtd* parent
+ xmlNode* next
+ xmlNode* prev
+ xmlDoc* doc
+ xmlChar* orig
+ xmlChar* content
+ int length
+ xmlEntityType etype
+ const_xmlChar* ExternalID
+ const_xmlChar* SystemID
+ xmlEntity* nexte
+ const_xmlChar* URI
+ int owner
+ int checked
+
+ ctypedef struct xmlDtd:
+ const_xmlChar* name
+ const_xmlChar* ExternalID
+ const_xmlChar* SystemID
+ void* notations
+ void* entities
+ void* pentities
+ void* attributes
+ void* elements
+ xmlNode* children
+ xmlNode* last
+ xmlDoc* doc
+
+ ctypedef struct xmlDoc:
+ xmlElementType type
+ char* name
+ xmlNode* children
+ xmlNode* last
+ xmlNode* parent
+ xmlNode* next
+ xmlNode* prev
+ xmlDoc* doc
+ xmlDict* dict
+ xmlHashTable* ids
+ int standalone
+ const_xmlChar* version
+ const_xmlChar* encoding
+ const_xmlChar* URL
+ void* _private
+ xmlDtd* intSubset
+ xmlDtd* extSubset
+
+ ctypedef struct xmlAttr:
+ void* _private
+ xmlElementType type
+ const_xmlChar* name
+ xmlNode* children
+ xmlNode* last
+ xmlNode* parent
+ xmlAttr* next
+ xmlAttr* prev
+ xmlDoc* doc
+ xmlNs* ns
+ xmlAttributeType atype
+
+ ctypedef struct xmlID:
+ const_xmlChar* value
+ const_xmlChar* name
+ xmlAttr* attr
+ xmlDoc* doc
+
+ ctypedef struct xmlBuffer
+
+ ctypedef struct xmlBuf # new in libxml2 2.9
+
+ ctypedef struct xmlOutputBuffer:
+ xmlBuf* buffer
+ xmlBuf* conv
+ int error
+
+ const_xmlChar* XML_XML_NAMESPACE
+
+ cdef void xmlFreeDoc(xmlDoc* cur) nogil
+ cdef void xmlFreeDtd(xmlDtd* cur) nogil
+ cdef void xmlFreeNode(xmlNode* cur) nogil
+ cdef void xmlFreeNsList(xmlNs* ns) nogil
+ cdef void xmlFreeNs(xmlNs* ns) nogil
+ cdef void xmlFree(void* buf) nogil
+
+ cdef xmlNode* xmlNewNode(xmlNs* ns, const_xmlChar* name) nogil
+ cdef xmlNode* xmlNewDocText(xmlDoc* doc, const_xmlChar* content) nogil
+ cdef xmlNode* xmlNewDocComment(xmlDoc* doc, const_xmlChar* content) nogil
+ cdef xmlNode* xmlNewDocPI(xmlDoc* doc, const_xmlChar* name, const_xmlChar* content) nogil
+ cdef xmlNode* xmlNewReference(xmlDoc* doc, const_xmlChar* name) nogil
+ cdef xmlNode* xmlNewCDataBlock(xmlDoc* doc, const_xmlChar* text, int len) nogil
+ cdef xmlNs* xmlNewNs(xmlNode* node, const_xmlChar* href, const_xmlChar* prefix) nogil
+ cdef xmlNode* xmlAddChild(xmlNode* parent, xmlNode* cur) nogil
+ cdef xmlNode* xmlReplaceNode(xmlNode* old, xmlNode* cur) nogil
+ cdef xmlNode* xmlAddPrevSibling(xmlNode* cur, xmlNode* elem) nogil
+ cdef xmlNode* xmlAddNextSibling(xmlNode* cur, xmlNode* elem) nogil
+ cdef xmlNode* xmlNewDocNode(xmlDoc* doc, xmlNs* ns,
+ const_xmlChar* name, const_xmlChar* content) nogil
+ cdef xmlDoc* xmlNewDoc(const_xmlChar* version) nogil
+ cdef xmlAttr* xmlNewProp(xmlNode* node, const_xmlChar* name, const_xmlChar* value) nogil
+ cdef xmlAttr* xmlNewNsProp(xmlNode* node, xmlNs* ns,
+ const_xmlChar* name, const_xmlChar* value) nogil
+ cdef xmlChar* xmlGetNoNsProp(xmlNode* node, const_xmlChar* name) nogil
+ cdef xmlChar* xmlGetNsProp(xmlNode* node, const_xmlChar* name, const_xmlChar* nameSpace) nogil
+ cdef void xmlSetNs(xmlNode* node, xmlNs* ns) nogil
+ cdef xmlAttr* xmlSetProp(xmlNode* node, const_xmlChar* name, const_xmlChar* value) nogil
+ cdef xmlAttr* xmlSetNsProp(xmlNode* node, xmlNs* ns,
+ const_xmlChar* name, const_xmlChar* value) nogil
+ cdef int xmlRemoveID(xmlDoc* doc, xmlAttr* cur) nogil
+ cdef int xmlRemoveProp(xmlAttr* cur) nogil
+ cdef void xmlFreePropList(xmlAttr* cur) nogil
+ cdef xmlChar* xmlGetNodePath(xmlNode* node) nogil
+ cdef void xmlDocDumpMemory(xmlDoc* cur, char** mem, int* size) nogil
+ cdef void xmlDocDumpMemoryEnc(xmlDoc* cur, char** mem, int* size,
+ char* encoding) nogil
+ cdef int xmlSaveFileTo(xmlOutputBuffer* out, xmlDoc* cur,
+ char* encoding) nogil
+
+ cdef void xmlUnlinkNode(xmlNode* cur) nogil
+ cdef xmlNode* xmlDocSetRootElement(xmlDoc* doc, xmlNode* root) nogil
+ cdef xmlNode* xmlDocGetRootElement(xmlDoc* doc) nogil
+ cdef void xmlSetTreeDoc(xmlNode* tree, xmlDoc* doc) nogil
+ cdef xmlAttr* xmlHasProp(xmlNode* node, const_xmlChar* name) nogil
+ cdef xmlAttr* xmlHasNsProp(xmlNode* node, const_xmlChar* name, const_xmlChar* nameSpace) nogil
+ cdef xmlChar* xmlNodeGetContent(xmlNode* cur) nogil
+ cdef int xmlNodeBufGetContent(xmlBuffer* buffer, xmlNode* cur) nogil
+ cdef xmlNs* xmlSearchNs(xmlDoc* doc, xmlNode* node, const_xmlChar* prefix) nogil
+ cdef xmlNs* xmlSearchNsByHref(xmlDoc* doc, xmlNode* node, const_xmlChar* href) nogil
+ cdef int xmlIsBlankNode(xmlNode* node) nogil
+ cdef long xmlGetLineNo(xmlNode* node) nogil
+ cdef void xmlElemDump(stdio.FILE* f, xmlDoc* doc, xmlNode* cur) nogil
+ cdef void xmlNodeDumpOutput(xmlOutputBuffer* buf,
+ xmlDoc* doc, xmlNode* cur, int level,
+ int format, const_char* encoding) nogil
+ cdef void xmlBufAttrSerializeTxtContent(xmlOutputBuffer *buf, xmlDoc *doc,
+ xmlAttr *attr, const_xmlChar *string) nogil
+ cdef void xmlNodeSetName(xmlNode* cur, const_xmlChar* name) nogil
+ cdef void xmlNodeSetContent(xmlNode* cur, const_xmlChar* content) nogil
+ cdef xmlDtd* xmlCopyDtd(xmlDtd* dtd) nogil
+ cdef xmlDoc* xmlCopyDoc(xmlDoc* doc, int recursive) nogil
+ cdef xmlNode* xmlCopyNode(xmlNode* node, int extended) nogil
+ cdef xmlNode* xmlDocCopyNode(xmlNode* node, xmlDoc* doc, int extended) nogil
+ cdef int xmlReconciliateNs(xmlDoc* doc, xmlNode* tree) nogil
+ cdef xmlNs* xmlNewReconciliedNs(xmlDoc* doc, xmlNode* tree, xmlNs* ns) nogil
+ cdef xmlBuffer* xmlBufferCreate() nogil
+ cdef void xmlBufferWriteChar(xmlBuffer* buf, char* string) nogil
+ cdef void xmlBufferFree(xmlBuffer* buf) nogil
+ cdef const_xmlChar* xmlBufferContent(xmlBuffer* buf) nogil
+ cdef int xmlBufferLength(xmlBuffer* buf) nogil
+ cdef const_xmlChar* xmlBufContent(xmlBuf* buf) nogil # new in libxml2 2.9
+ cdef size_t xmlBufUse(xmlBuf* buf) nogil # new in libxml2 2.9
+ cdef int xmlKeepBlanksDefault(int val) nogil
+ cdef xmlChar* xmlNodeGetBase(xmlDoc* doc, xmlNode* node) nogil
+ cdef xmlDtd* xmlCreateIntSubset(xmlDoc* doc, const_xmlChar* name,
+ const_xmlChar* ExternalID, const_xmlChar* SystemID) nogil
+ cdef void xmlNodeSetBase(xmlNode* node, const_xmlChar* uri) nogil
+ cdef int xmlValidateNCName(const_xmlChar* value, int space) nogil
+
+cdef extern from "libxml/uri.h":
+ cdef const_xmlChar* xmlBuildURI(const_xmlChar* href, const_xmlChar* base) nogil
+
+cdef extern from "libxml/HTMLtree.h":
+ cdef void htmlNodeDumpFormatOutput(xmlOutputBuffer* buf,
+ xmlDoc* doc, xmlNode* cur,
+ char* encoding, int format) nogil
+ cdef xmlDoc* htmlNewDoc(const_xmlChar* uri, const_xmlChar* externalID) nogil
+
+cdef extern from "libxml/valid.h":
+ cdef xmlAttr* xmlGetID(xmlDoc* doc, const_xmlChar* ID) nogil
+ cdef void xmlDumpNotationTable(xmlBuffer* buffer,
+ xmlNotationTable* table) nogil
+ cdef int xmlValidateNameValue(const_xmlChar* value) nogil
+
+cdef extern from "libxml/xmlIO.h":
+ cdef int xmlOutputBufferWrite(xmlOutputBuffer* out,
+ int len, const_char* str) nogil
+ cdef int xmlOutputBufferWriteString(xmlOutputBuffer* out, const_char* str) nogil
+ cdef int xmlOutputBufferWriteEscape(xmlOutputBuffer* out,
+ const_xmlChar* str,
+ xmlCharEncodingOutputFunc escapefunc) nogil
+ cdef int xmlOutputBufferFlush(xmlOutputBuffer* out) nogil
+ cdef int xmlOutputBufferClose(xmlOutputBuffer* out) nogil
+
+ ctypedef int (*xmlInputReadCallback)(void* context,
+ char* buffer, int len)
+ ctypedef int (*xmlInputCloseCallback)(void* context)
+
+ ctypedef int (*xmlOutputWriteCallback)(void* context,
+ char* buffer, int len)
+ ctypedef int (*xmlOutputCloseCallback)(void* context)
+
+ cdef xmlOutputBuffer* xmlAllocOutputBuffer(
+ xmlCharEncodingHandler* encoder) nogil
+ cdef xmlOutputBuffer* xmlOutputBufferCreateIO(
+ xmlOutputWriteCallback iowrite,
+ xmlOutputCloseCallback ioclose,
+ void * ioctx,
+ xmlCharEncodingHandler* encoder) nogil
+ cdef xmlOutputBuffer* xmlOutputBufferCreateFile(
+ stdio.FILE* file, xmlCharEncodingHandler* encoder) nogil
+ cdef xmlOutputBuffer* xmlOutputBufferCreateFilename(
+ char* URI, xmlCharEncodingHandler* encoder, int compression) nogil
+
+cdef extern from "libxml/xmlsave.h":
+ ctypedef struct xmlSaveCtxt
+
+ ctypedef enum xmlSaveOption:
+ XML_SAVE_FORMAT = 1 # format save output (2.6.17)
+ XML_SAVE_NO_DECL = 2 # drop the xml declaration (2.6.21)
+ XML_SAVE_NO_EMPTY = 4 # no empty tags (2.6.22)
+ XML_SAVE_NO_XHTML = 8 # disable XHTML1 specific rules (2.6.22)
+ XML_SAVE_XHTML = 16 # force XHTML1 specific rules (2.7.2)
+ XML_SAVE_AS_XML = 32 # force XML serialization on HTML doc (2.7.2)
+ XML_SAVE_AS_HTML = 64 # force HTML serialization on XML doc (2.7.2)
+
+ cdef xmlSaveCtxt* xmlSaveToFilename(char* filename, char* encoding,
+ int options) nogil
+ cdef xmlSaveCtxt* xmlSaveToBuffer(xmlBuffer* buffer, char* encoding,
+ int options) nogil # libxml2 2.6.23
+ cdef long xmlSaveDoc(xmlSaveCtxt* ctxt, xmlDoc* doc) nogil
+ cdef long xmlSaveTree(xmlSaveCtxt* ctxt, xmlNode* node) nogil
+ cdef int xmlSaveClose(xmlSaveCtxt* ctxt) nogil
+ cdef int xmlSaveFlush(xmlSaveCtxt* ctxt) nogil
+ cdef int xmlSaveSetAttrEscape(xmlSaveCtxt* ctxt, void* escape_func) nogil
+ cdef int xmlSaveSetEscape(xmlSaveCtxt* ctxt, void* escape_func) nogil
+
+cdef extern from "libxml/globals.h":
+ cdef int xmlThrDefKeepBlanksDefaultValue(int onoff) nogil
+ cdef int xmlThrDefLineNumbersDefaultValue(int onoff) nogil
+ cdef int xmlThrDefIndentTreeOutput(int onoff) nogil
+
+cdef extern from "libxml/xmlmemory.h" nogil:
+ cdef void* xmlMalloc(size_t size)
+ cdef int xmlMemBlocks()
+ cdef int xmlMemUsed()
+ cdef void xmlMemDisplay(stdio.FILE* file)
+ cdef void xmlMemDisplayLast(stdio.FILE* file, long num_bytes)
+ cdef void xmlMemShow(stdio.FILE* file, int count)
+
+cdef extern from "etree_defs.h":
+ cdef bint _isElement(xmlNode* node) nogil
+ cdef bint _isElementOrXInclude(xmlNode* node) nogil
+ cdef const_xmlChar* _getNs(xmlNode* node) nogil
+ cdef void BEGIN_FOR_EACH_ELEMENT_FROM(xmlNode* tree_top,
+ xmlNode* start_node,
+ bint inclusive) nogil
+ cdef void END_FOR_EACH_ELEMENT_FROM(xmlNode* start_node) nogil
+ cdef void BEGIN_FOR_EACH_FROM(xmlNode* tree_top,
+ xmlNode* start_node,
+ bint inclusive) nogil
+ cdef void END_FOR_EACH_FROM(xmlNode* start_node) nogil
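Illustrative sketch only, not part of this commit: the tree declarations above can be driven from Cython roughly as follows, assuming the file is cimported as lxml.includes.tree and that byte strings are used wherever libxml2 expects xmlChar pointers (the _xc helper below is hypothetical).

    # sketch.pyx -- minimal use of the tree declarations above
    from lxml.includes cimport tree

    cdef tree.const_xmlChar* _xc(bytes s):
        # hypothetical helper: libxml2 wants const xmlChar* (unsigned char*), go via char*
        return <tree.const_xmlChar*><char*>s

    def build_and_dump():
        # build <root id="r1"/> and serialise it to a Python bytes object
        cdef tree.xmlDoc* doc = tree.xmlNewDoc(_xc(b"1.0"))
        cdef tree.xmlNode* root = tree.xmlNewDocNode(doc, NULL, _xc(b"root"), NULL)
        tree.xmlDocSetRootElement(doc, root)
        tree.xmlSetProp(root, _xc(b"id"), _xc(b"r1"))
        cdef char* mem = NULL
        cdef int size = 0
        tree.xmlDocDumpMemory(doc, &mem, &size)
        try:
            return mem[:size]          # copies the C buffer into Python bytes
        finally:
            tree.xmlFree(mem)          # the dump buffer is owned by libxml2
            tree.xmlFreeDoc(doc)       # also frees root and its properties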
diff --git a/src/lxml/includes/uri.pxd b/src/lxml/includes/uri.pxd
new file mode 100644
index 0000000..2b6bb79
--- /dev/null
+++ b/src/lxml/includes/uri.pxd
@@ -0,0 +1,5 @@
+cdef extern from "libxml/uri.h":
+ ctypedef struct xmlURI
+
+ cdef xmlURI* xmlParseURI(char* str)
+ cdef void xmlFreeURI(xmlURI* uri)
diff --git a/src/lxml/includes/xinclude.pxd b/src/lxml/includes/xinclude.pxd
new file mode 100644
index 0000000..4232d3e
--- /dev/null
+++ b/src/lxml/includes/xinclude.pxd
@@ -0,0 +1,22 @@
+from lxml.includes.tree cimport xmlDoc, xmlNode
+
+cdef extern from "libxml/xinclude.h":
+
+ ctypedef struct xmlXIncludeCtxt
+
+ cdef int xmlXIncludeProcess(xmlDoc* doc) nogil
+ cdef int xmlXIncludeProcessFlags(xmlDoc* doc, int parser_opts) nogil
+ cdef int xmlXIncludeProcessTree(xmlNode* doc) nogil
+ cdef int xmlXIncludeProcessTreeFlags(xmlNode* doc, int parser_opts) nogil
+
+ # libxml2 >= 2.7.4
+ cdef int xmlXIncludeProcessTreeFlagsData(
+ xmlNode* doc, int parser_opts, void* data) nogil
+
+ cdef xmlXIncludeCtxt* xmlXIncludeNewContext(xmlDoc* doc) nogil
+ cdef int xmlXIncludeProcessNode(xmlXIncludeCtxt* ctxt, xmlNode* node) nogil
+ cdef int xmlXIncludeSetFlags(xmlXIncludeCtxt* ctxt, int flags) nogil
+
+ # libxml2 >= 2.6.27
+ cdef int xmlXIncludeProcessFlagsData(
+ xmlDoc* doc, int flags, void* data) nogil
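For illustration only (not introduced by this change): expanding XIncludes in an already parsed document with the declarations above is a single call, where parser_opts may be any combination of the xmlParserOption flags declared in xmlparser.pxd.

    from lxml.includes cimport tree, xinclude

    cdef int expand_xincludes(tree.xmlDoc* doc, int parser_opts) nogil:
        # returns the number of substitutions made, or -1 on failure
        return xinclude.xmlXIncludeProcessFlags(doc, parser_opts)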
diff --git a/src/lxml/includes/xmlerror.pxd b/src/lxml/includes/xmlerror.pxd
new file mode 100644
index 0000000..4b7551b
--- /dev/null
+++ b/src/lxml/includes/xmlerror.pxd
@@ -0,0 +1,851 @@
+
+# --- BEGIN: GENERATED CONSTANTS ---
+
+# This section is generated by the script 'update-error-constants.py'.
+
+cdef extern from "libxml/xmlerror.h":
+ ctypedef enum xmlErrorLevel:
+ XML_ERR_NONE = 0
+ XML_ERR_WARNING = 1 # A simple warning
+ XML_ERR_ERROR = 2 # A recoverable error
+ XML_ERR_FATAL = 3 # A fatal error
+
+ ctypedef enum xmlErrorDomain:
+ XML_FROM_NONE = 0
+ XML_FROM_PARSER = 1 # The XML parser
+ XML_FROM_TREE = 2 # The tree module
+ XML_FROM_NAMESPACE = 3 # The XML Namespace module
+ XML_FROM_DTD = 4 # The XML DTD validation with parser context
+ XML_FROM_HTML = 5 # The HTML parser
+ XML_FROM_MEMORY = 6 # The memory allocator
+ XML_FROM_OUTPUT = 7 # The serialization code
+ XML_FROM_IO = 8 # The Input/Output stack
+ XML_FROM_FTP = 9 # The FTP module
+ XML_FROM_HTTP = 10 # The HTTP module
+ XML_FROM_XINCLUDE = 11 # The XInclude processing
+ XML_FROM_XPATH = 12 # The XPath module
+ XML_FROM_XPOINTER = 13 # The XPointer module
+ XML_FROM_REGEXP = 14 # The regular expressions module
+ XML_FROM_DATATYPE = 15 # The W3C XML Schemas Datatype module
+ XML_FROM_SCHEMASP = 16 # The W3C XML Schemas parser module
+ XML_FROM_SCHEMASV = 17 # The W3C XML Schemas validation module
+ XML_FROM_RELAXNGP = 18 # The Relax-NG parser module
+ XML_FROM_RELAXNGV = 19 # The Relax-NG validator module
+ XML_FROM_CATALOG = 20 # The Catalog module
+ XML_FROM_C14N = 21 # The Canonicalization module
+ XML_FROM_XSLT = 22 # The XSLT engine from libxslt
+ XML_FROM_VALID = 23 # The XML DTD validation with valid context
+ XML_FROM_CHECK = 24 # The error checking module
+ XML_FROM_WRITER = 25 # The xmlwriter module
+ XML_FROM_MODULE = 26 # The dynamically loaded modules module
+ XML_FROM_I18N = 27 # The module handling character conversion
+ XML_FROM_SCHEMATRONV = 28 # The Schematron validator module
+ XML_FROM_BUFFER = 29 # The buffers module
+ XML_FROM_URI = 30 # The URI module
+
+ ctypedef enum xmlParserErrors:
+ XML_ERR_OK = 0
+ XML_ERR_INTERNAL_ERROR = 1
+ XML_ERR_NO_MEMORY = 2
+ XML_ERR_DOCUMENT_START = 3
+ XML_ERR_DOCUMENT_EMPTY = 4
+ XML_ERR_DOCUMENT_END = 5
+ XML_ERR_INVALID_HEX_CHARREF = 6
+ XML_ERR_INVALID_DEC_CHARREF = 7
+ XML_ERR_INVALID_CHARREF = 8
+ XML_ERR_INVALID_CHAR = 9
+ XML_ERR_CHARREF_AT_EOF = 10
+ XML_ERR_CHARREF_IN_PROLOG = 11
+ XML_ERR_CHARREF_IN_EPILOG = 12
+ XML_ERR_CHARREF_IN_DTD = 13
+ XML_ERR_ENTITYREF_AT_EOF = 14
+ XML_ERR_ENTITYREF_IN_PROLOG = 15
+ XML_ERR_ENTITYREF_IN_EPILOG = 16
+ XML_ERR_ENTITYREF_IN_DTD = 17
+ XML_ERR_PEREF_AT_EOF = 18
+ XML_ERR_PEREF_IN_PROLOG = 19
+ XML_ERR_PEREF_IN_EPILOG = 20
+ XML_ERR_PEREF_IN_INT_SUBSET = 21
+ XML_ERR_ENTITYREF_NO_NAME = 22
+ XML_ERR_ENTITYREF_SEMICOL_MISSING = 23
+ XML_ERR_PEREF_NO_NAME = 24
+ XML_ERR_PEREF_SEMICOL_MISSING = 25
+ XML_ERR_UNDECLARED_ENTITY = 26
+ XML_WAR_UNDECLARED_ENTITY = 27
+ XML_ERR_UNPARSED_ENTITY = 28
+ XML_ERR_ENTITY_IS_EXTERNAL = 29
+ XML_ERR_ENTITY_IS_PARAMETER = 30
+ XML_ERR_UNKNOWN_ENCODING = 31
+ XML_ERR_UNSUPPORTED_ENCODING = 32
+ XML_ERR_STRING_NOT_STARTED = 33
+ XML_ERR_STRING_NOT_CLOSED = 34
+ XML_ERR_NS_DECL_ERROR = 35
+ XML_ERR_ENTITY_NOT_STARTED = 36
+ XML_ERR_ENTITY_NOT_FINISHED = 37
+ XML_ERR_LT_IN_ATTRIBUTE = 38
+ XML_ERR_ATTRIBUTE_NOT_STARTED = 39
+ XML_ERR_ATTRIBUTE_NOT_FINISHED = 40
+ XML_ERR_ATTRIBUTE_WITHOUT_VALUE = 41
+ XML_ERR_ATTRIBUTE_REDEFINED = 42
+ XML_ERR_LITERAL_NOT_STARTED = 43
+ XML_ERR_LITERAL_NOT_FINISHED = 44
+ XML_ERR_COMMENT_NOT_FINISHED = 45
+ XML_ERR_PI_NOT_STARTED = 46
+ XML_ERR_PI_NOT_FINISHED = 47
+ XML_ERR_NOTATION_NOT_STARTED = 48
+ XML_ERR_NOTATION_NOT_FINISHED = 49
+ XML_ERR_ATTLIST_NOT_STARTED = 50
+ XML_ERR_ATTLIST_NOT_FINISHED = 51
+ XML_ERR_MIXED_NOT_STARTED = 52
+ XML_ERR_MIXED_NOT_FINISHED = 53
+ XML_ERR_ELEMCONTENT_NOT_STARTED = 54
+ XML_ERR_ELEMCONTENT_NOT_FINISHED = 55
+ XML_ERR_XMLDECL_NOT_STARTED = 56
+ XML_ERR_XMLDECL_NOT_FINISHED = 57
+ XML_ERR_CONDSEC_NOT_STARTED = 58
+ XML_ERR_CONDSEC_NOT_FINISHED = 59
+ XML_ERR_EXT_SUBSET_NOT_FINISHED = 60
+ XML_ERR_DOCTYPE_NOT_FINISHED = 61
+ XML_ERR_MISPLACED_CDATA_END = 62
+ XML_ERR_CDATA_NOT_FINISHED = 63
+ XML_ERR_RESERVED_XML_NAME = 64
+ XML_ERR_SPACE_REQUIRED = 65
+ XML_ERR_SEPARATOR_REQUIRED = 66
+ XML_ERR_NMTOKEN_REQUIRED = 67
+ XML_ERR_NAME_REQUIRED = 68
+ XML_ERR_PCDATA_REQUIRED = 69
+ XML_ERR_URI_REQUIRED = 70
+ XML_ERR_PUBID_REQUIRED = 71
+ XML_ERR_LT_REQUIRED = 72
+ XML_ERR_GT_REQUIRED = 73
+ XML_ERR_LTSLASH_REQUIRED = 74
+ XML_ERR_EQUAL_REQUIRED = 75
+ XML_ERR_TAG_NAME_MISMATCH = 76
+ XML_ERR_TAG_NOT_FINISHED = 77
+ XML_ERR_STANDALONE_VALUE = 78
+ XML_ERR_ENCODING_NAME = 79
+ XML_ERR_HYPHEN_IN_COMMENT = 80
+ XML_ERR_INVALID_ENCODING = 81
+ XML_ERR_EXT_ENTITY_STANDALONE = 82
+ XML_ERR_CONDSEC_INVALID = 83
+ XML_ERR_VALUE_REQUIRED = 84
+ XML_ERR_NOT_WELL_BALANCED = 85
+ XML_ERR_EXTRA_CONTENT = 86
+ XML_ERR_ENTITY_CHAR_ERROR = 87
+ XML_ERR_ENTITY_PE_INTERNAL = 88
+ XML_ERR_ENTITY_LOOP = 89
+ XML_ERR_ENTITY_BOUNDARY = 90
+ XML_ERR_INVALID_URI = 91
+ XML_ERR_URI_FRAGMENT = 92
+ XML_WAR_CATALOG_PI = 93
+ XML_ERR_NO_DTD = 94
+ XML_ERR_CONDSEC_INVALID_KEYWORD = 95
+ XML_ERR_VERSION_MISSING = 96
+ XML_WAR_UNKNOWN_VERSION = 97
+ XML_WAR_LANG_VALUE = 98
+ XML_WAR_NS_URI = 99
+ XML_WAR_NS_URI_RELATIVE = 100
+ XML_ERR_MISSING_ENCODING = 101
+ XML_WAR_SPACE_VALUE = 102
+ XML_ERR_NOT_STANDALONE = 103
+ XML_ERR_ENTITY_PROCESSING = 104
+ XML_ERR_NOTATION_PROCESSING = 105
+ XML_WAR_NS_COLUMN = 106
+ XML_WAR_ENTITY_REDEFINED = 107
+ XML_ERR_UNKNOWN_VERSION = 108
+ XML_ERR_VERSION_MISMATCH = 109
+ XML_ERR_NAME_TOO_LONG = 110
+ XML_ERR_USER_STOP = 111
+ XML_NS_ERR_XML_NAMESPACE = 200
+ XML_NS_ERR_UNDEFINED_NAMESPACE = 201
+ XML_NS_ERR_QNAME = 202
+ XML_NS_ERR_ATTRIBUTE_REDEFINED = 203
+ XML_NS_ERR_EMPTY = 204
+ XML_NS_ERR_COLON = 205
+ XML_DTD_ATTRIBUTE_DEFAULT = 500
+ XML_DTD_ATTRIBUTE_REDEFINED = 501
+ XML_DTD_ATTRIBUTE_VALUE = 502
+ XML_DTD_CONTENT_ERROR = 503
+ XML_DTD_CONTENT_MODEL = 504
+ XML_DTD_CONTENT_NOT_DETERMINIST = 505
+ XML_DTD_DIFFERENT_PREFIX = 506
+ XML_DTD_ELEM_DEFAULT_NAMESPACE = 507
+ XML_DTD_ELEM_NAMESPACE = 508
+ XML_DTD_ELEM_REDEFINED = 509
+ XML_DTD_EMPTY_NOTATION = 510
+ XML_DTD_ENTITY_TYPE = 511
+ XML_DTD_ID_FIXED = 512
+ XML_DTD_ID_REDEFINED = 513
+ XML_DTD_ID_SUBSET = 514
+ XML_DTD_INVALID_CHILD = 515
+ XML_DTD_INVALID_DEFAULT = 516
+ XML_DTD_LOAD_ERROR = 517
+ XML_DTD_MISSING_ATTRIBUTE = 518
+ XML_DTD_MIXED_CORRUPT = 519
+ XML_DTD_MULTIPLE_ID = 520
+ XML_DTD_NO_DOC = 521
+ XML_DTD_NO_DTD = 522
+ XML_DTD_NO_ELEM_NAME = 523
+ XML_DTD_NO_PREFIX = 524
+ XML_DTD_NO_ROOT = 525
+ XML_DTD_NOTATION_REDEFINED = 526
+ XML_DTD_NOTATION_VALUE = 527
+ XML_DTD_NOT_EMPTY = 528
+ XML_DTD_NOT_PCDATA = 529
+ XML_DTD_NOT_STANDALONE = 530
+ XML_DTD_ROOT_NAME = 531
+ XML_DTD_STANDALONE_WHITE_SPACE = 532
+ XML_DTD_UNKNOWN_ATTRIBUTE = 533
+ XML_DTD_UNKNOWN_ELEM = 534
+ XML_DTD_UNKNOWN_ENTITY = 535
+ XML_DTD_UNKNOWN_ID = 536
+ XML_DTD_UNKNOWN_NOTATION = 537
+ XML_DTD_STANDALONE_DEFAULTED = 538
+ XML_DTD_XMLID_VALUE = 539
+ XML_DTD_XMLID_TYPE = 540
+ XML_DTD_DUP_TOKEN = 541
+ XML_HTML_STRUCURE_ERROR = 800
+ XML_HTML_UNKNOWN_TAG = 801
+ XML_RNGP_ANYNAME_ATTR_ANCESTOR = 1000
+ XML_RNGP_ATTR_CONFLICT = 1001
+ XML_RNGP_ATTRIBUTE_CHILDREN = 1002
+ XML_RNGP_ATTRIBUTE_CONTENT = 1003
+ XML_RNGP_ATTRIBUTE_EMPTY = 1004
+ XML_RNGP_ATTRIBUTE_NOOP = 1005
+ XML_RNGP_CHOICE_CONTENT = 1006
+ XML_RNGP_CHOICE_EMPTY = 1007
+ XML_RNGP_CREATE_FAILURE = 1008
+ XML_RNGP_DATA_CONTENT = 1009
+ XML_RNGP_DEF_CHOICE_AND_INTERLEAVE = 1010
+ XML_RNGP_DEFINE_CREATE_FAILED = 1011
+ XML_RNGP_DEFINE_EMPTY = 1012
+ XML_RNGP_DEFINE_MISSING = 1013
+ XML_RNGP_DEFINE_NAME_MISSING = 1014
+ XML_RNGP_ELEM_CONTENT_EMPTY = 1015
+ XML_RNGP_ELEM_CONTENT_ERROR = 1016
+ XML_RNGP_ELEMENT_EMPTY = 1017
+ XML_RNGP_ELEMENT_CONTENT = 1018
+ XML_RNGP_ELEMENT_NAME = 1019
+ XML_RNGP_ELEMENT_NO_CONTENT = 1020
+ XML_RNGP_ELEM_TEXT_CONFLICT = 1021
+ XML_RNGP_EMPTY = 1022
+ XML_RNGP_EMPTY_CONSTRUCT = 1023
+ XML_RNGP_EMPTY_CONTENT = 1024
+ XML_RNGP_EMPTY_NOT_EMPTY = 1025
+ XML_RNGP_ERROR_TYPE_LIB = 1026
+ XML_RNGP_EXCEPT_EMPTY = 1027
+ XML_RNGP_EXCEPT_MISSING = 1028
+ XML_RNGP_EXCEPT_MULTIPLE = 1029
+ XML_RNGP_EXCEPT_NO_CONTENT = 1030
+ XML_RNGP_EXTERNALREF_EMTPY = 1031
+ XML_RNGP_EXTERNAL_REF_FAILURE = 1032
+ XML_RNGP_EXTERNALREF_RECURSE = 1033
+ XML_RNGP_FORBIDDEN_ATTRIBUTE = 1034
+ XML_RNGP_FOREIGN_ELEMENT = 1035
+ XML_RNGP_GRAMMAR_CONTENT = 1036
+ XML_RNGP_GRAMMAR_EMPTY = 1037
+ XML_RNGP_GRAMMAR_MISSING = 1038
+ XML_RNGP_GRAMMAR_NO_START = 1039
+ XML_RNGP_GROUP_ATTR_CONFLICT = 1040
+ XML_RNGP_HREF_ERROR = 1041
+ XML_RNGP_INCLUDE_EMPTY = 1042
+ XML_RNGP_INCLUDE_FAILURE = 1043
+ XML_RNGP_INCLUDE_RECURSE = 1044
+ XML_RNGP_INTERLEAVE_ADD = 1045
+ XML_RNGP_INTERLEAVE_CREATE_FAILED = 1046
+ XML_RNGP_INTERLEAVE_EMPTY = 1047
+ XML_RNGP_INTERLEAVE_NO_CONTENT = 1048
+ XML_RNGP_INVALID_DEFINE_NAME = 1049
+ XML_RNGP_INVALID_URI = 1050
+ XML_RNGP_INVALID_VALUE = 1051
+ XML_RNGP_MISSING_HREF = 1052
+ XML_RNGP_NAME_MISSING = 1053
+ XML_RNGP_NEED_COMBINE = 1054
+ XML_RNGP_NOTALLOWED_NOT_EMPTY = 1055
+ XML_RNGP_NSNAME_ATTR_ANCESTOR = 1056
+ XML_RNGP_NSNAME_NO_NS = 1057
+ XML_RNGP_PARAM_FORBIDDEN = 1058
+ XML_RNGP_PARAM_NAME_MISSING = 1059
+ XML_RNGP_PARENTREF_CREATE_FAILED = 1060
+ XML_RNGP_PARENTREF_NAME_INVALID = 1061
+ XML_RNGP_PARENTREF_NO_NAME = 1062
+ XML_RNGP_PARENTREF_NO_PARENT = 1063
+ XML_RNGP_PARENTREF_NOT_EMPTY = 1064
+ XML_RNGP_PARSE_ERROR = 1065
+ XML_RNGP_PAT_ANYNAME_EXCEPT_ANYNAME = 1066
+ XML_RNGP_PAT_ATTR_ATTR = 1067
+ XML_RNGP_PAT_ATTR_ELEM = 1068
+ XML_RNGP_PAT_DATA_EXCEPT_ATTR = 1069
+ XML_RNGP_PAT_DATA_EXCEPT_ELEM = 1070
+ XML_RNGP_PAT_DATA_EXCEPT_EMPTY = 1071
+ XML_RNGP_PAT_DATA_EXCEPT_GROUP = 1072
+ XML_RNGP_PAT_DATA_EXCEPT_INTERLEAVE = 1073
+ XML_RNGP_PAT_DATA_EXCEPT_LIST = 1074
+ XML_RNGP_PAT_DATA_EXCEPT_ONEMORE = 1075
+ XML_RNGP_PAT_DATA_EXCEPT_REF = 1076
+ XML_RNGP_PAT_DATA_EXCEPT_TEXT = 1077
+ XML_RNGP_PAT_LIST_ATTR = 1078
+ XML_RNGP_PAT_LIST_ELEM = 1079
+ XML_RNGP_PAT_LIST_INTERLEAVE = 1080
+ XML_RNGP_PAT_LIST_LIST = 1081
+ XML_RNGP_PAT_LIST_REF = 1082
+ XML_RNGP_PAT_LIST_TEXT = 1083
+ XML_RNGP_PAT_NSNAME_EXCEPT_ANYNAME = 1084
+ XML_RNGP_PAT_NSNAME_EXCEPT_NSNAME = 1085
+ XML_RNGP_PAT_ONEMORE_GROUP_ATTR = 1086
+ XML_RNGP_PAT_ONEMORE_INTERLEAVE_ATTR = 1087
+ XML_RNGP_PAT_START_ATTR = 1088
+ XML_RNGP_PAT_START_DATA = 1089
+ XML_RNGP_PAT_START_EMPTY = 1090
+ XML_RNGP_PAT_START_GROUP = 1091
+ XML_RNGP_PAT_START_INTERLEAVE = 1092
+ XML_RNGP_PAT_START_LIST = 1093
+ XML_RNGP_PAT_START_ONEMORE = 1094
+ XML_RNGP_PAT_START_TEXT = 1095
+ XML_RNGP_PAT_START_VALUE = 1096
+ XML_RNGP_PREFIX_UNDEFINED = 1097
+ XML_RNGP_REF_CREATE_FAILED = 1098
+ XML_RNGP_REF_CYCLE = 1099
+ XML_RNGP_REF_NAME_INVALID = 1100
+ XML_RNGP_REF_NO_DEF = 1101
+ XML_RNGP_REF_NO_NAME = 1102
+ XML_RNGP_REF_NOT_EMPTY = 1103
+ XML_RNGP_START_CHOICE_AND_INTERLEAVE = 1104
+ XML_RNGP_START_CONTENT = 1105
+ XML_RNGP_START_EMPTY = 1106
+ XML_RNGP_START_MISSING = 1107
+ XML_RNGP_TEXT_EXPECTED = 1108
+ XML_RNGP_TEXT_HAS_CHILD = 1109
+ XML_RNGP_TYPE_MISSING = 1110
+ XML_RNGP_TYPE_NOT_FOUND = 1111
+ XML_RNGP_TYPE_VALUE = 1112
+ XML_RNGP_UNKNOWN_ATTRIBUTE = 1113
+ XML_RNGP_UNKNOWN_COMBINE = 1114
+ XML_RNGP_UNKNOWN_CONSTRUCT = 1115
+ XML_RNGP_UNKNOWN_TYPE_LIB = 1116
+ XML_RNGP_URI_FRAGMENT = 1117
+ XML_RNGP_URI_NOT_ABSOLUTE = 1118
+ XML_RNGP_VALUE_EMPTY = 1119
+ XML_RNGP_VALUE_NO_CONTENT = 1120
+ XML_RNGP_XMLNS_NAME = 1121
+ XML_RNGP_XML_NS = 1122
+ XML_XPATH_EXPRESSION_OK = 1200
+ XML_XPATH_NUMBER_ERROR = 1201
+ XML_XPATH_UNFINISHED_LITERAL_ERROR = 1202
+ XML_XPATH_START_LITERAL_ERROR = 1203
+ XML_XPATH_VARIABLE_REF_ERROR = 1204
+ XML_XPATH_UNDEF_VARIABLE_ERROR = 1205
+ XML_XPATH_INVALID_PREDICATE_ERROR = 1206
+ XML_XPATH_EXPR_ERROR = 1207
+ XML_XPATH_UNCLOSED_ERROR = 1208
+ XML_XPATH_UNKNOWN_FUNC_ERROR = 1209
+ XML_XPATH_INVALID_OPERAND = 1210
+ XML_XPATH_INVALID_TYPE = 1211
+ XML_XPATH_INVALID_ARITY = 1212
+ XML_XPATH_INVALID_CTXT_SIZE = 1213
+ XML_XPATH_INVALID_CTXT_POSITION = 1214
+ XML_XPATH_MEMORY_ERROR = 1215
+ XML_XPTR_SYNTAX_ERROR = 1216
+ XML_XPTR_RESOURCE_ERROR = 1217
+ XML_XPTR_SUB_RESOURCE_ERROR = 1218
+ XML_XPATH_UNDEF_PREFIX_ERROR = 1219
+ XML_XPATH_ENCODING_ERROR = 1220
+ XML_XPATH_INVALID_CHAR_ERROR = 1221
+ XML_TREE_INVALID_HEX = 1300
+ XML_TREE_INVALID_DEC = 1301
+ XML_TREE_UNTERMINATED_ENTITY = 1302
+ XML_TREE_NOT_UTF8 = 1303
+ XML_SAVE_NOT_UTF8 = 1400
+ XML_SAVE_CHAR_INVALID = 1401
+ XML_SAVE_NO_DOCTYPE = 1402
+ XML_SAVE_UNKNOWN_ENCODING = 1403
+ XML_REGEXP_COMPILE_ERROR = 1450
+ XML_IO_UNKNOWN = 1500
+ XML_IO_EACCES = 1501
+ XML_IO_EAGAIN = 1502
+ XML_IO_EBADF = 1503
+ XML_IO_EBADMSG = 1504
+ XML_IO_EBUSY = 1505
+ XML_IO_ECANCELED = 1506
+ XML_IO_ECHILD = 1507
+ XML_IO_EDEADLK = 1508
+ XML_IO_EDOM = 1509
+ XML_IO_EEXIST = 1510
+ XML_IO_EFAULT = 1511
+ XML_IO_EFBIG = 1512
+ XML_IO_EINPROGRESS = 1513
+ XML_IO_EINTR = 1514
+ XML_IO_EINVAL = 1515
+ XML_IO_EIO = 1516
+ XML_IO_EISDIR = 1517
+ XML_IO_EMFILE = 1518
+ XML_IO_EMLINK = 1519
+ XML_IO_EMSGSIZE = 1520
+ XML_IO_ENAMETOOLONG = 1521
+ XML_IO_ENFILE = 1522
+ XML_IO_ENODEV = 1523
+ XML_IO_ENOENT = 1524
+ XML_IO_ENOEXEC = 1525
+ XML_IO_ENOLCK = 1526
+ XML_IO_ENOMEM = 1527
+ XML_IO_ENOSPC = 1528
+ XML_IO_ENOSYS = 1529
+ XML_IO_ENOTDIR = 1530
+ XML_IO_ENOTEMPTY = 1531
+ XML_IO_ENOTSUP = 1532
+ XML_IO_ENOTTY = 1533
+ XML_IO_ENXIO = 1534
+ XML_IO_EPERM = 1535
+ XML_IO_EPIPE = 1536
+ XML_IO_ERANGE = 1537
+ XML_IO_EROFS = 1538
+ XML_IO_ESPIPE = 1539
+ XML_IO_ESRCH = 1540
+ XML_IO_ETIMEDOUT = 1541
+ XML_IO_EXDEV = 1542
+ XML_IO_NETWORK_ATTEMPT = 1543
+ XML_IO_ENCODER = 1544
+ XML_IO_FLUSH = 1545
+ XML_IO_WRITE = 1546
+ XML_IO_NO_INPUT = 1547
+ XML_IO_BUFFER_FULL = 1548
+ XML_IO_LOAD_ERROR = 1549
+ XML_IO_ENOTSOCK = 1550
+ XML_IO_EISCONN = 1551
+ XML_IO_ECONNREFUSED = 1552
+ XML_IO_ENETUNREACH = 1553
+ XML_IO_EADDRINUSE = 1554
+ XML_IO_EALREADY = 1555
+ XML_IO_EAFNOSUPPORT = 1556
+ XML_XINCLUDE_RECURSION = 1600
+ XML_XINCLUDE_PARSE_VALUE = 1601
+ XML_XINCLUDE_ENTITY_DEF_MISMATCH = 1602
+ XML_XINCLUDE_NO_HREF = 1603
+ XML_XINCLUDE_NO_FALLBACK = 1604
+ XML_XINCLUDE_HREF_URI = 1605
+ XML_XINCLUDE_TEXT_FRAGMENT = 1606
+ XML_XINCLUDE_TEXT_DOCUMENT = 1607
+ XML_XINCLUDE_INVALID_CHAR = 1608
+ XML_XINCLUDE_BUILD_FAILED = 1609
+ XML_XINCLUDE_UNKNOWN_ENCODING = 1610
+ XML_XINCLUDE_MULTIPLE_ROOT = 1611
+ XML_XINCLUDE_XPTR_FAILED = 1612
+ XML_XINCLUDE_XPTR_RESULT = 1613
+ XML_XINCLUDE_INCLUDE_IN_INCLUDE = 1614
+ XML_XINCLUDE_FALLBACKS_IN_INCLUDE = 1615
+ XML_XINCLUDE_FALLBACK_NOT_IN_INCLUDE = 1616
+ XML_XINCLUDE_DEPRECATED_NS = 1617
+ XML_XINCLUDE_FRAGMENT_ID = 1618
+ XML_CATALOG_MISSING_ATTR = 1650
+ XML_CATALOG_ENTRY_BROKEN = 1651
+ XML_CATALOG_PREFER_VALUE = 1652
+ XML_CATALOG_NOT_CATALOG = 1653
+ XML_CATALOG_RECURSION = 1654
+ XML_SCHEMAP_PREFIX_UNDEFINED = 1700
+ XML_SCHEMAP_ATTRFORMDEFAULT_VALUE = 1701
+ XML_SCHEMAP_ATTRGRP_NONAME_NOREF = 1702
+ XML_SCHEMAP_ATTR_NONAME_NOREF = 1703
+ XML_SCHEMAP_COMPLEXTYPE_NONAME_NOREF = 1704
+ XML_SCHEMAP_ELEMFORMDEFAULT_VALUE = 1705
+ XML_SCHEMAP_ELEM_NONAME_NOREF = 1706
+ XML_SCHEMAP_EXTENSION_NO_BASE = 1707
+ XML_SCHEMAP_FACET_NO_VALUE = 1708
+ XML_SCHEMAP_FAILED_BUILD_IMPORT = 1709
+ XML_SCHEMAP_GROUP_NONAME_NOREF = 1710
+ XML_SCHEMAP_IMPORT_NAMESPACE_NOT_URI = 1711
+ XML_SCHEMAP_IMPORT_REDEFINE_NSNAME = 1712
+ XML_SCHEMAP_IMPORT_SCHEMA_NOT_URI = 1713
+ XML_SCHEMAP_INVALID_BOOLEAN = 1714
+ XML_SCHEMAP_INVALID_ENUM = 1715
+ XML_SCHEMAP_INVALID_FACET = 1716
+ XML_SCHEMAP_INVALID_FACET_VALUE = 1717
+ XML_SCHEMAP_INVALID_MAXOCCURS = 1718
+ XML_SCHEMAP_INVALID_MINOCCURS = 1719
+ XML_SCHEMAP_INVALID_REF_AND_SUBTYPE = 1720
+ XML_SCHEMAP_INVALID_WHITE_SPACE = 1721
+ XML_SCHEMAP_NOATTR_NOREF = 1722
+ XML_SCHEMAP_NOTATION_NO_NAME = 1723
+ XML_SCHEMAP_NOTYPE_NOREF = 1724
+ XML_SCHEMAP_REF_AND_SUBTYPE = 1725
+ XML_SCHEMAP_RESTRICTION_NONAME_NOREF = 1726
+ XML_SCHEMAP_SIMPLETYPE_NONAME = 1727
+ XML_SCHEMAP_TYPE_AND_SUBTYPE = 1728
+ XML_SCHEMAP_UNKNOWN_ALL_CHILD = 1729
+ XML_SCHEMAP_UNKNOWN_ANYATTRIBUTE_CHILD = 1730
+ XML_SCHEMAP_UNKNOWN_ATTR_CHILD = 1731
+ XML_SCHEMAP_UNKNOWN_ATTRGRP_CHILD = 1732
+ XML_SCHEMAP_UNKNOWN_ATTRIBUTE_GROUP = 1733
+ XML_SCHEMAP_UNKNOWN_BASE_TYPE = 1734
+ XML_SCHEMAP_UNKNOWN_CHOICE_CHILD = 1735
+ XML_SCHEMAP_UNKNOWN_COMPLEXCONTENT_CHILD = 1736
+ XML_SCHEMAP_UNKNOWN_COMPLEXTYPE_CHILD = 1737
+ XML_SCHEMAP_UNKNOWN_ELEM_CHILD = 1738
+ XML_SCHEMAP_UNKNOWN_EXTENSION_CHILD = 1739
+ XML_SCHEMAP_UNKNOWN_FACET_CHILD = 1740
+ XML_SCHEMAP_UNKNOWN_FACET_TYPE = 1741
+ XML_SCHEMAP_UNKNOWN_GROUP_CHILD = 1742
+ XML_SCHEMAP_UNKNOWN_IMPORT_CHILD = 1743
+ XML_SCHEMAP_UNKNOWN_LIST_CHILD = 1744
+ XML_SCHEMAP_UNKNOWN_NOTATION_CHILD = 1745
+ XML_SCHEMAP_UNKNOWN_PROCESSCONTENT_CHILD = 1746
+ XML_SCHEMAP_UNKNOWN_REF = 1747
+ XML_SCHEMAP_UNKNOWN_RESTRICTION_CHILD = 1748
+ XML_SCHEMAP_UNKNOWN_SCHEMAS_CHILD = 1749
+ XML_SCHEMAP_UNKNOWN_SEQUENCE_CHILD = 1750
+ XML_SCHEMAP_UNKNOWN_SIMPLECONTENT_CHILD = 1751
+ XML_SCHEMAP_UNKNOWN_SIMPLETYPE_CHILD = 1752
+ XML_SCHEMAP_UNKNOWN_TYPE = 1753
+ XML_SCHEMAP_UNKNOWN_UNION_CHILD = 1754
+ XML_SCHEMAP_ELEM_DEFAULT_FIXED = 1755
+ XML_SCHEMAP_REGEXP_INVALID = 1756
+ XML_SCHEMAP_FAILED_LOAD = 1757
+ XML_SCHEMAP_NOTHING_TO_PARSE = 1758
+ XML_SCHEMAP_NOROOT = 1759
+ XML_SCHEMAP_REDEFINED_GROUP = 1760
+ XML_SCHEMAP_REDEFINED_TYPE = 1761
+ XML_SCHEMAP_REDEFINED_ELEMENT = 1762
+ XML_SCHEMAP_REDEFINED_ATTRGROUP = 1763
+ XML_SCHEMAP_REDEFINED_ATTR = 1764
+ XML_SCHEMAP_REDEFINED_NOTATION = 1765
+ XML_SCHEMAP_FAILED_PARSE = 1766
+ XML_SCHEMAP_UNKNOWN_PREFIX = 1767
+ XML_SCHEMAP_DEF_AND_PREFIX = 1768
+ XML_SCHEMAP_UNKNOWN_INCLUDE_CHILD = 1769
+ XML_SCHEMAP_INCLUDE_SCHEMA_NOT_URI = 1770
+ XML_SCHEMAP_INCLUDE_SCHEMA_NO_URI = 1771
+ XML_SCHEMAP_NOT_SCHEMA = 1772
+ XML_SCHEMAP_UNKNOWN_MEMBER_TYPE = 1773
+ XML_SCHEMAP_INVALID_ATTR_USE = 1774
+ XML_SCHEMAP_RECURSIVE = 1775
+ XML_SCHEMAP_SUPERNUMEROUS_LIST_ITEM_TYPE = 1776
+ XML_SCHEMAP_INVALID_ATTR_COMBINATION = 1777
+ XML_SCHEMAP_INVALID_ATTR_INLINE_COMBINATION = 1778
+ XML_SCHEMAP_MISSING_SIMPLETYPE_CHILD = 1779
+ XML_SCHEMAP_INVALID_ATTR_NAME = 1780
+ XML_SCHEMAP_REF_AND_CONTENT = 1781
+ XML_SCHEMAP_CT_PROPS_CORRECT_1 = 1782
+ XML_SCHEMAP_CT_PROPS_CORRECT_2 = 1783
+ XML_SCHEMAP_CT_PROPS_CORRECT_3 = 1784
+ XML_SCHEMAP_CT_PROPS_CORRECT_4 = 1785
+ XML_SCHEMAP_CT_PROPS_CORRECT_5 = 1786
+ XML_SCHEMAP_DERIVATION_OK_RESTRICTION_1 = 1787
+ XML_SCHEMAP_DERIVATION_OK_RESTRICTION_2_1_1 = 1788
+ XML_SCHEMAP_DERIVATION_OK_RESTRICTION_2_1_2 = 1789
+ XML_SCHEMAP_DERIVATION_OK_RESTRICTION_2_2 = 1790
+ XML_SCHEMAP_DERIVATION_OK_RESTRICTION_3 = 1791
+ XML_SCHEMAP_WILDCARD_INVALID_NS_MEMBER = 1792
+ XML_SCHEMAP_INTERSECTION_NOT_EXPRESSIBLE = 1793
+ XML_SCHEMAP_UNION_NOT_EXPRESSIBLE = 1794
+ XML_SCHEMAP_SRC_IMPORT_3_1 = 1795
+ XML_SCHEMAP_SRC_IMPORT_3_2 = 1796
+ XML_SCHEMAP_DERIVATION_OK_RESTRICTION_4_1 = 1797
+ XML_SCHEMAP_DERIVATION_OK_RESTRICTION_4_2 = 1798
+ XML_SCHEMAP_DERIVATION_OK_RESTRICTION_4_3 = 1799
+ XML_SCHEMAP_COS_CT_EXTENDS_1_3 = 1800
+ XML_SCHEMAV_NOROOT = 1801
+ XML_SCHEMAV_UNDECLAREDELEM = 1802
+ XML_SCHEMAV_NOTTOPLEVEL = 1803
+ XML_SCHEMAV_MISSING = 1804
+ XML_SCHEMAV_WRONGELEM = 1805
+ XML_SCHEMAV_NOTYPE = 1806
+ XML_SCHEMAV_NOROLLBACK = 1807
+ XML_SCHEMAV_ISABSTRACT = 1808
+ XML_SCHEMAV_NOTEMPTY = 1809
+ XML_SCHEMAV_ELEMCONT = 1810
+ XML_SCHEMAV_HAVEDEFAULT = 1811
+ XML_SCHEMAV_NOTNILLABLE = 1812
+ XML_SCHEMAV_EXTRACONTENT = 1813
+ XML_SCHEMAV_INVALIDATTR = 1814
+ XML_SCHEMAV_INVALIDELEM = 1815
+ XML_SCHEMAV_NOTDETERMINIST = 1816
+ XML_SCHEMAV_CONSTRUCT = 1817
+ XML_SCHEMAV_INTERNAL = 1818
+ XML_SCHEMAV_NOTSIMPLE = 1819
+ XML_SCHEMAV_ATTRUNKNOWN = 1820
+ XML_SCHEMAV_ATTRINVALID = 1821
+ XML_SCHEMAV_VALUE = 1822
+ XML_SCHEMAV_FACET = 1823
+ XML_SCHEMAV_CVC_DATATYPE_VALID_1_2_1 = 1824
+ XML_SCHEMAV_CVC_DATATYPE_VALID_1_2_2 = 1825
+ XML_SCHEMAV_CVC_DATATYPE_VALID_1_2_3 = 1826
+ XML_SCHEMAV_CVC_TYPE_3_1_1 = 1827
+ XML_SCHEMAV_CVC_TYPE_3_1_2 = 1828
+ XML_SCHEMAV_CVC_FACET_VALID = 1829
+ XML_SCHEMAV_CVC_LENGTH_VALID = 1830
+ XML_SCHEMAV_CVC_MINLENGTH_VALID = 1831
+ XML_SCHEMAV_CVC_MAXLENGTH_VALID = 1832
+ XML_SCHEMAV_CVC_MININCLUSIVE_VALID = 1833
+ XML_SCHEMAV_CVC_MAXINCLUSIVE_VALID = 1834
+ XML_SCHEMAV_CVC_MINEXCLUSIVE_VALID = 1835
+ XML_SCHEMAV_CVC_MAXEXCLUSIVE_VALID = 1836
+ XML_SCHEMAV_CVC_TOTALDIGITS_VALID = 1837
+ XML_SCHEMAV_CVC_FRACTIONDIGITS_VALID = 1838
+ XML_SCHEMAV_CVC_PATTERN_VALID = 1839
+ XML_SCHEMAV_CVC_ENUMERATION_VALID = 1840
+ XML_SCHEMAV_CVC_COMPLEX_TYPE_2_1 = 1841
+ XML_SCHEMAV_CVC_COMPLEX_TYPE_2_2 = 1842
+ XML_SCHEMAV_CVC_COMPLEX_TYPE_2_3 = 1843
+ XML_SCHEMAV_CVC_COMPLEX_TYPE_2_4 = 1844
+ XML_SCHEMAV_CVC_ELT_1 = 1845
+ XML_SCHEMAV_CVC_ELT_2 = 1846
+ XML_SCHEMAV_CVC_ELT_3_1 = 1847
+ XML_SCHEMAV_CVC_ELT_3_2_1 = 1848
+ XML_SCHEMAV_CVC_ELT_3_2_2 = 1849
+ XML_SCHEMAV_CVC_ELT_4_1 = 1850
+ XML_SCHEMAV_CVC_ELT_4_2 = 1851
+ XML_SCHEMAV_CVC_ELT_4_3 = 1852
+ XML_SCHEMAV_CVC_ELT_5_1_1 = 1853
+ XML_SCHEMAV_CVC_ELT_5_1_2 = 1854
+ XML_SCHEMAV_CVC_ELT_5_2_1 = 1855
+ XML_SCHEMAV_CVC_ELT_5_2_2_1 = 1856
+ XML_SCHEMAV_CVC_ELT_5_2_2_2_1 = 1857
+ XML_SCHEMAV_CVC_ELT_5_2_2_2_2 = 1858
+ XML_SCHEMAV_CVC_ELT_6 = 1859
+ XML_SCHEMAV_CVC_ELT_7 = 1860
+ XML_SCHEMAV_CVC_ATTRIBUTE_1 = 1861
+ XML_SCHEMAV_CVC_ATTRIBUTE_2 = 1862
+ XML_SCHEMAV_CVC_ATTRIBUTE_3 = 1863
+ XML_SCHEMAV_CVC_ATTRIBUTE_4 = 1864
+ XML_SCHEMAV_CVC_COMPLEX_TYPE_3_1 = 1865
+ XML_SCHEMAV_CVC_COMPLEX_TYPE_3_2_1 = 1866
+ XML_SCHEMAV_CVC_COMPLEX_TYPE_3_2_2 = 1867
+ XML_SCHEMAV_CVC_COMPLEX_TYPE_4 = 1868
+ XML_SCHEMAV_CVC_COMPLEX_TYPE_5_1 = 1869
+ XML_SCHEMAV_CVC_COMPLEX_TYPE_5_2 = 1870
+ XML_SCHEMAV_ELEMENT_CONTENT = 1871
+ XML_SCHEMAV_DOCUMENT_ELEMENT_MISSING = 1872
+ XML_SCHEMAV_CVC_COMPLEX_TYPE_1 = 1873
+ XML_SCHEMAV_CVC_AU = 1874
+ XML_SCHEMAV_CVC_TYPE_1 = 1875
+ XML_SCHEMAV_CVC_TYPE_2 = 1876
+ XML_SCHEMAV_CVC_IDC = 1877
+ XML_SCHEMAV_CVC_WILDCARD = 1878
+ XML_SCHEMAV_MISC = 1879
+ XML_XPTR_UNKNOWN_SCHEME = 1900
+ XML_XPTR_CHILDSEQ_START = 1901
+ XML_XPTR_EVAL_FAILED = 1902
+ XML_XPTR_EXTRA_OBJECTS = 1903
+ XML_C14N_CREATE_CTXT = 1950
+ XML_C14N_REQUIRES_UTF8 = 1951
+ XML_C14N_CREATE_STACK = 1952
+ XML_C14N_INVALID_NODE = 1953
+ XML_C14N_UNKNOW_NODE = 1954
+ XML_C14N_RELATIVE_NAMESPACE = 1955
+ XML_FTP_PASV_ANSWER = 2000
+ XML_FTP_EPSV_ANSWER = 2001
+ XML_FTP_ACCNT = 2002
+ XML_FTP_URL_SYNTAX = 2003
+ XML_HTTP_URL_SYNTAX = 2020
+ XML_HTTP_USE_IP = 2021
+ XML_HTTP_UNKNOWN_HOST = 2022
+ XML_SCHEMAP_SRC_SIMPLE_TYPE_1 = 3000
+ XML_SCHEMAP_SRC_SIMPLE_TYPE_2 = 3001
+ XML_SCHEMAP_SRC_SIMPLE_TYPE_3 = 3002
+ XML_SCHEMAP_SRC_SIMPLE_TYPE_4 = 3003
+ XML_SCHEMAP_SRC_RESOLVE = 3004
+ XML_SCHEMAP_SRC_RESTRICTION_BASE_OR_SIMPLETYPE = 3005
+ XML_SCHEMAP_SRC_LIST_ITEMTYPE_OR_SIMPLETYPE = 3006
+ XML_SCHEMAP_SRC_UNION_MEMBERTYPES_OR_SIMPLETYPES = 3007
+ XML_SCHEMAP_ST_PROPS_CORRECT_1 = 3008
+ XML_SCHEMAP_ST_PROPS_CORRECT_2 = 3009
+ XML_SCHEMAP_ST_PROPS_CORRECT_3 = 3010
+ XML_SCHEMAP_COS_ST_RESTRICTS_1_1 = 3011
+ XML_SCHEMAP_COS_ST_RESTRICTS_1_2 = 3012
+ XML_SCHEMAP_COS_ST_RESTRICTS_1_3_1 = 3013
+ XML_SCHEMAP_COS_ST_RESTRICTS_1_3_2 = 3014
+ XML_SCHEMAP_COS_ST_RESTRICTS_2_1 = 3015
+ XML_SCHEMAP_COS_ST_RESTRICTS_2_3_1_1 = 3016
+ XML_SCHEMAP_COS_ST_RESTRICTS_2_3_1_2 = 3017
+ XML_SCHEMAP_COS_ST_RESTRICTS_2_3_2_1 = 3018
+ XML_SCHEMAP_COS_ST_RESTRICTS_2_3_2_2 = 3019
+ XML_SCHEMAP_COS_ST_RESTRICTS_2_3_2_3 = 3020
+ XML_SCHEMAP_COS_ST_RESTRICTS_2_3_2_4 = 3021
+ XML_SCHEMAP_COS_ST_RESTRICTS_2_3_2_5 = 3022
+ XML_SCHEMAP_COS_ST_RESTRICTS_3_1 = 3023
+ XML_SCHEMAP_COS_ST_RESTRICTS_3_3_1 = 3024
+ XML_SCHEMAP_COS_ST_RESTRICTS_3_3_1_2 = 3025
+ XML_SCHEMAP_COS_ST_RESTRICTS_3_3_2_2 = 3026
+ XML_SCHEMAP_COS_ST_RESTRICTS_3_3_2_1 = 3027
+ XML_SCHEMAP_COS_ST_RESTRICTS_3_3_2_3 = 3028
+ XML_SCHEMAP_COS_ST_RESTRICTS_3_3_2_4 = 3029
+ XML_SCHEMAP_COS_ST_RESTRICTS_3_3_2_5 = 3030
+ XML_SCHEMAP_COS_ST_DERIVED_OK_2_1 = 3031
+ XML_SCHEMAP_COS_ST_DERIVED_OK_2_2 = 3032
+ XML_SCHEMAP_S4S_ELEM_NOT_ALLOWED = 3033
+ XML_SCHEMAP_S4S_ELEM_MISSING = 3034
+ XML_SCHEMAP_S4S_ATTR_NOT_ALLOWED = 3035
+ XML_SCHEMAP_S4S_ATTR_MISSING = 3036
+ XML_SCHEMAP_S4S_ATTR_INVALID_VALUE = 3037
+ XML_SCHEMAP_SRC_ELEMENT_1 = 3038
+ XML_SCHEMAP_SRC_ELEMENT_2_1 = 3039
+ XML_SCHEMAP_SRC_ELEMENT_2_2 = 3040
+ XML_SCHEMAP_SRC_ELEMENT_3 = 3041
+ XML_SCHEMAP_P_PROPS_CORRECT_1 = 3042
+ XML_SCHEMAP_P_PROPS_CORRECT_2_1 = 3043
+ XML_SCHEMAP_P_PROPS_CORRECT_2_2 = 3044
+ XML_SCHEMAP_E_PROPS_CORRECT_2 = 3045
+ XML_SCHEMAP_E_PROPS_CORRECT_3 = 3046
+ XML_SCHEMAP_E_PROPS_CORRECT_4 = 3047
+ XML_SCHEMAP_E_PROPS_CORRECT_5 = 3048
+ XML_SCHEMAP_E_PROPS_CORRECT_6 = 3049
+ XML_SCHEMAP_SRC_INCLUDE = 3050
+ XML_SCHEMAP_SRC_ATTRIBUTE_1 = 3051
+ XML_SCHEMAP_SRC_ATTRIBUTE_2 = 3052
+ XML_SCHEMAP_SRC_ATTRIBUTE_3_1 = 3053
+ XML_SCHEMAP_SRC_ATTRIBUTE_3_2 = 3054
+ XML_SCHEMAP_SRC_ATTRIBUTE_4 = 3055
+ XML_SCHEMAP_NO_XMLNS = 3056
+ XML_SCHEMAP_NO_XSI = 3057
+ XML_SCHEMAP_COS_VALID_DEFAULT_1 = 3058
+ XML_SCHEMAP_COS_VALID_DEFAULT_2_1 = 3059
+ XML_SCHEMAP_COS_VALID_DEFAULT_2_2_1 = 3060
+ XML_SCHEMAP_COS_VALID_DEFAULT_2_2_2 = 3061
+ XML_SCHEMAP_CVC_SIMPLE_TYPE = 3062
+ XML_SCHEMAP_COS_CT_EXTENDS_1_1 = 3063
+ XML_SCHEMAP_SRC_IMPORT_1_1 = 3064
+ XML_SCHEMAP_SRC_IMPORT_1_2 = 3065
+ XML_SCHEMAP_SRC_IMPORT_2 = 3066
+ XML_SCHEMAP_SRC_IMPORT_2_1 = 3067
+ XML_SCHEMAP_SRC_IMPORT_2_2 = 3068
+ XML_SCHEMAP_INTERNAL = 3069 # 3069 non-W3C
+ XML_SCHEMAP_NOT_DETERMINISTIC = 3070 # 3070 non-W3C
+ XML_SCHEMAP_SRC_ATTRIBUTE_GROUP_1 = 3071
+ XML_SCHEMAP_SRC_ATTRIBUTE_GROUP_2 = 3072
+ XML_SCHEMAP_SRC_ATTRIBUTE_GROUP_3 = 3073
+ XML_SCHEMAP_MG_PROPS_CORRECT_1 = 3074
+ XML_SCHEMAP_MG_PROPS_CORRECT_2 = 3075
+ XML_SCHEMAP_SRC_CT_1 = 3076
+ XML_SCHEMAP_DERIVATION_OK_RESTRICTION_2_1_3 = 3077
+ XML_SCHEMAP_AU_PROPS_CORRECT_2 = 3078
+ XML_SCHEMAP_A_PROPS_CORRECT_2 = 3079
+ XML_SCHEMAP_C_PROPS_CORRECT = 3080
+ XML_SCHEMAP_SRC_REDEFINE = 3081
+ XML_SCHEMAP_SRC_IMPORT = 3082
+ XML_SCHEMAP_WARN_SKIP_SCHEMA = 3083
+ XML_SCHEMAP_WARN_UNLOCATED_SCHEMA = 3084
+ XML_SCHEMAP_WARN_ATTR_REDECL_PROH = 3085
+ XML_SCHEMAP_WARN_ATTR_POINTLESS_PROH = 3086 # 3085
+ XML_SCHEMAP_AG_PROPS_CORRECT = 3087 # 3086
+ XML_SCHEMAP_COS_CT_EXTENDS_1_2 = 3088 # 3087
+ XML_SCHEMAP_AU_PROPS_CORRECT = 3089 # 3088
+ XML_SCHEMAP_A_PROPS_CORRECT_3 = 3090 # 3089
+ XML_SCHEMAP_COS_ALL_LIMITED = 3091 # 3090
+ XML_SCHEMATRONV_ASSERT = 4000
+ XML_SCHEMATRONV_REPORT = 4001
+ XML_MODULE_OPEN = 4900
+ XML_MODULE_CLOSE = 4901
+ XML_CHECK_FOUND_ELEMENT = 5000
+ XML_CHECK_FOUND_ATTRIBUTE = 5001
+ XML_CHECK_FOUND_TEXT = 5002
+ XML_CHECK_FOUND_CDATA = 5003
+ XML_CHECK_FOUND_ENTITYREF = 5004
+ XML_CHECK_FOUND_ENTITY = 5005
+ XML_CHECK_FOUND_PI = 5006
+ XML_CHECK_FOUND_COMMENT = 5007
+ XML_CHECK_FOUND_DOCTYPE = 5008
+ XML_CHECK_FOUND_FRAGMENT = 5009
+ XML_CHECK_FOUND_NOTATION = 5010
+ XML_CHECK_UNKNOWN_NODE = 5011
+ XML_CHECK_ENTITY_TYPE = 5012
+ XML_CHECK_NO_PARENT = 5013
+ XML_CHECK_NO_DOC = 5014
+ XML_CHECK_NO_NAME = 5015
+ XML_CHECK_NO_ELEM = 5016
+ XML_CHECK_WRONG_DOC = 5017
+ XML_CHECK_NO_PREV = 5018
+ XML_CHECK_WRONG_PREV = 5019
+ XML_CHECK_NO_NEXT = 5020
+ XML_CHECK_WRONG_NEXT = 5021
+ XML_CHECK_NOT_DTD = 5022
+ XML_CHECK_NOT_ATTR = 5023
+ XML_CHECK_NOT_ATTR_DECL = 5024
+ XML_CHECK_NOT_ELEM_DECL = 5025
+ XML_CHECK_NOT_ENTITY_DECL = 5026
+ XML_CHECK_NOT_NS_DECL = 5027
+ XML_CHECK_NO_HREF = 5028
+ XML_CHECK_WRONG_PARENT = 5029
+ XML_CHECK_NS_SCOPE = 5030
+ XML_CHECK_NS_ANCESTOR = 5031
+ XML_CHECK_NOT_UTF8 = 5032
+ XML_CHECK_NO_DICT = 5033
+ XML_CHECK_NOT_NCNAME = 5034
+ XML_CHECK_OUTSIDE_DICT = 5035
+ XML_CHECK_WRONG_NAME = 5036
+ XML_CHECK_NAME_NOT_NULL = 5037
+ XML_I18N_NO_NAME = 6000
+ XML_I18N_NO_HANDLER = 6001
+ XML_I18N_EXCESS_HANDLER = 6002
+ XML_I18N_CONV_FAILED = 6003
+ XML_I18N_NO_OUTPUT = 6004
+ XML_BUF_OVERFLOW = 7000
+
+ ctypedef enum xmlRelaxNGValidErr:
+ XML_RELAXNG_OK = 0
+ XML_RELAXNG_ERR_MEMORY = 1
+ XML_RELAXNG_ERR_TYPE = 2
+ XML_RELAXNG_ERR_TYPEVAL = 3
+ XML_RELAXNG_ERR_DUPID = 4
+ XML_RELAXNG_ERR_TYPECMP = 5
+ XML_RELAXNG_ERR_NOSTATE = 6
+ XML_RELAXNG_ERR_NODEFINE = 7
+ XML_RELAXNG_ERR_LISTEXTRA = 8
+ XML_RELAXNG_ERR_LISTEMPTY = 9
+ XML_RELAXNG_ERR_INTERNODATA = 10
+ XML_RELAXNG_ERR_INTERSEQ = 11
+ XML_RELAXNG_ERR_INTEREXTRA = 12
+ XML_RELAXNG_ERR_ELEMNAME = 13
+ XML_RELAXNG_ERR_ATTRNAME = 14
+ XML_RELAXNG_ERR_ELEMNONS = 15
+ XML_RELAXNG_ERR_ATTRNONS = 16
+ XML_RELAXNG_ERR_ELEMWRONGNS = 17
+ XML_RELAXNG_ERR_ATTRWRONGNS = 18
+ XML_RELAXNG_ERR_ELEMEXTRANS = 19
+ XML_RELAXNG_ERR_ATTREXTRANS = 20
+ XML_RELAXNG_ERR_ELEMNOTEMPTY = 21
+ XML_RELAXNG_ERR_NOELEM = 22
+ XML_RELAXNG_ERR_NOTELEM = 23
+ XML_RELAXNG_ERR_ATTRVALID = 24
+ XML_RELAXNG_ERR_CONTENTVALID = 25
+ XML_RELAXNG_ERR_EXTRACONTENT = 26
+ XML_RELAXNG_ERR_INVALIDATTR = 27
+ XML_RELAXNG_ERR_DATAELEM = 28
+ XML_RELAXNG_ERR_VALELEM = 29
+ XML_RELAXNG_ERR_LISTELEM = 30
+ XML_RELAXNG_ERR_DATATYPE = 31
+ XML_RELAXNG_ERR_VALUE = 32
+ XML_RELAXNG_ERR_LIST = 33
+ XML_RELAXNG_ERR_NOGRAMMAR = 34
+ XML_RELAXNG_ERR_EXTRADATA = 35
+ XML_RELAXNG_ERR_LACKDATA = 36
+ XML_RELAXNG_ERR_INTERNAL = 37
+ XML_RELAXNG_ERR_ELEMWRONG = 38
+ XML_RELAXNG_ERR_TEXTWRONG = 39
+# --- END: GENERATED CONSTANTS ---
+
+cdef extern from "libxml/xmlerror.h":
+ ctypedef struct xmlError:
+ int domain
+ int code
+ char* message
+ xmlErrorLevel level
+ char* file
+ char* str1
+ char* str2
+ char* str3
+ int line
+ int int1
+ int int2
+ void* node
+
+ ctypedef void (*xmlGenericErrorFunc)(void* ctxt, char* msg, ...) nogil
+ ctypedef void (*xmlStructuredErrorFunc)(void* userData,
+ xmlError* error) nogil
+
+ cdef void xmlSetGenericErrorFunc(
+ void* ctxt, xmlGenericErrorFunc func) nogil
+ cdef void xmlSetStructuredErrorFunc(
+ void* ctxt, xmlStructuredErrorFunc func) nogil
+
+cdef extern from "libxml/globals.h":
+ cdef xmlStructuredErrorFunc xmlStructuredError
+ cdef void* xmlStructuredErrorContext
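A hedged sketch, not part of the commit: a callback matching xmlStructuredErrorFunc can be installed globally with xmlSetStructuredErrorFunc; lxml's own error collection layer is considerably more involved than this.

    from lxml.includes cimport xmlerror

    cdef void _on_error(void* userData, xmlerror.xmlError* error) nogil:
        # minimal structured handler: a real one would record
        # error.message, error.line and error.code somewhere thread-safe
        if error.level == xmlerror.XML_ERR_FATAL:
            pass  # e.g. flag the failure for the calling code

    cdef void install_error_handler():
        # NULL context means the handler is called with userData == NULL
        xmlerror.xmlSetStructuredErrorFunc(NULL, _on_error)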
diff --git a/src/lxml/includes/xmlparser.pxd b/src/lxml/includes/xmlparser.pxd
new file mode 100644
index 0000000..a196e34
--- /dev/null
+++ b/src/lxml/includes/xmlparser.pxd
@@ -0,0 +1,249 @@
+from libc.string cimport const_char
+
+from lxml.includes.tree cimport (
+ xmlDoc, xmlNode, xmlDict, xmlDtd, xmlChar, const_xmlChar)
+from lxml.includes.tree cimport xmlInputReadCallback, xmlInputCloseCallback
+from lxml.includes.xmlerror cimport xmlError, xmlStructuredErrorFunc
+
+
+cdef extern from "libxml/parser.h":
+ ctypedef void (*startElementNsSAX2Func)(void* ctx,
+ const_xmlChar* localname,
+ const_xmlChar* prefix,
+ const_xmlChar* URI,
+ int nb_namespaces,
+ const_xmlChar** namespaces,
+ int nb_attributes,
+ int nb_defaulted,
+ const_xmlChar** attributes)
+
+ ctypedef void (*endElementNsSAX2Func)(void* ctx,
+ const_xmlChar* localname,
+ const_xmlChar* prefix,
+ const_xmlChar* URI)
+
+ ctypedef void (*startElementSAXFunc)(void* ctx, const_xmlChar* name, const_xmlChar** atts)
+
+ ctypedef void (*endElementSAXFunc)(void* ctx, const_xmlChar* name)
+
+ ctypedef void (*charactersSAXFunc)(void* ctx, const_xmlChar* ch, int len)
+
+ ctypedef void (*cdataBlockSAXFunc)(void* ctx, const_xmlChar* value, int len)
+
+ ctypedef void (*commentSAXFunc)(void* ctx, const_xmlChar* value)
+
+ ctypedef void (*processingInstructionSAXFunc)(void* ctx,
+ const_xmlChar* target,
+ const_xmlChar* data)
+
+ ctypedef void (*internalSubsetSAXFunc)(void* ctx,
+ const_xmlChar* name,
+ const_xmlChar* externalID,
+ const_xmlChar* systemID)
+
+ ctypedef void (*endDocumentSAXFunc)(void* ctx)
+
+ ctypedef void (*startDocumentSAXFunc)(void* ctx)
+
+ ctypedef void (*referenceSAXFunc)(void * ctx, const_xmlChar* name)
+
+ cdef int XML_SAX2_MAGIC
+
+cdef extern from "libxml/tree.h":
+ ctypedef struct xmlParserInput:
+ int line
+ int length
+ const_xmlChar* base
+ const_xmlChar* cur
+ const_xmlChar* end
+ const_char *filename
+
+ ctypedef struct xmlParserInputBuffer:
+ void* context
+ xmlInputReadCallback readcallback
+ xmlInputCloseCallback closecallback
+
+ ctypedef struct xmlSAXHandlerV1:
+ # same as xmlSAXHandler, but without namespaces
+ pass
+
+ ctypedef struct xmlSAXHandler:
+ internalSubsetSAXFunc internalSubset
+ startElementNsSAX2Func startElementNs
+ endElementNsSAX2Func endElementNs
+ startElementSAXFunc startElement
+ endElementSAXFunc endElement
+ charactersSAXFunc characters
+ cdataBlockSAXFunc cdataBlock
+ referenceSAXFunc reference
+ commentSAXFunc comment
+ processingInstructionSAXFunc processingInstruction
+ startDocumentSAXFunc startDocument
+ endDocumentSAXFunc endDocument
+ int initialized
+ xmlStructuredErrorFunc serror
+ void* _private
+
+
+cdef extern from "libxml/SAX2.h" nogil:
+ cdef void xmlSAX2StartDocument(void* ctxt)
+
+
+cdef extern from "libxml/xmlIO.h" nogil:
+ cdef xmlParserInputBuffer* xmlAllocParserInputBuffer(int enc)
+
+
+cdef extern from "libxml/parser.h":
+
+ cdef xmlDict* xmlDictCreate() nogil
+ cdef xmlDict* xmlDictCreateSub(xmlDict* subdict) nogil
+ cdef void xmlDictFree(xmlDict* sub) nogil
+ cdef int xmlDictReference(xmlDict* dict) nogil
+
+ cdef int XML_COMPLETE_ATTRS # SAX option for adding DTD default attributes
+ cdef int XML_SKIP_IDS # SAX option for not building an XML ID dict
+
+ ctypedef enum xmlParserInputState:
+ XML_PARSER_EOF = -1 # nothing is to be parsed
+ XML_PARSER_START = 0 # nothing has been parsed
+ XML_PARSER_MISC = 1 # Misc* before int subset
+ XML_PARSER_PI = 2 # Within a processing instruction
+ XML_PARSER_DTD = 3 # within some DTD content
+ XML_PARSER_PROLOG = 4 # Misc* after internal subset
+ XML_PARSER_COMMENT = 5 # within a comment
+ XML_PARSER_START_TAG = 6 # within a start tag
+ XML_PARSER_CONTENT = 7 # within the content
+ XML_PARSER_CDATA_SECTION = 8 # within a CDATA section
+ XML_PARSER_END_TAG = 9 # within a closing tag
+ XML_PARSER_ENTITY_DECL = 10 # within an entity declaration
+ XML_PARSER_ENTITY_VALUE = 11 # within an entity value in a decl
+ XML_PARSER_ATTRIBUTE_VALUE = 12 # within an attribute value
+ XML_PARSER_SYSTEM_LITERAL = 13 # within a SYSTEM value
+ XML_PARSER_EPILOG = 14 # the Misc* after the last end tag
+ XML_PARSER_IGNORE = 15 # within an IGNORED section
+ XML_PARSER_PUBLIC_LITERAL = 16 # within a PUBLIC value
+
+
+ ctypedef struct xmlParserCtxt:
+ xmlDoc* myDoc
+ xmlDict* dict
+ int dictNames
+ void* _private
+ bint wellFormed
+ bint recovery
+ int options
+ bint disableSAX
+ int errNo
+ xmlParserInputState instate
+ bint replaceEntities
+ int loadsubset # != 0 if enabled, int value == why
+ bint validate
+ xmlError lastError
+ xmlNode* node
+ xmlSAXHandler* sax
+ void* userData
+ int* spaceTab
+ int spaceMax
+ bint html
+ bint progressive
+ int inSubset
+ int charset
+ xmlParserInput* input
+
+ ctypedef enum xmlParserOption:
+ XML_PARSE_RECOVER = 1 # recover on errors
+ XML_PARSE_NOENT = 2 # substitute entities
+ XML_PARSE_DTDLOAD = 4 # load the external subset
+ XML_PARSE_DTDATTR = 8 # default DTD attributes
+ XML_PARSE_DTDVALID = 16 # validate with the DTD
+ XML_PARSE_NOERROR = 32 # suppress error reports
+ XML_PARSE_NOWARNING = 64 # suppress warning reports
+ XML_PARSE_PEDANTIC = 128 # pedantic error reporting
+ XML_PARSE_NOBLANKS = 256 # remove blank nodes
+ XML_PARSE_SAX1 = 512 # use the SAX1 interface internally
+ XML_PARSE_XINCLUDE = 1024 # Implement XInclude substitution
+ XML_PARSE_NONET = 2048 # Forbid network access
+ XML_PARSE_NODICT = 4096 # Do not reuse the context dictionary
+ XML_PARSE_NSCLEAN = 8192 # remove redundant namespaces declarations
+ XML_PARSE_NOCDATA = 16384 # merge CDATA as text nodes
+ XML_PARSE_NOXINCNODE = 32768 # do not generate XINCLUDE START/END nodes
+ # libxml2 2.6.21+ only:
+ XML_PARSE_COMPACT = 65536 # compact small text nodes
+ # libxml2 2.7.0+ only:
+ XML_PARSE_OLD10 = 131072 # parse using XML-1.0 before update 5
+ XML_PARSE_NOBASEFIX = 262144 # do not fixup XINCLUDE xml:base uris
+ XML_PARSE_HUGE = 524288 # relax any hardcoded limit from the parser
+ # libxml2 2.7.3+ only:
+ XML_PARSE_OLDSAX = 1048576 # parse using SAX2 interface before 2.7.0
+ # libxml2 2.8.0+ only:
+ XML_PARSE_IGNORE_ENC = 2097152 # ignore internal document encoding hint
+ # libxml2 2.9.0+ only:
+ XML_PARSE_BIG_LINES = 4194304 # Store big line numbers in the text PSVI field
+
+ cdef void xmlInitParser() nogil
+ cdef void xmlCleanupParser() nogil
+
+ cdef int xmlLineNumbersDefault(int onoff) nogil
+ cdef xmlParserCtxt* xmlNewParserCtxt() nogil
+ cdef xmlParserInput* xmlNewIOInputStream(xmlParserCtxt* ctxt,
+ xmlParserInputBuffer* input,
+ int enc) nogil
+ cdef int xmlCtxtUseOptions(xmlParserCtxt* ctxt, int options) nogil
+ cdef void xmlFreeParserCtxt(xmlParserCtxt* ctxt) nogil
+ cdef void xmlCtxtReset(xmlParserCtxt* ctxt) nogil
+ cdef void xmlClearParserCtxt(xmlParserCtxt* ctxt) nogil
+ cdef int xmlParseChunk(xmlParserCtxt* ctxt,
+ char* chunk, int size, int terminate) nogil
+ cdef xmlDoc* xmlCtxtReadDoc(xmlParserCtxt* ctxt,
+ char* cur, char* URL, char* encoding,
+ int options) nogil
+ cdef xmlDoc* xmlCtxtReadFile(xmlParserCtxt* ctxt,
+ char* filename, char* encoding,
+ int options) nogil
+ cdef xmlDoc* xmlCtxtReadIO(xmlParserCtxt* ctxt,
+ xmlInputReadCallback ioread,
+ xmlInputCloseCallback ioclose,
+ void* ioctx,
+ char* URL, char* encoding,
+ int options) nogil
+ cdef xmlDoc* xmlCtxtReadMemory(xmlParserCtxt* ctxt,
+ char* buffer, int size,
+ char* filename, const_char* encoding,
+ int options) nogil
+
+# iterparse:
+
+ cdef xmlParserCtxt* xmlCreatePushParserCtxt(xmlSAXHandler* sax,
+ void* user_data,
+ char* chunk,
+ int size,
+ char* filename) nogil
+
+ cdef int xmlCtxtResetPush(xmlParserCtxt* ctxt,
+ char* chunk,
+ int size,
+ char* filename,
+ char* encoding) nogil
+
+# entity loaders:
+
+ ctypedef xmlParserInput* (*xmlExternalEntityLoader)(
+ const_char * URL, const_char * ID, xmlParserCtxt* context) nogil
+ cdef xmlExternalEntityLoader xmlGetExternalEntityLoader() nogil
+ cdef void xmlSetExternalEntityLoader(xmlExternalEntityLoader f) nogil
+
+# DTDs:
+
+ cdef xmlDtd* xmlParseDTD(const_xmlChar* ExternalID, const_xmlChar* SystemID) nogil
+ cdef xmlDtd* xmlIOParseDTD(xmlSAXHandler* sax,
+ xmlParserInputBuffer* input,
+ int enc) nogil
+
+cdef extern from "libxml/parserInternals.h":
+ cdef xmlParserInput* xmlNewInputStream(xmlParserCtxt* ctxt)
+ cdef xmlParserInput* xmlNewStringInputStream(xmlParserCtxt* ctxt,
+ char* buffer) nogil
+ cdef xmlParserInput* xmlNewInputFromFile(xmlParserCtxt* ctxt,
+ char* filename) nogil
+ cdef void xmlFreeInputStream(xmlParserInput* input) nogil
+ cdef int xmlSwitchEncoding(xmlParserCtxt* ctxt, int enc) nogil
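Purely illustrative and not part of this change: the push-parser declarations above support incremental feeding along these lines, using the built-in SAX2 tree builder together with a couple of the parser options.

    from lxml.includes cimport tree, xmlparser

    def parse_chunks(chunks):
        # push-parse an iterable of byte strings; returns True if well-formed
        cdef xmlparser.xmlParserCtxt* ctxt = xmlparser.xmlCreatePushParserCtxt(
            NULL, NULL, NULL, 0, NULL)       # default SAX handler, no initial chunk
        if ctxt is NULL:
            raise MemoryError()
        xmlparser.xmlCtxtUseOptions(
            ctxt, xmlparser.XML_PARSE_NONET | xmlparser.XML_PARSE_NOBLANKS)
        cdef bytes chunk
        cdef bint ok
        try:
            for chunk in chunks:
                xmlparser.xmlParseChunk(ctxt, chunk, len(chunk), 0)
            xmlparser.xmlParseChunk(ctxt, NULL, 0, 1)   # terminate=1 flushes the parser
            ok = ctxt.wellFormed
        finally:
            if ctxt.myDoc is not NULL:
                tree.xmlFreeDoc(ctxt.myDoc)             # drop the tree the parser built
            xmlparser.xmlFreeParserCtxt(ctxt)
        return ok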
diff --git a/src/lxml/includes/xmlschema.pxd b/src/lxml/includes/xmlschema.pxd
new file mode 100644
index 0000000..8e93cc5
--- /dev/null
+++ b/src/lxml/includes/xmlschema.pxd
@@ -0,0 +1,35 @@
+from lxml.includes.tree cimport xmlDoc
+from lxml.includes.xmlparser cimport xmlSAXHandler
+from lxml.includes.xmlerror cimport xmlStructuredErrorFunc
+
+cdef extern from "libxml/xmlschemas.h":
+ ctypedef struct xmlSchema
+ ctypedef struct xmlSchemaParserCtxt
+
+ ctypedef struct xmlSchemaSAXPlugStruct
+ ctypedef struct xmlSchemaValidCtxt
+
+ ctypedef enum xmlSchemaValidOption:
+ XML_SCHEMA_VAL_VC_I_CREATE = 1
+
+ cdef xmlSchemaValidCtxt* xmlSchemaNewValidCtxt(xmlSchema* schema) nogil
+ cdef void xmlSchemaSetParserStructuredErrors(xmlSchemaParserCtxt* ctxt,
+ xmlStructuredErrorFunc serror, void *ctx)
+ cdef void xmlSchemaSetValidStructuredErrors(xmlSchemaValidCtxt* ctxt,
+ xmlStructuredErrorFunc serror, void *ctx)
+
+ cdef int xmlSchemaValidateDoc(xmlSchemaValidCtxt* ctxt, xmlDoc* doc) nogil
+ cdef xmlSchema* xmlSchemaParse(xmlSchemaParserCtxt* ctxt) nogil
+ cdef xmlSchemaParserCtxt* xmlSchemaNewParserCtxt(char* URL) nogil
+ cdef xmlSchemaParserCtxt* xmlSchemaNewDocParserCtxt(xmlDoc* doc) nogil
+ cdef void xmlSchemaFree(xmlSchema* schema) nogil
+ cdef void xmlSchemaFreeParserCtxt(xmlSchemaParserCtxt* ctxt) nogil
+ cdef void xmlSchemaFreeValidCtxt(xmlSchemaValidCtxt* ctxt) nogil
+ cdef int xmlSchemaSetValidOptions(xmlSchemaValidCtxt* ctxt,
+ int options) nogil
+
+ cdef xmlSchemaSAXPlugStruct* xmlSchemaSAXPlug(xmlSchemaValidCtxt* ctxt,
+ xmlSAXHandler** sax,
+ void** data) nogil
+ cdef int xmlSchemaSAXUnplug(xmlSchemaSAXPlugStruct* sax_plug)
+ cdef int xmlSchemaIsValid(xmlSchemaValidCtxt* ctxt)
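A sketch of the validation flow these declarations enable (not part of the diff), for two documents that have already been parsed into xmlDoc pointers.

    from lxml.includes cimport tree, xmlschema

    cdef int validate(tree.xmlDoc* schema_doc, tree.xmlDoc* doc) except -2:
        # returns 0 if valid, a positive libxml2 error code if not, -1 on internal error
        cdef xmlschema.xmlSchemaParserCtxt* pctxt = xmlschema.xmlSchemaNewDocParserCtxt(schema_doc)
        if pctxt is NULL:
            raise MemoryError()
        cdef xmlschema.xmlSchema* schema = xmlschema.xmlSchemaParse(pctxt)
        xmlschema.xmlSchemaFreeParserCtxt(pctxt)
        if schema is NULL:
            raise ValueError("schema document could not be compiled")
        cdef xmlschema.xmlSchemaValidCtxt* vctxt = xmlschema.xmlSchemaNewValidCtxt(schema)
        if vctxt is NULL:
            xmlschema.xmlSchemaFree(schema)
            raise MemoryError()
        cdef int result = xmlschema.xmlSchemaValidateDoc(vctxt, doc)
        xmlschema.xmlSchemaFreeValidCtxt(vctxt)
        xmlschema.xmlSchemaFree(schema)
        return result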
diff --git a/src/lxml/includes/xpath.pxd b/src/lxml/includes/xpath.pxd
new file mode 100644
index 0000000..d01735b
--- /dev/null
+++ b/src/lxml/includes/xpath.pxd
@@ -0,0 +1,135 @@
+from lxml.includes cimport tree
+from lxml.includes cimport xmlerror
+
+from libc.string cimport const_char
+from lxml.includes.tree cimport xmlChar, const_xmlChar
+
+cdef extern from "libxml/xpath.h":
+ ctypedef enum xmlXPathObjectType:
+ XPATH_UNDEFINED = 0
+ XPATH_NODESET = 1
+ XPATH_BOOLEAN = 2
+ XPATH_NUMBER = 3
+ XPATH_STRING = 4
+ XPATH_POINT = 5
+ XPATH_RANGE = 6
+ XPATH_LOCATIONSET = 7
+ XPATH_USERS = 8
+ XPATH_XSLT_TREE = 9
+
+ ctypedef enum xmlXPathError:
+ XPATH_EXPRESSION_OK = 0
+ XPATH_NUMBER_ERROR = 1
+ XPATH_UNFINISHED_LITERAL_ERROR = 2
+ XPATH_START_LITERAL_ERROR = 3
+ XPATH_VARIABLE_REF_ERROR = 4
+ XPATH_UNDEF_VARIABLE_ERROR = 5
+ XPATH_INVALID_PREDICATE_ERROR = 6
+ XPATH_EXPR_ERROR = 7
+ XPATH_UNCLOSED_ERROR = 8
+ XPATH_UNKNOWN_FUNC_ERROR = 9
+ XPATH_INVALID_OPERAND = 10
+ XPATH_INVALID_TYPE = 11
+ XPATH_INVALID_ARITY = 12
+ XPATH_INVALID_CTXT_SIZE = 13
+ XPATH_INVALID_CTXT_POSITION = 14
+ XPATH_MEMORY_ERROR = 15
+ XPTR_SYNTAX_ERROR = 16
+ XPTR_RESOURCE_ERROR = 17
+ XPTR_SUB_RESOURCE_ERROR = 18
+ XPATH_UNDEF_PREFIX_ERROR = 19
+ XPATH_ENCODING_ERROR = 20
+ XPATH_INVALID_CHAR_ERROR = 21
+ XPATH_INVALID_CTXT = 22
+
+ ctypedef struct xmlNodeSet:
+ int nodeNr
+ int nodeMax
+ tree.xmlNode** nodeTab
+
+ ctypedef struct xmlXPathObject:
+ xmlXPathObjectType type
+ xmlNodeSet* nodesetval
+ bint boolval
+ double floatval
+ xmlChar* stringval
+
+ ctypedef struct xmlXPathContext:
+ tree.xmlDoc* doc
+ tree.xmlNode* node
+ tree.xmlDict* dict
+ tree.xmlHashTable* nsHash
+ const_xmlChar* function
+ const_xmlChar* functionURI
+ xmlerror.xmlStructuredErrorFunc error
+ xmlerror.xmlError lastError
+ void* userData
+
+ ctypedef struct xmlXPathParserContext:
+ xmlXPathContext* context
+ xmlXPathObject* value
+ tree.xmlNode* ancestor
+ int error
+
+ ctypedef struct xmlXPathCompExpr
+
+ ctypedef void (*xmlXPathFunction)(xmlXPathParserContext* ctxt, int nargs) nogil
+ ctypedef xmlXPathFunction (*xmlXPathFuncLookupFunc)(void* ctxt,
+ const_xmlChar* name,
+ const_xmlChar* ns_uri) nogil
+
+ cdef xmlXPathContext* xmlXPathNewContext(tree.xmlDoc* doc) nogil
+ cdef xmlXPathObject* xmlXPathEvalExpression(const_xmlChar* str,
+ xmlXPathContext* ctxt) nogil
+ cdef xmlXPathObject* xmlXPathCompiledEval(xmlXPathCompExpr* comp,
+ xmlXPathContext* ctxt) nogil
+ cdef xmlXPathCompExpr* xmlXPathCompile(const_xmlChar* str) nogil
+ cdef xmlXPathCompExpr* xmlXPathCtxtCompile(xmlXPathContext* ctxt,
+ const_xmlChar* str) nogil
+ cdef void xmlXPathFreeContext(xmlXPathContext* ctxt) nogil
+ cdef void xmlXPathFreeCompExpr(xmlXPathCompExpr* comp) nogil
+ cdef void xmlXPathFreeObject(xmlXPathObject* obj) nogil
+ cdef int xmlXPathRegisterNs(xmlXPathContext* ctxt,
+ const_xmlChar* prefix, const_xmlChar* ns_uri) nogil
+
+ cdef xmlNodeSet* xmlXPathNodeSetCreate(tree.xmlNode* val) nogil
+ cdef void xmlXPathFreeNodeSet(xmlNodeSet* val) nogil
+
+
+cdef extern from "libxml/xpathInternals.h":
+ cdef int xmlXPathRegisterFunc(xmlXPathContext* ctxt,
+ const_xmlChar* name,
+ xmlXPathFunction f) nogil
+ cdef int xmlXPathRegisterFuncNS(xmlXPathContext* ctxt,
+ const_xmlChar* name,
+ const_xmlChar* ns_uri,
+ xmlXPathFunction f) nogil
+ cdef void xmlXPathRegisterFuncLookup(xmlXPathContext *ctxt,
+ xmlXPathFuncLookupFunc f,
+ void *funcCtxt) nogil
+ cdef int xmlXPathRegisterVariable(xmlXPathContext *ctxt,
+ const_xmlChar* name,
+ xmlXPathObject* value) nogil
+ cdef int xmlXPathRegisterVariableNS(xmlXPathContext *ctxt,
+ const_xmlChar* name,
+ const_xmlChar* ns_uri,
+ xmlXPathObject* value) nogil
+ cdef void xmlXPathRegisteredVariablesCleanup(xmlXPathContext *ctxt) nogil
+ cdef void xmlXPathRegisteredNsCleanup(xmlXPathContext *ctxt) nogil
+ cdef xmlXPathObject* valuePop (xmlXPathParserContext *ctxt) nogil
+ cdef int valuePush(xmlXPathParserContext* ctxt, xmlXPathObject *value) nogil
+
+ cdef xmlXPathObject* xmlXPathNewCString(const_char *val) nogil
+ cdef xmlXPathObject* xmlXPathWrapCString(const_char * val) nogil
+ cdef xmlXPathObject* xmlXPathNewString(const_xmlChar *val) nogil
+ cdef xmlXPathObject* xmlXPathWrapString(const_xmlChar * val) nogil
+ cdef xmlXPathObject* xmlXPathNewFloat(double val) nogil
+ cdef xmlXPathObject* xmlXPathNewBoolean(int val) nogil
+ cdef xmlXPathObject* xmlXPathNewNodeSet(tree.xmlNode* val) nogil
+ cdef xmlXPathObject* xmlXPathNewValueTree(tree.xmlNode* val) nogil
+ cdef void xmlXPathNodeSetAdd(xmlNodeSet* cur,
+ tree.xmlNode* val) nogil
+ cdef void xmlXPathNodeSetAddUnique(xmlNodeSet* cur,
+ tree.xmlNode* val) nogil
+ cdef xmlXPathObject* xmlXPathWrapNodeSet(xmlNodeSet* val) nogil
+ cdef void xmlXPathErr(xmlXPathParserContext* ctxt, int error) nogil
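As an illustrative sketch (not in the commit): evaluating a numeric XPath expression against a parsed document with the declarations above.

    from lxml.includes cimport tree, xpath

    cdef double eval_number(tree.xmlDoc* doc, bytes expression) except? -1.0:
        # evaluate e.g. b"count(//item)" and return the numeric result
        cdef xpath.xmlXPathContext* ctxt = xpath.xmlXPathNewContext(doc)
        if ctxt is NULL:
            raise MemoryError()
        cdef xpath.xmlXPathObject* result = xpath.xmlXPathEvalExpression(
            <tree.const_xmlChar*><char*>expression, ctxt)
        try:
            if result is NULL or result.type != xpath.XPATH_NUMBER:
                raise ValueError("expression did not evaluate to a number")
            return result.floatval
        finally:
            if result is not NULL:
                xpath.xmlXPathFreeObject(result)
            xpath.xmlXPathFreeContext(ctxt)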
diff --git a/src/lxml/includes/xslt.pxd b/src/lxml/includes/xslt.pxd
new file mode 100644
index 0000000..101fb7e
--- /dev/null
+++ b/src/lxml/includes/xslt.pxd
@@ -0,0 +1,191 @@
+from lxml.includes.tree cimport xmlDoc, xmlNode, xmlDict, xmlChar, const_xmlChar, xmlOutputBuffer
+from lxml.includes.xmlerror cimport xmlGenericErrorFunc
+from lxml.includes.xpath cimport xmlXPathContext, xmlXPathFunction
+
+from libc.string cimport const_char
+
+cdef extern from "libxslt/xslt.h":
+ cdef int xsltLibxsltVersion
+ cdef int xsltMaxDepth
+
+cdef extern from "libxslt/xsltconfig.h":
+ cdef int LIBXSLT_VERSION
+
+cdef extern from "libxslt/xsltInternals.h":
+ ctypedef enum xsltTransformState:
+ XSLT_STATE_OK # 0
+ XSLT_STATE_ERROR # 1
+ XSLT_STATE_STOPPED # 2
+
+ ctypedef struct xsltDocument:
+ xmlDoc* doc
+
+ ctypedef struct xsltStylesheet:
+ xmlChar* encoding
+ xmlDoc* doc
+ int errors
+
+ ctypedef struct xsltTransformContext:
+ xsltStylesheet* style
+ xmlXPathContext* xpathCtxt
+ xsltDocument* document
+ void* _private
+ xmlDict* dict
+ int profile
+ xmlNode* node
+ xmlDoc* output
+ xmlNode* insert
+ xmlNode* inst
+ xsltTransformState state
+
+ ctypedef struct xsltStackElem
+
+ ctypedef struct xsltTemplate
+
+ cdef xsltStylesheet* xsltParseStylesheetDoc(xmlDoc* doc) nogil
+ cdef void xsltFreeStylesheet(xsltStylesheet* sheet) nogil
+
+cdef extern from "libxslt/imports.h":
+ # actually defined in "etree_defs.h"
+ cdef void LXML_GET_XSLT_ENCODING(const_xmlChar* result_var, xsltStylesheet* style)
+
+cdef extern from "libxslt/extensions.h":
+ ctypedef void (*xsltTransformFunction)(xsltTransformContext* ctxt,
+ xmlNode* context_node,
+ xmlNode* inst,
+ void* precomp_unused) nogil
+
+ cdef int xsltRegisterExtFunction(xsltTransformContext* ctxt,
+ const_xmlChar* name,
+ const_xmlChar* URI,
+ xmlXPathFunction function) nogil
+ cdef int xsltRegisterExtModuleFunction(const_xmlChar* name, const_xmlChar* URI,
+ xmlXPathFunction function) nogil
+ cdef int xsltUnregisterExtModuleFunction(const_xmlChar* name, const_xmlChar* URI)
+ cdef xmlXPathFunction xsltExtModuleFunctionLookup(
+ const_xmlChar* name, const_xmlChar* URI) nogil
+ cdef int xsltRegisterExtPrefix(xsltStylesheet* style,
+ const_xmlChar* prefix, const_xmlChar* URI) nogil
+ cdef int xsltRegisterExtElement(xsltTransformContext* ctxt,
+ const_xmlChar* name, const_xmlChar* URI,
+ xsltTransformFunction function) nogil
+
+cdef extern from "libxslt/documents.h":
+ ctypedef enum xsltLoadType:
+ XSLT_LOAD_START
+ XSLT_LOAD_STYLESHEET
+ XSLT_LOAD_DOCUMENT
+
+ ctypedef xmlDoc* (*xsltDocLoaderFunc)(const_xmlChar* URI, xmlDict* dict,
+ int options,
+ void* ctxt,
+ xsltLoadType type) nogil
+ cdef xsltDocLoaderFunc xsltDocDefaultLoader
+ cdef void xsltSetLoaderFunc(xsltDocLoaderFunc f) nogil
+
+cdef extern from "libxslt/transform.h":
+ cdef xmlDoc* xsltApplyStylesheet(xsltStylesheet* style, xmlDoc* doc,
+ const_char** params) nogil
+ cdef xmlDoc* xsltApplyStylesheetUser(xsltStylesheet* style, xmlDoc* doc,
+ const_char** params, const_char* output,
+ void* profile,
+ xsltTransformContext* context) nogil
+ cdef void xsltProcessOneNode(xsltTransformContext* ctxt,
+ xmlNode* contextNode,
+ xsltStackElem* params) nogil
+ cdef xsltTransformContext* xsltNewTransformContext(xsltStylesheet* style,
+ xmlDoc* doc) nogil
+ cdef void xsltFreeTransformContext(xsltTransformContext* context) nogil
+ cdef void xsltApplyOneTemplate(xsltTransformContext* ctxt,
+ xmlNode* contextNode, xmlNode* list,
+ xsltTemplate* templ,
+ xsltStackElem* params) nogil
+
+
+cdef extern from "libxslt/xsltutils.h":
+ cdef int xsltSaveResultToString(xmlChar** doc_txt_ptr,
+ int* doc_txt_len,
+ xmlDoc* result,
+ xsltStylesheet* style) nogil
+ cdef int xsltSaveResultToFilename(const_char *URL,
+ xmlDoc* result,
+ xsltStylesheet* style,
+ int compression) nogil
+ cdef int xsltSaveResultTo(xmlOutputBuffer* buf,
+ xmlDoc* result,
+ xsltStylesheet* style) nogil
+ cdef xmlGenericErrorFunc xsltGenericError
+ cdef void *xsltGenericErrorContext
+ cdef void xsltSetGenericErrorFunc(
+ void* ctxt, void (*handler)(void* ctxt, char* msg, ...)) nogil
+ cdef void xsltSetTransformErrorFunc(
+ xsltTransformContext*, void* ctxt,
+ void (*handler)(void* ctxt, char* msg, ...) nogil) nogil
+ cdef void xsltTransformError(xsltTransformContext* ctxt,
+ xsltStylesheet* style,
+ xmlNode* node, char* msg, ...)
+ cdef void xsltSetCtxtParseOptions(
+ xsltTransformContext* ctxt, int options)
+
+
+cdef extern from "libxslt/security.h":
+ ctypedef struct xsltSecurityPrefs
+ ctypedef enum xsltSecurityOption:
+ XSLT_SECPREF_READ_FILE = 1
+ XSLT_SECPREF_WRITE_FILE = 2
+ XSLT_SECPREF_CREATE_DIRECTORY = 3
+ XSLT_SECPREF_READ_NETWORK = 4
+ XSLT_SECPREF_WRITE_NETWORK = 5
+
+ ctypedef int (*xsltSecurityCheck)(xsltSecurityPrefs* sec,
+ xsltTransformContext* ctxt,
+ char* value) nogil
+
+ cdef xsltSecurityPrefs* xsltNewSecurityPrefs() nogil
+ cdef void xsltFreeSecurityPrefs(xsltSecurityPrefs* sec) nogil
+ cdef int xsltSecurityForbid(xsltSecurityPrefs* sec,
+ xsltTransformContext* ctxt,
+ char* value) nogil
+ cdef int xsltSecurityAllow(xsltSecurityPrefs* sec,
+ xsltTransformContext* ctxt,
+ char* value) nogil
+ cdef int xsltSetSecurityPrefs(xsltSecurityPrefs* sec,
+ xsltSecurityOption option,
+ xsltSecurityCheck func) nogil
+ cdef xsltSecurityCheck xsltGetSecurityPrefs(
+ xsltSecurityPrefs* sec,
+ xsltSecurityOption option) nogil
+ cdef int xsltSetCtxtSecurityPrefs(xsltSecurityPrefs* sec,
+ xsltTransformContext* ctxt) nogil
+ cdef xmlDoc* xsltGetProfileInformation(xsltTransformContext* ctxt) nogil
+
+cdef extern from "libxslt/variables.h":
+ cdef int xsltQuoteUserParams(xsltTransformContext* ctxt,
+ const_char** params)
+ cdef int xsltQuoteOneUserParam(xsltTransformContext* ctxt,
+ const_xmlChar* name,
+ const_xmlChar* value)
+
+cdef extern from "libxslt/extra.h":
+ const_xmlChar* XSLT_LIBXSLT_NAMESPACE
+ const_xmlChar* XSLT_XALAN_NAMESPACE
+ const_xmlChar* XSLT_SAXON_NAMESPACE
+ const_xmlChar* XSLT_XT_NAMESPACE
+
+ cdef xmlXPathFunction xsltFunctionNodeSet
+ cdef void xsltRegisterAllExtras() nogil
+
+cdef extern from "libexslt/exslt.h":
+ cdef void exsltRegisterAll() nogil
+
+ # libexslt 1.1.25+
+ const_xmlChar* EXSLT_DATE_NAMESPACE
+ const_xmlChar* EXSLT_SETS_NAMESPACE
+ const_xmlChar* EXSLT_MATH_NAMESPACE
+ const_xmlChar* EXSLT_STRINGS_NAMESPACE
+
+ cdef int exsltDateXpathCtxtRegister(xmlXPathContext* ctxt, const_xmlChar* prefix)
+ cdef int exsltSetsXpathCtxtRegister(xmlXPathContext* ctxt, const_xmlChar* prefix)
+ cdef int exsltMathXpathCtxtRegister(xmlXPathContext* ctxt, const_xmlChar* prefix)
+ cdef int exsltStrXpathCtxtRegister(xmlXPathContext* ctxt, const_xmlChar* prefix)
+
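These libxslt/libexslt declarations are what ``lxml.etree.XSLT`` is built on. A minimal, illustrative sketch of the corresponding Python-level usage, including a quoted string parameter (the stylesheet below is made up for the example):

    from lxml import etree

    # A trivial stylesheet that copies a string parameter into its output.
    style = etree.XML(b'''
    <xsl:stylesheet version="1.0"
                    xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
      <xsl:param name="greeting"/>
      <xsl:template match="/">
        <out><xsl:value-of select="$greeting"/></out>
      </xsl:template>
    </xsl:stylesheet>''')
    transform = etree.XSLT(style)
    result = transform(etree.XML(b'<doc/>'),
                       greeting=etree.XSLT.strparam('hello'))
    print(str(result))   # an XML document containing <out>hello</out>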
diff --git a/src/lxml/isoschematron/__init__.py b/src/lxml/isoschematron/__init__.py
new file mode 100644
index 0000000..5967b10
--- /dev/null
+++ b/src/lxml/isoschematron/__init__.py
@@ -0,0 +1,334 @@
+"""The ``lxml.isoschematron`` package implements ISO Schematron support on top
+of the pure-xslt 'skeleton' implementation.
+"""
+
+import sys
+import os.path
+from lxml import etree as _etree # due to validator __init__ signature
+
+
+# some compat stuff, borrowed from lxml.html
+try:
+ unicode
+except NameError:
+ # Python 3
+ unicode = str
+try:
+ basestring
+except NameError:
+ # Python 3
+ basestring = str
+
+
+__all__ = ['extract_xsd', 'extract_rng', 'iso_dsdl_include',
+ 'iso_abstract_expand', 'iso_svrl_for_xslt1',
+ 'svrl_validation_errors', 'schematron_schema_valid',
+ 'stylesheet_params', 'Schematron']
+
+
+# some namespaces
+#FIXME: Maybe lxml should provide a dedicated place for common namespace
+#FIXME: definitions?
+XML_SCHEMA_NS = "http://www.w3.org/2001/XMLSchema"
+RELAXNG_NS = "http://relaxng.org/ns/structure/1.0"
+SCHEMATRON_NS = "http://purl.oclc.org/dsdl/schematron"
+SVRL_NS = "http://purl.oclc.org/dsdl/svrl"
+
+
+# some helpers
+_schematron_root = '{%s}schema' % SCHEMATRON_NS
+_xml_schema_root = '{%s}schema' % XML_SCHEMA_NS
+_resources_dir = os.path.join(os.path.dirname(__file__), 'resources')
+
+
+# the iso-schematron skeleton implementation steps aka xsl transformations
+extract_xsd = _etree.XSLT(_etree.parse(
+ os.path.join(_resources_dir, 'xsl', 'XSD2Schtrn.xsl')))
+extract_rng = _etree.XSLT(_etree.parse(
+ os.path.join(_resources_dir, 'xsl', 'RNG2Schtrn.xsl')))
+iso_dsdl_include = _etree.XSLT(_etree.parse(
+ os.path.join(_resources_dir, 'xsl', 'iso-schematron-xslt1',
+ 'iso_dsdl_include.xsl')))
+iso_abstract_expand = _etree.XSLT(_etree.parse(
+ os.path.join(_resources_dir, 'xsl', 'iso-schematron-xslt1',
+ 'iso_abstract_expand.xsl')))
+iso_svrl_for_xslt1 = _etree.XSLT(_etree.parse(
+ os.path.join(_resources_dir,
+ 'xsl', 'iso-schematron-xslt1', 'iso_svrl_for_xslt1.xsl')))
+
+
+# svrl result accessors
+svrl_validation_errors = _etree.XPath(
+ '//svrl:failed-assert', namespaces={'svrl': SVRL_NS})
+
+
+# RelaxNG validator for schematron schemas
+schematron_schema_valid = _etree.RelaxNG(
+ file=os.path.join(_resources_dir, 'rng', 'iso-schematron.rng'))
+
+
+def stylesheet_params(**kwargs):
+ """Convert keyword args to a dictionary of stylesheet parameters.
+ XSL stylesheet parameters must be XPath expressions, i.e.:
+
+ * string expressions, like "'5'"
+ * simple (number) expressions, like "5"
+ * valid XPath expressions, like "/a/b/text()"
+
+ This function converts native Python keyword arguments to stylesheet
+ parameters following these rules:
+    If an arg is a string, wrap it with XSLT.strparam().
+    If an arg is an XPath object, use its path string.
+    If an arg is None, raise a TypeError.
+    Otherwise, convert the arg to its string representation.
+ """
+ result = {}
+ for key, val in kwargs.items():
+ if isinstance(val, basestring):
+ val = _etree.XSLT.strparam(val)
+ elif val is None:
+ raise TypeError('None not allowed as a stylesheet parameter')
+ elif not isinstance(val, _etree.XPath):
+ val = unicode(val)
+ result[key] = val
+ return result
+
+
+# helper function for use in Schematron __init__
+def _stylesheet_param_dict(paramsDict, kwargsDict):
+ """Return a copy of paramsDict, updated with kwargsDict entries, wrapped as
+ stylesheet arguments.
+ kwargsDict entries with a value of None are ignored.
+ """
+ # beware of changing mutable default arg
+ paramsDict = dict(paramsDict)
+ for k, v in kwargsDict.items():
+ if v is not None: # None values do not override
+ paramsDict[k] = v
+ paramsDict = stylesheet_params(**paramsDict)
+ return paramsDict
+
+
+class Schematron(_etree._Validator):
+ """An ISO Schematron validator.
+
+ Pass a root Element or an ElementTree to turn it into a validator.
+ Alternatively, pass a filename as keyword argument 'file' to parse from
+ the file system.
+
+ Schematron is a less well known, but very powerful schema language.
+ The main idea is to use the capabilities of XPath to put restrictions on
+ the structure and the content of XML documents.
+
+ The standard behaviour is to fail on ``failed-assert`` findings only
+ (``ASSERTS_ONLY``). To change this, you can either pass a report filter
+ function to the ``error_finder`` parameter (e.g. ``ASSERTS_AND_REPORTS``
+ or a custom ``XPath`` object), or subclass isoschematron.Schematron for
+ complete control of the validation process.
+
+    Built on the pure-XSLT 'reference' skeleton implementation of the
+    Schematron language, the validator is created as an XSLT 1.0 stylesheet
+    using these steps:
+
+ 0) (Extract from XML Schema or RelaxNG schema)
+ 1) Process inclusions
+ 2) Process abstract patterns
+ 3) Compile the schematron schema to XSLT
+
+ The ``include`` and ``expand`` keyword arguments can be used to switch off
+ steps 1) and 2).
+ To set parameters for steps 1), 2) and 3) hand parameter dictionaries to the
+ keyword arguments ``include_params``, ``expand_params`` or
+ ``compile_params``.
+ For convenience, the compile-step parameter ``phase`` is also exposed as a
+ keyword argument ``phase``. This takes precedence if the parameter is also
+ given in the parameter dictionary.
+
+ If ``store_schematron`` is set to True, the (included-and-expanded)
+ schematron document tree is stored and available through the ``schematron``
+ property.
+ If ``store_xslt`` is set to True, the validation XSLT document tree will be
+ stored and can be retrieved through the ``validator_xslt`` property.
+ With ``store_report`` set to True (default: False), the resulting validation
+ report document gets stored and can be accessed as the ``validation_report``
+ property.
+
+ Here is a usage example::
+
+ >>> from lxml import etree
+ >>> from lxml.isoschematron import Schematron
+
+ >>> schematron = Schematron(etree.XML('''
+ ... <schema xmlns="http://purl.oclc.org/dsdl/schematron" >
+ ... <pattern id="id_only_attribute">
+ ... <title>id is the only permitted attribute name</title>
+ ... <rule context="*">
+ ... <report test="@*[not(name()='id')]">Attribute
+ ... <name path="@*[not(name()='id')]"/> is forbidden<name/>
+ ... </report>
+ ... </rule>
+ ... </pattern>
+ ... </schema>'''),
+ ... error_finder=Schematron.ASSERTS_AND_REPORTS)
+
+ >>> xml = etree.XML('''
+ ... <AAA name="aaa">
+ ... <BBB id="bbb"/>
+ ... <CCC color="ccc"/>
+ ... </AAA>
+ ... ''')
+
+ >>> schematron.validate(xml)
+ False
+
+ >>> xml = etree.XML('''
+ ... <AAA id="aaa">
+ ... <BBB id="bbb"/>
+ ... <CCC/>
+ ... </AAA>
+ ... ''')
+
+ >>> schematron.validate(xml)
+ True
+ """
+
+ # libxml2 error categorization for validation errors
+ _domain = _etree.ErrorDomains.SCHEMATRONV
+ _level = _etree.ErrorLevels.ERROR
+ _error_type = _etree.ErrorTypes.SCHEMATRONV_ASSERT
+
+ # convenience definitions for common behaviours
+ ASSERTS_ONLY = svrl_validation_errors # Default
+ ASSERTS_AND_REPORTS = _etree.XPath(
+ '//svrl:failed-assert | //svrl:successful-report',
+ namespaces={'svrl': SVRL_NS})
+
+ def _extract(self, element):
+ """Extract embedded schematron schema from non-schematron host schema.
+ This method will only be called by __init__ if the given schema document
+ is not a schematron schema by itself.
+ Must return a schematron schema document tree or None.
+ """
+ schematron = None
+ if element.tag == _xml_schema_root:
+ schematron = self._extract_xsd(element)
+ elif element.nsmap[element.prefix] == RELAXNG_NS:
+ # RelaxNG does not have a single unique root element
+ schematron = self._extract_rng(element)
+ return schematron
+
+ # customization points
+ # etree.XSLT objects that provide the extract, include, expand, compile
+ # steps
+ _extract_xsd = extract_xsd
+ _extract_rng = extract_rng
+ _include = iso_dsdl_include
+ _expand = iso_abstract_expand
+ _compile = iso_svrl_for_xslt1
+
+    # etree.XPath object that determines input document validity when applied to
+ # the svrl result report; must return a list of result elements (empty if
+ # valid)
+ _validation_errors = ASSERTS_ONLY
+
+ def __init__(self, etree=None, file=None, include=True, expand=True,
+ include_params={}, expand_params={}, compile_params={},
+ store_schematron=False, store_xslt=False, store_report=False,
+ phase=None, error_finder=ASSERTS_ONLY):
+ super(Schematron, self).__init__()
+
+ self._store_report = store_report
+ self._schematron = None
+ self._validator_xslt = None
+ self._validation_report = None
+ if error_finder is not self.ASSERTS_ONLY:
+ self._validation_errors = error_finder
+
+ # parse schema document, may be a schematron schema or an XML Schema or
+ # a RelaxNG schema with embedded schematron rules
+ root = None
+ try:
+ if etree is not None:
+ if _etree.iselement(etree):
+ root = etree
+ else:
+ root = etree.getroot()
+ elif file is not None:
+ root = _etree.parse(file).getroot()
+ except Exception:
+ raise _etree.SchematronParseError(
+ "No tree or file given: %s" % sys.exc_info()[1])
+ if root is None:
+ raise ValueError("Empty tree")
+ if root.tag == _schematron_root:
+ schematron = root
+ else:
+ schematron = self._extract(root)
+ if schematron is None:
+ raise _etree.SchematronParseError(
+ "Document is not a schematron schema or schematron-extractable")
+ # perform the iso-schematron skeleton implementation steps to get a
+ # validating xslt
+ if include:
+ schematron = self._include(schematron, **include_params)
+ if expand:
+ schematron = self._expand(schematron, **expand_params)
+ if not schematron_schema_valid(schematron):
+ raise _etree.SchematronParseError(
+ "invalid schematron schema: %s" %
+ schematron_schema_valid.error_log)
+ if store_schematron:
+ self._schematron = schematron
+ # add new compile keyword args here if exposing them
+ compile_kwargs = {'phase': phase}
+ compile_params = _stylesheet_param_dict(compile_params, compile_kwargs)
+ validator_xslt = self._compile(schematron, **compile_params)
+ if store_xslt:
+ self._validator_xslt = validator_xslt
+ self._validator = _etree.XSLT(validator_xslt)
+
+ def __call__(self, etree):
+ """Validate doc using Schematron.
+
+ Returns true if document is valid, false if not.
+ """
+ self._clear_error_log()
+ result = self._validator(etree)
+ if self._store_report:
+ self._validation_report = result
+ errors = self._validation_errors(result)
+ if errors:
+ if _etree.iselement(etree):
+ fname = etree.getroottree().docinfo.URL or '<file>'
+ else:
+ fname = etree.docinfo.URL or '<file>'
+ for error in errors:
+ # Does svrl report the line number, anywhere? Don't think so.
+ self._append_log_message(
+ domain=self._domain, type=self._error_type,
+ level=self._level, line=0,
+ message=_etree.tostring(error, encoding='unicode'),
+ filename=fname)
+ return False
+ return True
+
+ @property
+ def schematron(self):
+ """ISO-schematron schema document (None if object has been initialized
+ with store_schematron=False).
+ """
+ return self._schematron
+
+ @property
+ def validator_xslt(self):
+ """ISO-schematron skeleton implementation XSLT validator document (None
+ if object has been initialized with store_xslt=False).
+ """
+ return self._validator_xslt
+
+ @property
+ def validation_report(self):
+ """ISO-schematron validation result report (None if result-storing has
+ been turned off).
+ """
+ return self._validation_report
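The ``Schematron`` class above drives the module-level XSLT steps; the same pipeline can also be run by hand. A sketch of that manual flow (both file names are placeholders, not files from this package):

    from lxml import etree
    from lxml.isoschematron import (iso_dsdl_include, iso_abstract_expand,
                                    iso_svrl_for_xslt1, svrl_validation_errors)

    # Mirror what Schematron.__init__ does: resolve inclusions, expand
    # abstract patterns, compile to a validating XSLT, then look for
    # failed assertions in the SVRL report.
    sch = etree.parse('rules.sch')                     # placeholder schema file
    sch = iso_abstract_expand(iso_dsdl_include(sch))
    validate = etree.XSLT(iso_svrl_for_xslt1(sch))

    report = validate(etree.parse('document.xml'))     # placeholder input file
    print(len(svrl_validation_errors(report)) == 0)    # True when the document is valid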
diff --git a/src/lxml/isoschematron/resources/rng/iso-schematron.rng b/src/lxml/isoschematron/resources/rng/iso-schematron.rng
new file mode 100644
index 0000000..a4f504a
--- /dev/null
+++ b/src/lxml/isoschematron/resources/rng/iso-schematron.rng
@@ -0,0 +1,709 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Copyright © ISO/IEC 2015 -->
+<!--
+ The following permission notice and disclaimer shall be included in all
+ copies of this XML schema ("the Schema"), and derivations of the Schema:
+
+ Permission is hereby granted, free of charge in perpetuity, to any
+ person obtaining a copy of the Schema, to use, copy, modify, merge and
+ distribute free of charge, copies of the Schema for the purposes of
+ developing, implementing, installing and using software based on the
+ Schema, and to permit persons to whom the Schema is furnished to do so,
+ subject to the following conditions:
+
+ THE SCHEMA IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SCHEMA OR THE USE OR
+ OTHER DEALINGS IN THE SCHEMA.
+
+ In addition, any modified copy of the Schema shall include the following
+ notice:
+
+ "THIS SCHEMA HAS BEEN MODIFIED FROM THE SCHEMA DEFINED IN ISO/IEC 19757-3,
+ AND SHOULD NOT BE INTERPRETED AS COMPLYING WITH THAT STANDARD".
+-->
+<grammar ns="http://purl.oclc.org/dsdl/schematron" xmlns="http://relaxng.org/ns/structure/1.0" datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes">
+ <start>
+ <ref name="schema"/>
+ </start>
+ <!-- Element declarations -->
+ <define name="schema">
+ <element name="schema">
+ <optional>
+ <attribute name="id">
+ <data type="ID"/>
+ </attribute>
+ </optional>
+ <ref name="rich"/>
+ <optional>
+ <attribute name="schemaVersion">
+ <ref name="non-empty-string"/>
+ </attribute>
+ </optional>
+ <optional>
+ <attribute name="defaultPhase">
+ <data type="IDREF"/>
+ </attribute>
+ </optional>
+ <optional>
+ <attribute name="queryBinding">
+ <ref name="non-empty-string"/>
+ </attribute>
+ </optional>
+ <interleave>
+ <ref name="foreign"/>
+ <zeroOrMore>
+ <ref name="inclusion"/>
+ </zeroOrMore>
+ <group>
+ <optional>
+ <ref name="title"/>
+ </optional>
+ <zeroOrMore>
+ <ref name="ns"/>
+ </zeroOrMore>
+ <zeroOrMore>
+ <ref name="p"/>
+ </zeroOrMore>
+ <zeroOrMore>
+ <ref name="let"/>
+ </zeroOrMore>
+ <zeroOrMore>
+ <ref name="phase"/>
+ </zeroOrMore>
+ <oneOrMore>
+ <ref name="pattern"/>
+ </oneOrMore>
+ <zeroOrMore>
+ <ref name="p"/>
+ </zeroOrMore>
+ <optional>
+ <ref name="diagnostics"/>
+ </optional>
+ <optional>
+ <!-- edited (lxml): required in standard, optional here (since it can be empty anyway) -->
+ <ref name="properties"/>
+ </optional>
+ </group>
+ </interleave>
+ </element>
+ </define>
+ <define name="active">
+ <element name="active">
+ <attribute name="pattern">
+ <data type="IDREF"/>
+ </attribute>
+ <interleave>
+ <ref name="foreign"/>
+ <zeroOrMore>
+ <choice>
+ <text/>
+ <ref name="dir"/>
+ <ref name="emph"/>
+ <ref name="span"/>
+ </choice>
+ </zeroOrMore>
+ </interleave>
+ </element>
+ </define>
+ <define name="assert">
+ <element name="assert">
+ <attribute name="test">
+ <ref name="exprValue"/>
+ </attribute>
+ <optional>
+ <attribute name="flag">
+ <ref name="flagValue"/>
+ </attribute>
+ </optional>
+ <optional>
+ <attribute name="id">
+ <data type="ID"/>
+ </attribute>
+ </optional>
+ <optional>
+ <attribute name="diagnostics">
+ <data type="IDREFS"/>
+ </attribute>
+ </optional>
+ <optional>
+ <attribute name="properties">
+ <data type="IDREFS"/>
+ </attribute>
+ </optional>
+ <ref name="rich"/>
+ <ref name="linkable"/>
+ <interleave>
+ <ref name="foreign"/>
+ <zeroOrMore>
+ <choice>
+ <text/>
+ <ref name="name"/>
+ <ref name="value-of"/>
+ <ref name="emph"/>
+ <ref name="dir"/>
+ <ref name="span"/>
+ </choice>
+ </zeroOrMore>
+ </interleave>
+ </element>
+ </define>
+ <define name="diagnostic">
+ <element name="diagnostic">
+ <attribute name="id">
+ <data type="ID"/>
+ </attribute>
+ <ref name="rich"/>
+ <interleave>
+ <ref name="foreign"/>
+ <zeroOrMore>
+ <choice>
+ <text/>
+ <ref name="value-of"/>
+ <ref name="emph"/>
+ <ref name="dir"/>
+ <ref name="span"/>
+ </choice>
+ </zeroOrMore>
+ </interleave>
+ </element>
+ </define>
+ <define name="diagnostics">
+ <element name="diagnostics">
+ <interleave>
+ <ref name="foreign"/>
+ <zeroOrMore>
+ <ref name="inclusion"/>
+ </zeroOrMore>
+ <zeroOrMore>
+ <ref name="diagnostic"/>
+ </zeroOrMore>
+ </interleave>
+ </element>
+ </define>
+ <define name="dir">
+ <element name="dir">
+ <optional>
+ <attribute name="value">
+ <choice>
+ <value>ltr</value>
+ <value>rtl</value>
+ </choice>
+ </attribute>
+ </optional>
+ <interleave>
+ <ref name="foreign"/>
+ <text/>
+ </interleave>
+ </element>
+ </define>
+ <define name="emph">
+ <element name="emph">
+ <text/>
+ </element>
+ </define>
+ <define name="extends">
+ <element name="extends">
+ <choice>
+ <attribute name="rule">
+ <data type="IDREF"/>
+ </attribute>
+ <attribute name="href">
+ <ref name="uriValue"/>
+ </attribute>
+ </choice>
+ <ref name="foreign-empty"/>
+ </element>
+ </define>
+ <define name="let">
+ <element name="let">
+ <attribute name="name">
+ <ref name="nameValue"/>
+ </attribute>
+ <choice>
+ <attribute name="value">
+ <data type="string" datatypeLibrary=""/>
+ </attribute>
+ <oneOrMore>
+ <ref name="foreign-element"/>
+ </oneOrMore>
+ </choice>
+ </element>
+ </define>
+ <define name="name">
+ <element name="name">
+ <optional>
+ <attribute name="path">
+ <ref name="pathValue"/>
+ </attribute>
+ </optional>
+ <ref name="foreign-empty"/>
+ </element>
+ </define>
+ <define name="ns">
+ <element name="ns">
+ <attribute name="uri">
+ <ref name="uriValue"/>
+ </attribute>
+ <attribute name="prefix">
+ <ref name="nameValue"/>
+ </attribute>
+ <ref name="foreign-empty"/>
+ </element>
+ </define>
+ <define name="p">
+ <element name="p">
+ <optional>
+ <attribute name="id">
+ <data type="ID"/>
+ </attribute>
+ </optional>
+ <optional>
+ <attribute name="class">
+ <ref name="classValue"/>
+ </attribute>
+ </optional>
+ <optional>
+ <attribute name="icon">
+ <ref name="uriValue"/>
+ </attribute>
+ </optional>
+ <interleave>
+ <ref name="foreign"/>
+ <zeroOrMore>
+ <choice>
+ <text/>
+ <ref name="dir"/>
+ <ref name="emph"/>
+ <ref name="span"/>
+ </choice>
+ </zeroOrMore>
+ </interleave>
+ </element>
+ </define>
+ <define name="param">
+ <element name="param">
+ <attribute name="name">
+ <ref name="nameValue"/>
+ </attribute>
+ <attribute name="value">
+ <ref name="non-empty-string"/>
+ </attribute>
+ </element>
+ </define>
+ <define name="pattern">
+ <element name="pattern">
+ <optional>
+ <attribute name="documents">
+ <ref name="pathValue"/>
+ </attribute>
+ </optional>
+ <ref name="rich"/>
+ <interleave>
+ <ref name="foreign"/>
+ <zeroOrMore>
+ <ref name="inclusion"/>
+ </zeroOrMore>
+ <choice>
+ <group>
+ <attribute name="abstract">
+ <value>true</value>
+ </attribute>
+ <attribute name="id">
+ <data type="ID"/>
+ </attribute>
+ <optional>
+ <ref name="title"/>
+ </optional>
+ <group>
+ <zeroOrMore>
+ <ref name="p"/>
+ </zeroOrMore>
+ <zeroOrMore>
+ <ref name="let"/>
+ </zeroOrMore>
+ <zeroOrMore>
+ <ref name="rule"/>
+ </zeroOrMore>
+ </group>
+ </group>
+ <group>
+ <optional>
+ <attribute name="abstract">
+ <value>false</value>
+ </attribute>
+ </optional>
+ <optional>
+ <attribute name="id">
+ <data type="ID"/>
+ </attribute>
+ </optional>
+ <optional>
+ <ref name="title"/>
+ </optional>
+ <group>
+ <zeroOrMore>
+ <ref name="p"/>
+ </zeroOrMore>
+ <zeroOrMore>
+ <ref name="let"/>
+ </zeroOrMore>
+ <zeroOrMore>
+ <ref name="rule"/>
+ </zeroOrMore>
+ </group>
+ </group>
+ <group>
+ <optional>
+ <attribute name="abstract">
+ <value>false</value>
+ </attribute>
+ </optional>
+ <attribute name="is-a">
+ <data type="IDREF"/>
+ </attribute>
+ <optional>
+ <attribute name="id">
+ <data type="ID"/>
+ </attribute>
+ </optional>
+ <optional>
+ <ref name="title"/>
+ </optional>
+ <group>
+ <zeroOrMore>
+ <ref name="p"/>
+ </zeroOrMore>
+ <zeroOrMore>
+ <ref name="param"/>
+ </zeroOrMore>
+ </group>
+ </group>
+ </choice>
+ </interleave>
+ </element>
+ </define>
+ <define name="phase">
+ <element name="phase">
+ <attribute name="id">
+ <data type="ID"/>
+ </attribute>
+ <ref name="rich"/>
+ <interleave>
+ <ref name="foreign"/>
+ <zeroOrMore>
+ <ref name="inclusion"/>
+ </zeroOrMore>
+ <group>
+ <zeroOrMore>
+ <ref name="p"/>
+ </zeroOrMore>
+ <zeroOrMore>
+ <ref name="let"/>
+ </zeroOrMore>
+ <zeroOrMore>
+ <ref name="active"/>
+ </zeroOrMore>
+ </group>
+ </interleave>
+ </element>
+ </define>
+ <define name="properties">
+ <element name="properties">
+ <zeroOrMore>
+ <ref name="property"/>
+ </zeroOrMore>
+ </element>
+ </define>
+ <define name="property">
+ <element name="property">
+ <attribute name="id">
+ <data type="ID"/>
+ </attribute>
+ <optional>
+ <attribute name="role">
+ <ref name="roleValue"/>
+ </attribute>
+ </optional>
+ <optional>
+ <attribute name="scheme"/>
+ </optional>
+ <interleave>
+ <ref name="foreign"/>
+ <zeroOrMore>
+ <choice>
+ <text/>
+ <ref name="name"/>
+ <ref name="value-of"/>
+ <ref name="emph"/>
+ <ref name="dir"/>
+ <ref name="span"/>
+ </choice>
+ </zeroOrMore>
+ </interleave>
+ </element>
+ </define>
+ <define name="report">
+ <element name="report">
+ <attribute name="test">
+ <ref name="exprValue"/>
+ </attribute>
+ <optional>
+ <attribute name="flag">
+ <ref name="flagValue"/>
+ </attribute>
+ </optional>
+ <optional>
+ <attribute name="id">
+ <data type="ID"/>
+ </attribute>
+ </optional>
+ <optional>
+ <attribute name="diagnostics">
+ <data type="IDREFS"/>
+ </attribute>
+ </optional>
+ <optional>
+ <attribute name="properties">
+ <data type="IDREFS"/>
+ </attribute>
+ </optional>
+ <ref name="rich"/>
+ <ref name="linkable"/>
+ <interleave>
+ <ref name="foreign"/>
+ <zeroOrMore>
+ <choice>
+ <text/>
+ <ref name="name"/>
+ <ref name="value-of"/>
+ <ref name="emph"/>
+ <ref name="dir"/>
+ <ref name="span"/>
+ </choice>
+ </zeroOrMore>
+ </interleave>
+ </element>
+ </define>
+ <define name="rule">
+ <element name="rule">
+ <optional>
+ <attribute name="flag">
+ <ref name="flagValue"/>
+ </attribute>
+ </optional>
+ <ref name="rich"/>
+ <ref name="linkable"/>
+ <interleave>
+ <ref name="foreign"/>
+ <zeroOrMore>
+ <ref name="inclusion"/>
+ </zeroOrMore>
+ <choice>
+ <group>
+ <attribute name="abstract">
+ <value>true</value>
+ </attribute>
+ <attribute name="id">
+ <data type="ID"/>
+ </attribute>
+ <zeroOrMore>
+ <ref name="let"/>
+ </zeroOrMore>
+ <oneOrMore>
+ <choice>
+ <ref name="assert"/>
+ <ref name="report"/>
+ <ref name="extends"/>
+ <ref name="p"/>
+ </choice>
+ </oneOrMore>
+ </group>
+ <group>
+ <attribute name="context">
+ <ref name="pathValue"/>
+ </attribute>
+ <optional>
+ <attribute name="id">
+ <data type="ID"/>
+ </attribute>
+ </optional>
+ <optional>
+ <attribute name="abstract">
+ <value>false</value>
+ </attribute>
+ </optional>
+ <zeroOrMore>
+ <ref name="let"/>
+ </zeroOrMore>
+ <oneOrMore>
+ <choice>
+ <ref name="assert"/>
+ <ref name="report"/>
+ <ref name="extends"/>
+ <ref name="p"/>
+ </choice>
+ </oneOrMore>
+ </group>
+ </choice>
+ </interleave>
+ </element>
+ </define>
+ <define name="span">
+ <element name="span">
+ <attribute name="class">
+ <ref name="classValue"/>
+ </attribute>
+ <interleave>
+ <ref name="foreign"/>
+ <text/>
+ </interleave>
+ </element>
+ </define>
+ <define name="title">
+ <element name="title">
+ <zeroOrMore>
+ <choice>
+ <text/>
+ <ref name="dir"/>
+ </choice>
+ </zeroOrMore>
+ </element>
+ </define>
+ <define name="value-of">
+ <element name="value-of">
+ <attribute name="select">
+ <ref name="pathValue"/>
+ </attribute>
+ <ref name="foreign-empty"/>
+ </element>
+ </define>
+ <!-- common declarations -->
+ <define name="inclusion">
+ <element name="include">
+ <attribute name="href">
+ <ref name="uriValue"/>
+ </attribute>
+ <ref name="foreign-empty"/>
+ </element>
+ </define>
+ <define name="rich">
+ <optional>
+ <attribute name="icon">
+ <ref name="uriValue"/>
+ </attribute>
+ </optional>
+ <optional>
+ <attribute name="see">
+ <ref name="uriValue"/>
+ </attribute>
+ </optional>
+ <optional>
+ <attribute name="fpi">
+ <ref name="fpiValue"/>
+ </attribute>
+ </optional>
+ <optional>
+ <attribute name="xml:lang">
+ <ref name="langValue"/>
+ </attribute>
+ </optional>
+ <optional>
+ <attribute name="xml:space">
+ <choice>
+ <value>preserve</value>
+ <value>default</value>
+ </choice>
+ </attribute>
+ </optional>
+ </define>
+ <define name="linkable">
+ <optional>
+ <attribute name="role">
+ <ref name="roleValue"/>
+ </attribute>
+ </optional>
+ <optional>
+ <attribute name="subject">
+ <ref name="pathValue"/>
+ </attribute>
+ </optional>
+ </define>
+ <define name="foreign">
+ <ref name="foreign-attributes"/>
+ <zeroOrMore>
+ <ref name="foreign-element"/>
+ </zeroOrMore>
+ </define>
+ <define name="foreign-empty">
+ <ref name="foreign-attributes"/>
+ </define>
+ <define name="foreign-attributes">
+ <zeroOrMore>
+ <attribute>
+ <anyName>
+ <except>
+ <nsName ns=""/>
+ <nsName ns="http://www.w3.org/XML/1998/namespace"/>
+ </except>
+ </anyName>
+ </attribute>
+ </zeroOrMore>
+ </define>
+ <define name="foreign-element">
+ <element>
+ <anyName>
+ <except>
+ <nsName/>
+ </except>
+ </anyName>
+ <zeroOrMore>
+ <choice>
+ <attribute>
+ <anyName/>
+ </attribute>
+ <ref name="foreign-element"/>
+ <ref name="schema"/>
+ <text/>
+ </choice>
+ </zeroOrMore>
+ </element>
+ </define>
+ <!-- Data types -->
+ <define name="uriValue">
+ <data type="anyURI"/>
+ </define>
+ <define name="pathValue">
+ <data type="string" datatypeLibrary=""/>
+ </define>
+ <define name="exprValue">
+ <data type="string" datatypeLibrary=""/>
+ </define>
+ <define name="fpiValue">
+ <data type="string" datatypeLibrary=""/>
+ </define>
+ <define name="langValue">
+ <data type="language"/>
+ </define>
+ <define name="roleValue">
+ <data type="string" datatypeLibrary=""/>
+ </define>
+ <define name="flagValue">
+ <data type="string" datatypeLibrary=""/>
+ </define>
+ <define name="nameValue">
+ <data type="string" datatypeLibrary=""/>
+ </define>
+ <!-- In the default query language binding, xsd:NCNAME -->
+ <define name="classValue">
+ <data type="string" datatypeLibrary=""/>
+ </define>
+ <define name="non-empty-string">
+ <data type="token">
+ <param name="minLength">1</param>
+ </data>
+ </define>
+</grammar>
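This grammar is what the module-level ``schematron_schema_valid`` RelaxNG validator in ``lxml.isoschematron`` is compiled from. A small, illustrative check of a hand-written schema against it (the schema below is made up for the example):

    from lxml import etree
    from lxml.isoschematron import schematron_schema_valid

    sch = etree.XML(b'''
    <schema xmlns="http://purl.oclc.org/dsdl/schematron">
      <pattern id="root-id">
        <rule context="/*">
          <assert test="@id">the root element must carry an id attribute</assert>
        </rule>
      </pattern>
    </schema>''')
    # True if the schema conforms to the iso-schematron.rng grammar above.
    print(schematron_schema_valid(sch))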
diff --git a/src/lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl b/src/lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl
new file mode 100644
index 0000000..21a5d2a
--- /dev/null
+++ b/src/lxml/isoschematron/resources/xsl/RNG2Schtrn.xsl
@@ -0,0 +1,75 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Stylesheet for extracting Schematron information from a RELAX-NG schema.
+ Based on the stylesheet for extracting Schematron information from W3C XML Schema.
+ Created by Eddie Robertsson 2002/06/01
+ 2009/12/10 hj: changed Schematron namespace to ISO URI (Holger Joukl)
+-->
+<xsl:transform version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+xmlns:sch="http://purl.oclc.org/dsdl/schematron" xmlns:rng="http://relaxng.org/ns/structure/1.0">
+ <!-- Set the output to be XML with an XML declaration and use indentation -->
+ <xsl:output method="xml" omit-xml-declaration="no" indent="yes" standalone="yes"/>
+ <!-- -->
+ <!-- match schema and call recursive template to extract included schemas -->
+ <!-- -->
+ <xsl:template match="/rng:grammar | /rng:element">
+ <!-- call the schema definition template ... -->
+ <xsl:call-template name="gatherSchema">
+ <!-- ... with current node as the $schemas parameter ... -->
+ <xsl:with-param name="schemas" select="."/>
+ <!-- ... and any includes in the $include parameter -->
+ <xsl:with-param name="includes" select="document(/rng:grammar/rng:include/@href
+| //rng:externalRef/@href)"/>
+ </xsl:call-template>
+ </xsl:template>
+ <!-- -->
+ <!-- gather all included schemas into a single parameter variable -->
+ <!-- -->
+ <xsl:template name="gatherSchema">
+ <xsl:param name="schemas"/>
+ <xsl:param name="includes"/>
+ <xsl:choose>
+ <xsl:when test="count($schemas) &lt; count($schemas | $includes)">
+ <!-- when $includes includes something new, recurse ... -->
+ <xsl:call-template name="gatherSchema">
+ <!-- ... with current $includes added to the $schemas parameter ... -->
+ <xsl:with-param name="schemas" select="$schemas | $includes"/>
+ <!-- ... and any *new* includes in the $include parameter -->
+ <xsl:with-param name="includes" select="document($includes/rng:grammar/rng:include/@href
+| $includes//rng:externalRef/@href)"/>
+ </xsl:call-template>
+ </xsl:when>
+ <xsl:otherwise>
+ <!-- we have the complete set of included schemas, so now let's output the embedded schematron -->
+ <xsl:call-template name="output">
+ <xsl:with-param name="schemas" select="$schemas"/>
+ </xsl:call-template>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:template>
+ <!-- -->
+ <!-- output the schematron information -->
+ <!-- -->
+ <xsl:template name="output">
+ <xsl:param name="schemas"/>
+ <!-- -->
+ <sch:schema>
+ <!-- get header-type elements - eg title and especially ns -->
+ <!-- title (just one) -->
+ <xsl:copy-of select="$schemas//sch:title[1]"/>
+ <!-- get remaining schematron schema children -->
+ <!-- get non-blank namespace elements, dropping duplicates -->
+ <xsl:for-each select="$schemas//sch:ns">
+ <xsl:if test="generate-id(.) = generate-id($schemas//sch:ns[@prefix = current()/@prefix][1])">
+ <xsl:copy-of select="."/>
+ </xsl:if>
+ </xsl:for-each>
+ <xsl:copy-of select="$schemas//sch:phase"/>
+ <xsl:copy-of select="$schemas//sch:pattern"/>
+ <sch:diagnostics>
+ <xsl:copy-of select="$schemas//sch:diagnostics/*"/>
+ </sch:diagnostics>
+ </sch:schema>
+ </xsl:template>
+ <!-- -->
+</xsl:transform>
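This stylesheet is exposed as ``extract_rng`` in ``lxml.isoschematron`` and is what the ``Schematron`` class applies when it is given a RelaxNG schema with embedded Schematron rules. A usage sketch (the input file name is a placeholder):

    from lxml import etree
    from lxml.isoschematron import extract_rng

    # Pull the embedded sch:* elements out of a RelaxNG schema and print the
    # resulting stand-alone ISO Schematron schema.
    rng = etree.parse('schema-with-rules.rng')         # placeholder file name
    schematron = extract_rng(rng)
    print(etree.tostring(schematron, pretty_print=True).decode())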
diff --git a/src/lxml/isoschematron/resources/xsl/XSD2Schtrn.xsl b/src/lxml/isoschematron/resources/xsl/XSD2Schtrn.xsl
new file mode 100644
index 0000000..de0c9ea
--- /dev/null
+++ b/src/lxml/isoschematron/resources/xsl/XSD2Schtrn.xsl
@@ -0,0 +1,77 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ based on an original transform by Eddie Robertsson
+ 2001/04/21 fn: added support for included schemas
+  2001/06/27 er: changed XML Schema prefix from xsd: to xs: and changed to the Rec namespace
+ 2009/12/10 hj: changed Schematron namespace to ISO URI (Holger Joukl)
+-->
+<xsl:transform version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+xmlns:sch="http://purl.oclc.org/dsdl/schematron" xmlns:xs="http://www.w3.org/2001/XMLSchema">
+ <!-- Set the output to be XML with an XML declaration and use indentation -->
+ <xsl:output method="xml" omit-xml-declaration="no" indent="yes" standalone="yes"/>
+ <!-- -->
+ <!-- match schema and call recursive template to extract included schemas -->
+ <!-- -->
+ <xsl:template match="xs:schema">
+ <!-- call the schema definition template ... -->
+ <xsl:call-template name="gatherSchema">
+            <!-- ... with the current root as the $schemas parameter ... -->
+ <xsl:with-param name="schemas" select="/"/>
+ <!-- ... and any includes in the $include parameter -->
+ <xsl:with-param name="includes"
+ select="document(/xs:schema/xs:*[self::xs:include or self::xs:import or self::xs:redefine]/@schemaLocation)"/>
+ </xsl:call-template>
+ </xsl:template>
+ <!-- -->
+ <!-- gather all included schemas into a single parameter variable -->
+ <!-- -->
+ <xsl:template name="gatherSchema">
+ <xsl:param name="schemas"/>
+ <xsl:param name="includes"/>
+ <xsl:choose>
+ <xsl:when test="count($schemas) &lt; count($schemas | $includes)">
+ <!-- when $includes includes something new, recurse ... -->
+ <xsl:call-template name="gatherSchema">
+ <!-- ... with current $includes added to the $schemas parameter ... -->
+ <xsl:with-param name="schemas" select="$schemas | $includes"/>
+ <!-- ... and any *new* includes in the $include parameter -->
+ <xsl:with-param name="includes"
+ select="document($includes/xs:schema/xs:*[self::xs:include or self::xs:import or self::xs:redefine]/@schemaLocation)"/>
+ </xsl:call-template>
+ </xsl:when>
+ <xsl:otherwise>
+ <!-- we have the complete set of included schemas,
+ so now let's output the embedded schematron -->
+ <xsl:call-template name="output">
+ <xsl:with-param name="schemas" select="$schemas"/>
+ </xsl:call-template>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:template>
+ <!-- -->
+ <!-- output the schematron information -->
+ <!-- -->
+ <xsl:template name="output">
+ <xsl:param name="schemas"/>
+ <!-- -->
+ <sch:schema>
+ <!-- get header-type elements - eg title and especially ns -->
+ <!-- title (just one) -->
+ <xsl:copy-of select="$schemas//xs:appinfo/sch:title[1]"/>
+ <!-- get remaining schematron schema children -->
+ <!-- get non-blank namespace elements, dropping duplicates -->
+ <xsl:for-each select="$schemas//xs:appinfo/sch:ns">
+ <xsl:if test="generate-id(.) =
+ generate-id($schemas//xs:appinfo/sch:ns[@prefix = current()/@prefix][1])">
+ <xsl:copy-of select="."/>
+ </xsl:if>
+ </xsl:for-each>
+ <xsl:copy-of select="$schemas//xs:appinfo/sch:phase"/>
+ <xsl:copy-of select="$schemas//xs:appinfo/sch:pattern"/>
+ <sch:diagnostics>
+ <xsl:copy-of select="$schemas//xs:appinfo/sch:diagnostics/*"/>
+ </sch:diagnostics>
+ </sch:schema>
+ </xsl:template>
+ <!-- -->
+</xsl:transform>
diff --git a/src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_abstract_expand.xsl b/src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_abstract_expand.xsl
new file mode 100644
index 0000000..5018395
--- /dev/null
+++ b/src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_abstract_expand.xsl
@@ -0,0 +1,313 @@
+<?xml version="1.0" encoding="UTF-8"?><?xar XSLT?>
+
+<!--
+ OVERVIEW - iso_abstract_expand.xsl
+
+  This is a preprocessor for ISO Schematron that implements abstract patterns.
+ It also
+ * extracts a particular schema using an ID, where there are multiple
+ schemas, such as when they are embedded in the same NVDL script
+ * allows parameter substitution inside @context, @test, @select, @path
+ * experimentally, allows parameter recognition and substitution inside
+     text (NOTE: to be removed, for compatibility with other implementations,
+ please do not use this)
+
+ This should be used after iso-dsdl-include.xsl and before the skeleton or
+   meta-stylesheet (e.g. iso-svrl.xsl). It only requires XSLT 1.
+
+ Each kind of inclusion can be turned off (or on) on the command line.
+
+-->
+
+<!--
+Open Source Initiative OSI - The MIT License:Licensing
+[OSI Approved License]
+
+This source code was previously available under the zlib/libpng license.
+Attribution is polite.
+
+The MIT License
+
+Copyright (c) 2004-2010 Rick Jelliffe and Academia Sinica Computing Centre, Taiwan
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+-->
+
+<!--
+VERSION INFORMATION
+ 2013-09-19 RJ
+ * Allow macro expansion in @path attributes, eg. for sch:name/@path
+
+ 2010-07-10 RJ
+ * Move to MIT license
+
+ 2008-09-18 RJ
+ * move out param test from iso:schema template to work with XSLT 1. (Noah Fontes)
+
+ 2008-07-29 RJ
+ * Create. Pull out as distinct XSL in its own namespace from old iso_pre_pro.xsl
+ * Put everything in private namespace
+ * Rewrite replace_substring named template so that copyright is clear
+
+ 2008-07-24 RJ
+  * correct abstract patterns to use the correct names: param/@name and
+ param/@value
+
+ 2007-01-12 RJ
+ * Use ISO namespace
+ * Use pattern/@id not pattern/@name
+  * Add Oliver Becker's suggestions from the old Schematron-love-in list for <copy>
+ * Add XT -ism?
+ 2003 RJ
+ * Original written for old namespace
+ * http://www.topologi.com/resources/iso-pre-pro.xsl
+-->
+<xslt:stylesheet version="1.0" xmlns:xslt="http://www.w3.org/1999/XSL/Transform"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:iso="http://purl.oclc.org/dsdl/schematron"
+ xmlns:nvdl="http://purl.oclc.org/dsdl/nvdl"
+
+ xmlns:iae="http://www.schematron.com/namespace/iae"
+
+ >
+
+ <xslt:param name="schema-id"></xslt:param>
+
+
+ <!-- Driver for the mode -->
+ <xsl:template match="/">
+ <xsl:apply-templates select="." mode="iae:go" />
+ </xsl:template>
+
+
+ <!-- ================================================================================== -->
+ <!-- Normal processing rules -->
+ <!-- ================================================================================== -->
+ <!-- Output only the selected schema -->
+ <xslt:template match="iso:schema" >
+ <xsl:if test="string-length($schema-id) =0 or @id= $schema-id ">
+ <xslt:copy>
+ <xslt:copy-of select="@*" />
+ <xslt:apply-templates mode="iae:go" />
+ </xslt:copy>
+ </xsl:if>
+ </xslt:template>
+
+
+  <!-- Strip out any foreign elements above the Schematron schema.
+ -->
+ <xslt:template match="*[not(ancestor-or-self::iso:*)]" mode="iae:go" >
+ <xslt:apply-templates mode="iae:go" />
+ </xslt:template>
+
+
+ <!-- ================================================================================== -->
+ <!-- Handle Schematron abstract pattern preprocessing -->
+ <!-- abstract-to-real calls
+ do-pattern calls
+ macro-expand calls
+ multi-macro-expand
+ replace-substring -->
+ <!-- ================================================================================== -->
+
+ <!--
+ Abstract patterns allow you to say, for example
+
+ <pattern name="htmlTable" is-a="table">
+ <param name="row" value="html:tr"/>
+ <param name="cell" value="html:td" />
+ <param name="table" value="html:table" />
+ </pattern>
+
+    For a good introduction, see Uche Ogbuji's article for IBM DeveloperWorks
+ "Discover the flexibility of Schematron abstract patterns"
+ http://www-128.ibm.com/developerworks/xml/library/x-stron.html
+ However, note that ISO Schematron uses @name and @value attributes on
+ the iso:param element, and @id not @name on the pattern element.
+
+ -->
+
+ <!-- Suppress declarations of abstract patterns -->
+ <xslt:template match="iso:pattern[@abstract='true']" mode="iae:go" >
+ <xslt:comment>Suppressed abstract pattern <xslt:value-of select="@id"/> was here</xslt:comment>
+ </xslt:template>
+
+
+ <!-- Suppress uses of abstract patterns -->
+ <xslt:template match="iso:pattern[@is-a]" mode="iae:go" >
+
+ <xslt:comment>Start pattern based on abstract <xslt:value-of select="@is-a"/></xslt:comment>
+
+ <xslt:call-template name="iae:abstract-to-real" >
+ <xslt:with-param name="caller" select="@id" />
+ <xslt:with-param name="is-a" select="@is-a" />
+ </xslt:call-template>
+
+ </xslt:template>
+
+
+
+ <!-- output everything else unchanged -->
+ <xslt:template match="*" priority="-1" mode="iae:go" >
+ <xslt:copy>
+ <xslt:copy-of select="@*" />
+ <xslt:apply-templates mode="iae:go"/>
+ </xslt:copy>
+ </xslt:template>
+
+ <!-- Templates for macro expansion of abstract patterns -->
+ <!-- Sets up the initial conditions for the recursive call -->
+ <xslt:template name="iae:macro-expand">
+ <xslt:param name="caller"/>
+ <xslt:param name="text" />
+ <xslt:call-template name="iae:multi-macro-expand">
+ <xslt:with-param name="caller" select="$caller"/>
+ <xslt:with-param name="text" select="$text"/>
+ <xslt:with-param name="paramNumber" select="1"/>
+ </xslt:call-template>
+
+ </xslt:template>
+
+ <!-- Template to replace the current parameter and then
+ recurse to replace subsequent parameters. -->
+
+ <xslt:template name="iae:multi-macro-expand">
+ <xslt:param name="caller"/>
+ <xslt:param name="text" />
+ <xslt:param name="paramNumber" />
+
+
+ <xslt:choose>
+ <xslt:when test="//iso:pattern[@id=$caller]/iso:param[ $paramNumber]">
+
+ <xslt:call-template name="iae:multi-macro-expand">
+ <xslt:with-param name="caller" select="$caller"/>
+ <xslt:with-param name="paramNumber" select="$paramNumber + 1"/>
+ <xslt:with-param name="text" >
+ <xslt:call-template name="iae:replace-substring">
+ <xslt:with-param name="original" select="$text"/>
+ <xslt:with-param name="substring"
+ select="concat('$', //iso:pattern[@id=$caller]/iso:param[ $paramNumber ]/@name)"/>
+ <xslt:with-param name="replacement"
+ select="//iso:pattern[@id=$caller]/iso:param[ $paramNumber ]/@value"/>
+ </xslt:call-template>
+ </xslt:with-param>
+ </xslt:call-template>
+ </xslt:when>
+ <xslt:otherwise><xslt:value-of select="$text" /></xslt:otherwise>
+
+ </xslt:choose>
+ </xslt:template>
+
+
+ <!-- generate the real pattern from an abstract pattern + parameters-->
+ <xslt:template name="iae:abstract-to-real" >
+ <xslt:param name="caller"/>
+ <xslt:param name="is-a" />
+ <xslt:for-each select="//iso:pattern[@id= $is-a]">
+ <xslt:copy>
+
+ <xslt:choose>
+ <xslt:when test=" string-length( $caller ) = 0">
+ <xslt:attribute name="id"><xslt:value-of select="concat( generate-id(.) , $is-a)" /></xslt:attribute>
+ </xslt:when>
+ <xslt:otherwise>
+ <xslt:attribute name="id"><xslt:value-of select="$caller" /></xslt:attribute>
+ </xslt:otherwise>
+ </xslt:choose>
+
+ <xslt:apply-templates select="*|text()" mode="iae:do-pattern" >
+ <xslt:with-param name="caller"><xslt:value-of select="$caller"/></xslt:with-param>
+ </xslt:apply-templates>
+
+ </xslt:copy>
+ </xslt:for-each>
+ </xslt:template>
+
+
+ <!-- Generate a non-abstract pattern -->
+ <xslt:template mode="iae:do-pattern" match="*">
+ <xslt:param name="caller"/>
+ <xslt:copy>
+ <xslt:for-each select="@*[name()='test' or name()='context' or name()='select' or name()='path' ]">
+ <xslt:attribute name="{name()}">
+ <xslt:call-template name="iae:macro-expand">
+ <xslt:with-param name="text"><xslt:value-of select="."/></xslt:with-param>
+ <xslt:with-param name="caller"><xslt:value-of select="$caller"/></xslt:with-param>
+ </xslt:call-template>
+ </xslt:attribute>
+ </xslt:for-each>
+ <xslt:copy-of select="@*[name()!='test'][name()!='context'][name()!='select'][name()!='path']" />
+ <xsl:for-each select="node()">
+ <xsl:choose>
+ <!-- Experiment: replace macros in text as well, to allow parameterized assertions
+            and so on, without having to use spurious <iso:value-of> calls and multiple
+ delimiting.
+ NOTE: THIS FUNCTIONALITY WILL BE REMOVED IN THE FUTURE -->
+ <xsl:when test="self::text()">
+ <xslt:call-template name="iae:macro-expand">
+ <xslt:with-param name="text"><xslt:value-of select="."/></xslt:with-param>
+ <xslt:with-param name="caller"><xslt:value-of select="$caller"/></xslt:with-param>
+ </xslt:call-template>
+ </xsl:when>
+ <xsl:otherwise>
+ <xslt:apply-templates select="." mode="iae:do-pattern">
+ <xslt:with-param name="caller"><xslt:value-of select="$caller"/></xslt:with-param>
+ </xslt:apply-templates>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:for-each>
+ </xslt:copy>
+ </xslt:template>
+
+ <!-- UTILITIES -->
+ <!-- Simple version of replace-substring function -->
+ <xslt:template name="iae:replace-substring">
+ <xslt:param name="original" />
+ <xslt:param name="substring" />
+ <xslt:param name="replacement" select="''"/>
+
+ <xsl:choose>
+ <xsl:when test="not($original)" />
+ <xsl:when test="not(string($substring))">
+ <xsl:value-of select="$original" />
+ </xsl:when>
+ <xsl:when test="contains($original, $substring)">
+ <xsl:variable name="before" select="substring-before($original, $substring)" />
+ <xsl:variable name="after" select="substring-after($original, $substring)" />
+
+ <xsl:value-of select="$before" />
+ <xsl:value-of select="$replacement" />
+ <!-- recursion -->
+ <xsl:call-template name="iae:replace-substring">
+ <xsl:with-param name="original" select="$after" />
+ <xsl:with-param name="substring" select="$substring" />
+ <xsl:with-param name="replacement" select="$replacement" />
+ </xsl:call-template>
+ </xsl:when>
+ <xsl:otherwise>
+ <!-- no substitution -->
+ <xsl:value-of select="$original" />
+ </xsl:otherwise>
+ </xsl:choose>
+</xslt:template>
+
+
+
+</xslt:stylesheet> \ No newline at end of file
diff --git a/src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_dsdl_include.xsl b/src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_dsdl_include.xsl
new file mode 100644
index 0000000..44e5573
--- /dev/null
+++ b/src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_dsdl_include.xsl
@@ -0,0 +1,1160 @@
+<?xml version="1.0" encoding="UTF-8"?><?xar XSLT?>
+
+<!--
+ OVERVIEW : iso_dsdl_include.xsl
+
+ This is an inclusion preprocessor for the non-smart text inclusions
+ of ISO DSDL. It handles
+ <relax:extRef> for ISO RELAX NG
+ <sch:include> for ISO Schematron and Schematron 1.n
+ <sch:extends> for 2009 draft ISO Schematron
+        <xi:include> simple W3C XIncludes for ISO NVDL and DSRL
+ <crdl:ref> for draft ISO CRDL
+ <dtll:include> for draft ISO DTLL
+ <* @xlink:href> for simple W3C XLink 1.1 embedded links
+
+
+ This should be the first in any chain of processing. It only requires
+ XSLT 1. Each kind of inclusion can be turned off (or on) on the command line.
+
+ Ids in fragment identifiers or xpointers will be sought in the following
+ order:
+ * @xml:id
+ * id() for typed schemas (e.g. from DTD) [NOTE: XInclude does not support this]
+ * untyped @id
+
+ The proposed behaviour for the update to ISO Schematron has been implemented. If an
+ include points to an element with the same name as the parent, then that element's
+ contents will be included. This supports the merge style of inclusion.
+
+ When an inclusion is made, it is preceded by a PI with target DSDL_INCLUDE_START
+    and the href, and closed by a PI with target DSDL_INCLUDE_END and the href. This is
+ to allow better location of problems, though only to the file level.
+
+ Limitations:
+ * No rebasing: relative paths will be interpreted based on the initial document's
+ path, not the including document. (Severe limitation!)
+ * No checking for circular references
+ * Not full xpointers: only ID matching
+ * <relax:include> not implemented
+ * XInclude handling of xml:base and xml:lang not implemented
+-->
+<!--
+ VERSION INFORMATION
+ 2009-02-25
+ * Update DSDL namespace to use schematron.com
+ * Tested with SAXON9, Xalan 2.7.1, IE7,
+ * IE does not like multiple variables in same template with same name: rename.
+ 2008-09-18
+ * Remove new behaviour for include, because it conflicts with existing usage [KH]
+ * Add extends[@href] element with that merge functionality
+ * Generate PIs to notate source of inclusions for potential better diagnostics
+
+ 2008-09-16
+ * Fix for XSLT1
+
+ 2008-08-28
+ * New behaviour for schematron includes: if the pointed to element is the same as the current,
+ include the children.
+
+ 2008-08-20
+ * Fix bug: in XSLT1 cannot do $document/id('x') but need to use for-each
+
+ 2008-08-04
+ * Add support for inclusions in old namespace
+
+ 2008-08-03
+ * Fix wrong param name include-relaxng & include-crdl (KH, PH)
+ * Allow inclusion of XSLT and XHTML (KH)
+ * Fix inclusion of fragments (KH)
+
+ 2008-07-25
+ * Add selectable input parameter
+
+ 2008-07-24
+ * RJ New
+-->
+<!--
+ LEGAL INFORMATION
+
+ Copyright (c) 2008 Rick Jelliffe
+
+ This software is provided 'as-is', without any express or implied warranty.
+ In no event will the authors be held liable for any damages arising from
+ the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it freely,
+ subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not claim
+ that you wrote the original software. If you use this software in a product,
+ an acknowledgment in the product documentation would be appreciated but is
+ not required.
+
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+
+ 3. This notice may not be removed or altered from any source distribution.
+-->
+<xslt:stylesheet version="1.0"
+ xmlns:xslt="http://www.w3.org/1999/XSL/Transform"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:iso="http://purl.oclc.org/dsdl/schematron"
+ xmlns:nvdl="http://purl.oclc.org/dsdl/nvdl"
+ xmlns:xhtml="http://www.w3.org/1999/xhtml"
+ xmlns:schold="http://www.ascc.net/xml/schematron"
+ xmlns:crdl="http://purl.oclc.org/dsdl/crepdl/ns/structure/1.0"
+ xmlns:xi="http://www.w3.org/2001/XInclude"
+ xmlns:dtll="http://www.jenitennison.com/datatypes"
+ xmlns:dsdl="http://www.schematron.com/namespace/dsdl"
+ xmlns:relax="http://relaxng.org/ns/structure/1.0"
+ xmlns:xlink="http://www.w3.org/1999/xlink">
+ <!-- Note: The URL for the dsdl namespace is not official -->
+
+
+ <xsl:param name="include-schematron">true</xsl:param>
+ <xsl:param name="include-crdl">true</xsl:param>
+ <xsl:param name="include-xinclude">true</xsl:param>
+ <xsl:param name="include-dtll">true</xsl:param>
+ <xsl:param name="include-relaxng">true</xsl:param>
+ <xsl:param name="include-xlink">true</xsl:param>
+
+ <xsl:template match="/">
+ <xsl:apply-templates select="." mode="dsdl:go" />
+ </xsl:template>
+
+ <!-- output everything else unchanged -->
+ <xslt:template match="node()" priority="-1" mode="dsdl:go">
+ <xslt:copy>
+ <xslt:copy-of select="@*" />
+ <xslt:apply-templates mode="dsdl:go" />
+ </xslt:copy>
+ </xslt:template>
+
+
+
+ <!-- =========================================================== -->
+ <!-- ISO/IEC 19757 - DSDL Document Schema Definition Languages -->
+ <!-- Part 2 - Regular grammar-based validation - RELAX NG -->
+ <!-- This only implements relax:extRef not relax:include which -->
+ <!-- is complex. -->
+ <!-- =========================================================== -->
+ <xslt:template match="relax:extRef" mode="dsdl:go">
+
+
+ <!-- Insert subschema -->
+
+ <xsl:variable name="document-uri"
+ select="substring-before(concat(@href,'#'), '#')" />
+ <xsl:variable name="fragment-id"
+ select="substring-after(@href, '#')" />
+
+ <xsl:processing-instruction name="DSDL_INCLUDE_START">
+ <xsl:value-of select="@href" />
+ </xsl:processing-instruction>
+ <xsl:choose>
+ <xsl:when test="not( $include-relaxng = 'true' )">
+ <xslt:copy>
+ <xslt:copy-of select="@*" />
+ <xslt:apply-templates mode="dsdl:go" />
+ </xslt:copy>
+ </xsl:when>
+ <xsl:otherwise>
+
+ <xsl:choose>
+
+ <xsl:when
+ test="string-length( $document-uri ) = 0 and string-length( $fragment-id ) = 0">
+ <xsl:message>
+ Error: Impossible URL in RELAX NG extRef
+ include
+ </xsl:message>
+ </xsl:when>
+
+                <!-- this case is when there is an embedded schema in the same document elsewhere -->
+ <xslt:when
+ test="string-length( $document-uri ) = 0">
+ <xslt:apply-templates mode="dsdl:go"
+ select="//*[@xml:id= $fragment-id ] | id( $fragment-id) | //*[@id= $fragment-id ]" />
+ </xslt:when>
+
+ <xsl:when
+ test="string-length( $fragment-id ) &gt; 0">
+ <xsl:variable name="theDocument_1"
+ select="document( $document-uri,/ )" />
+
+ <xsl:if test="not($theDocument_1)">
+ <xsl:message terminate="no">
+ <xsl:text>Unable to open referenced included file: </xsl:text>
+ <xsl:value-of select="@href" />
+ </xsl:message>
+ </xsl:if>
+ <!-- use a for-each so that the id() function works correctly on the external document -->
+ <xsl:for-each select="$theDocument_1">
+ <xsl:variable name="theFragment_1"
+ select="$theDocument_1//*[@xml:id= $fragment-id ]
+ | id( $fragment-id)
+ | $theDocument_1//*[@id= $fragment-id ]" />
+ <xsl:if test="not($theFragment_1)">
+ <xsl:message terminate="no">
+ <xsl:text>Unable to locate id attribute: </xsl:text>
+ <xsl:value-of select="@href" />
+ </xsl:message>
+ </xsl:if>
+ <xsl:apply-templates
+ select=" $theFragment_1[1]" mode="dsdl:go" />
+ </xsl:for-each>
+ </xsl:when>
+
+ <xsl:otherwise>
+ <xsl:variable name="theDocument_2"
+ select="document( $document-uri,/ )" />
+ <xsl:variable name="theFragment_2"
+ select="$theDocument_2/*" />
+ <xsl:if test="not($theDocument_2)">
+ <xsl:message terminate="no">
+ <xsl:text>Unable to open referenced included file: </xsl:text>
+ <xsl:value-of select="@href" />
+ </xsl:message>
+ </xsl:if>
+
+ <xsl:if test="not($theFragment_2)">
+ <xsl:message terminate="no">
+ <xsl:text>Unable to locate id attribute: </xsl:text>
+ <xsl:value-of select="@href" />
+ </xsl:message>
+ </xsl:if>
+ <xsl:apply-templates select="$theFragment_2 "
+ mode="dsdl:go" />
+ </xsl:otherwise>
+ </xsl:choose>
+
+ </xsl:otherwise>
+ </xsl:choose>
+
+ <xsl:processing-instruction name="DSDL_INCLUDE_END">
+ <xsl:value-of select="@href" />
+ </xsl:processing-instruction>
+ </xslt:template>
+
+
+
+ <!-- =========================================================== -->
+ <!-- ISO/IEC 19757 - DSDL Document Schema Definition Languages -->
+ <!-- Part 3 - Rule-based validation - Schematron -->
+ <!-- =========================================================== -->
+
+
+ <!-- Extend the URI syntax to allow # references -->
+ <!-- Add experimental support for simple containers like /xxx:xxx/iso:pattern to allow better includes -->
+ <xsl:template match="iso:include" mode="dsdl:go">
+
+ <xsl:variable name="document-uri"
+ select="substring-before(concat(@href,'#'), '#')" />
+ <xsl:variable name="fragment-id"
+ select="substring-after(@href, '#')" />
+
+
+ <xsl:processing-instruction name="DSDL_INCLUDE_START">
+ <xsl:value-of select="@href" />
+ </xsl:processing-instruction>
+
+ <xsl:choose>
+ <xsl:when test="not( $include-schematron = 'true' )">
+ <xslt:copy>
+ <xslt:copy-of select="@*" />
+ <xslt:apply-templates mode="dsdl:go" />
+ </xslt:copy>
+ </xsl:when>
+ <xsl:otherwise>
+
+ <xsl:choose>
+
+ <xsl:when
+ test="string-length( $document-uri ) = 0 and string-length( $fragment-id ) = 0">
+ <xsl:message>
+ Error: Impossible URL in Schematron include
+ </xsl:message>
+ </xsl:when>
+
+ <!-- this case is when there is an embedded schema in the same document elsewhere -->
+ <xslt:when
+ test="string-length( $document-uri ) = 0">
+ <xslt:apply-templates mode="dsdl:go"
+ select="//iso:*[@xml:id= $fragment-id ]
+ |id( $fragment-id)
+ | //iso:*[@id= $fragment-id ]" />
+ </xslt:when>
+
+ <!-- case where there is a fragment in another document (should be an iso: element) -->
+ <!-- There are three cases for includes with fragment:
+ 0) No href file or no matching id - error!
+ 1) REMOVED
+
+ 2) The linked-to element is sch:schema; however, the parent of the include
+ is not a schema. In this case, it is an error. (Actually, it should
+ be an error for other kinds of containment problems, but we won't
+ check for them in this version.)
+
+ 3) Otherwise, include the pointed-to element
+ -->
+
+ <xsl:when
+ test="string-length( $fragment-id ) &gt; 0">
+ <xsl:variable name="theDocument_1"
+ select="document( $document-uri,/ )" />
+ <xsl:variable name="originalParent" select=".." />
+
+ <!-- case 0 -->
+ <xsl:if test="not($theDocument_1)">
+ <xsl:message terminate="no">
+ <xsl:text>Unable to open referenced included file: </xsl:text>
+ <xsl:value-of select="@href" />
+ </xsl:message>
+ </xsl:if>
+ <!-- use for-each to rebase id() to external document -->
+ <xsl:for-each select="$theDocument_1">
+ <xsl:variable name="theFragment_1"
+ select=" $theDocument_1//iso:*[@xml:id= $fragment-id ] |
+ id($fragment-id) |
+ $theDocument_1//iso:*[@id= $fragment-id ]" />
+
+
+ <xsl:choose>
+ <!-- case 0 -->
+ <xsl:when test="not($theFragment_1)">
+ <xsl:message terminate="no">
+ <xsl:text>Unable to locate id attribute: </xsl:text>
+ <xsl:value-of select="@href" />
+ </xsl:message>
+ </xsl:when>
+
+
+ <!-- case 1 REMOVED -->
+
+ <!-- case 2 -->
+ <xsl:when
+ test=" $theFragment_1/self::iso:schema ">
+ <xsl:message>
+ Schema error: Use include to
+ include fragments, not a whole
+ schema
+ </xsl:message>
+ </xsl:when>
+
+ <!-- case 3 -->
+ <xsl:otherwise>
+ <xsl:apply-templates
+ select=" $theFragment_1[1]" mode="dsdl:go" />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:for-each>
+ </xsl:when>
+
+ <!-- Case where there is no ID so we include the whole document -->
+ <!-- Experimental addition: include fragments of children -->
+ <xsl:otherwise>
+ <xsl:variable name="theDocument_2"
+ select="document( $document-uri,/ )" />
+ <xsl:variable name="theFragment_2"
+ select="$theDocument_2/iso:*" />
+ <xsl:variable name="theContainedFragments"
+ select="$theDocument_2/*/iso:* | $theDocument_2/*/xsl:* | $theDocument_2/*/xhtml:*" />
+ <xsl:if test="not($theDocument_2)">
+ <xsl:message terminate="no">
+ <xsl:text>Unable to open referenced included file: </xsl:text>
+ <xsl:value-of select="@href" />
+ </xsl:message>
+ </xsl:if>
+
+ <!-- There are three cases for includes:
+ 0) No text specified- error!
+
+ 1) REMOVED
+
+ 2) The linked-to element is sch:schema; however, the parent of the include
+ is not a schema. In this case, it is an error. (Actually, it should
+ be an error for other kinds of containment problems, but we won't
+ check for them in this version.)
+
+ 3) Otherwise, include the pointed-to element
+ -->
+ <xsl:choose>
+ <!-- case 0 -->
+ <xsl:when
+ test="not($theFragment_2) and not ($theContainedFragments)">
+ <xsl:message terminate="no">
+ <xsl:text>Unable to locate id attribute: </xsl:text>
+ <xsl:value-of select="@href" />
+ </xsl:message>
+ </xsl:when>
+
+ <!-- case 1 removed -->
+
+ <!-- case 2 -->
+ <xsl:when
+ test=" $theFragment_2/self::iso:schema or $theContainedFragments/self::iso:schema">
+ <xsl:message>
+ Schema error: Use include to include
+ fragments, not a whole schema
+ </xsl:message>
+ </xsl:when>
+
+ <!-- If this were XSLT 2, we could use
+ if ($theFragment) then $theFragment else $theContainedFragments
+ here (thanks to KN)
+ -->
+ <!-- case 3 -->
+ <xsl:otherwise>
+ <xsl:apply-templates
+ select="$theFragment_2 " mode="dsdl:go" />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:otherwise>
+ </xsl:choose>
+
+ <xsl:processing-instruction name="DSDL_INCLUDE_END">
+ <xsl:value-of select="@href" />
+ </xsl:processing-instruction>
+ </xsl:template>
+
+
+ <!-- WARNING sch:extends[@href] is experimental and non standard -->
+ <!-- Basically, it adds the children of the selected element, not the element itself. -->
+ <xsl:template match="iso:extends[@href]" mode="dsdl:go">
+
+ <xsl:variable name="document-uri"
+ select="substring-before(concat(@href,'#'), '#')" />
+ <xsl:variable name="fragment-id"
+ select="substring-after(@href, '#')" />
+
+
+ <xsl:processing-instruction name="DSDL_INCLUDE_START">
+ <xsl:value-of select="@href" />
+ </xsl:processing-instruction>
+
+ <xsl:choose>
+ <xsl:when test="not( $include-schematron = 'true' )">
+ <xslt:copy>
+ <xslt:copy-of select="@*" />
+ <xslt:apply-templates mode="dsdl:go" />
+ </xslt:copy>
+ </xsl:when>
+ <xsl:otherwise>
+
+ <xsl:choose>
+
+ <xsl:when
+ test="string-length( $document-uri ) = 0 and string-length( $fragment-id ) = 0">
+ <xsl:message>
+ Error: Impossible URL in Schematron include
+ </xsl:message>
+ </xsl:when>
+
+ <!-- this case is when there is an embedded schema in the same document elsewhere -->
+ <xslt:when
+ test="string-length( $document-uri ) = 0">
+ <xslt:apply-templates mode="dsdl:go"
+ select="//iso:*[@xml:id= $fragment-id ]/*
+ |id( $fragment-id)/*
+ | //iso:*[@id= $fragment-id ]/*" />
+ </xslt:when>
+
+ <!-- case where there is a fragment in another document (should be an iso: element) -->
+ <!-- There are three cases for includes with fragment:
+ 0) No href file or no matching id - error!
+ 1) REMOVED
+
+ 2) REMOVED
+
+ 3) Otherwise, include the pointed-to element
+ -->
+
+ <xsl:when
+ test="string-length( $fragment-id ) &gt; 0">
+ <xsl:variable name="theDocument_1"
+ select="document( $document-uri,/ )" />
+ <xsl:variable name="originalParent" select=".." />
+
+ <!-- case 0 -->
+ <xsl:if test="not($theDocument_1)">
+ <xsl:message terminate="no">
+ <xsl:text>Unable to open referenced included file: </xsl:text>
+ <xsl:value-of select="@href" />
+ </xsl:message>
+ </xsl:if>
+ <!-- use for-each to rebase id() to external document -->
+ <xsl:for-each select="$theDocument_1">
+ <xsl:variable name="theFragment_1"
+ select=" $theDocument_1//iso:*[@xml:id= $fragment-id ] |
+ id($fragment-id) |
+ $theDocument_1//iso:*[@id= $fragment-id ]" />
+
+
+ <xsl:choose>
+ <!-- case 0 -->
+ <xsl:when test="not($theFragment_1)">
+ <xsl:message terminate="no">
+ <xsl:text>Unable to locate id attribute: </xsl:text>
+ <xsl:value-of select="@href" />
+ </xsl:message>
+ </xsl:when>
+
+
+ <!-- case 1 REMOVED -->
+
+ <!-- case 2 REMOVED -->
+
+
+ <!-- case 3 -->
+ <xsl:otherwise>
+
+ <xsl:apply-templates
+ select=" $theFragment_1[1]/*" mode="dsdl:go" />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:for-each>
+ </xsl:when>
+
+ <!-- Case where there is no ID so we include the whole document -->
+ <!-- Experimental addition: include fragments of children -->
+ <xsl:otherwise>
+ <xsl:variable name="theDocument_2"
+ select="document( $document-uri,/ )" />
+ <xsl:variable name="theFragment_2"
+ select="$theDocument_2/iso:*" />
+ <xsl:variable name="theContainedFragments"
+ select="$theDocument_2/*/iso:* | $theDocument_2/*/xsl:* | $theDocument_2/*/xhtml:*" />
+ <xsl:if test="not($theDocument_2)">
+ <xsl:message terminate="no">
+ <xsl:text>Unable to open referenced included file: </xsl:text>
+ <xsl:value-of select="@href" />
+ </xsl:message>
+ </xsl:if>
+
+ <!-- There are three cases for includes:
+ 0) No text specified- error!
+
+ 1) REMOVED
+
+ 2) REMOVED
+
+ 3) Otherwise, include the pointed-to element
+ -->
+ <xsl:choose>
+ <!-- case 0 -->
+ <xsl:when
+ test="not($theFragment_2) and not ($theContainedFragments)">
+ <xsl:message terminate="no">
+ <xsl:text>Unable to locate id attribute: </xsl:text>
+ <xsl:value-of select="@href" />
+ </xsl:message>
+ </xsl:when>
+
+ <!-- case 1 removed -->
+
+ <!-- case 2 removed -->
+
+ <!-- If this were XSLT 2, we could use
+ if ($theFragment) then $theFragment else $theContainedFragments
+ here (thanks to KN)
+ -->
+ <!-- case 3 -->
+ <xsl:otherwise>
+ <xsl:apply-templates
+ select="$theFragment_2/* " mode="dsdl:go" />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:otherwise>
+ </xsl:choose>
+
+ <xsl:processing-instruction name="DSDL_INCLUDE_END">
+ <xsl:value-of select="@href" />
+ </xsl:processing-instruction>
+ </xsl:template>
+
+
+
+ <!-- =========================================================== -->
+ <!-- Handle Schematron 1.6 inclusions: clone of ISO code above -->
+ <!-- =========================================================== -->
+
+
+ <!-- Extend the URI syntax to allow # references -->
+ <!-- Add experimental support for simple containers like /xxx:xxx/schold:pattern to allow better includes -->
+ <xsl:template match="schold:include" mode="dsdl:go">
+ <xsl:variable name="document-uri"
+ select="substring-before(concat(@href,'#'), '#')" />
+ <xsl:variable name="fragment-id"
+ select="substring-after(@href, '#')" />
+
+ <xsl:processing-instruction name="DSDL_INCLUDE_START">
+ <xsl:value-of select="@href" />
+ </xsl:processing-instruction>
+
+ <xsl:choose>
+ <xsl:when test="not( $include-schematron = 'true' )">
+ <xslt:copy>
+ <xslt:copy-of select="@*" />
+ <xslt:apply-templates mode="dsdl:go" />
+ </xslt:copy>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:choose>
+
+ <xsl:when
+ test="string-length( $document-uri ) = 0 and string-length( $fragment-id ) = 0">
+ <xsl:message>
+ Error: Impossible URL in Schematron include
+ </xsl:message>
+ </xsl:when>
+
+ <!-- this case is when there is an embedded schema in the same document elsewhere -->
+ <xslt:when
+ test="string-length( $document-uri ) = 0">
+ <xslt:apply-templates mode="dsdl:go"
+ select="//schold:*[@xml:id= $fragment-id ]
+ |id( $fragment-id)
+ | //schold:*[@id= $fragment-id ]" />
+ </xslt:when>
+
+ <!-- case where there is a fragment in another document (should be an iso: element) -->
+ <xsl:when
+ test="string-length( $fragment-id ) &gt; 0">
+ <xsl:variable name="theDocument_1"
+ select="document( $document-uri,/ )" />
+ <xsl:if test="not($theDocument_1)">
+ <xsl:message terminate="no">
+ <xsl:text>Unable to open referenced included file: </xsl:text>
+ <xsl:value-of select="@href" />
+ </xsl:message>
+ </xsl:if>
+ <!-- use for-each to rebase id() to $theDocument -->
+ <xsl:for-each select="$theDocument_1">
+ <xsl:variable name="theFragment_1"
+ select=" $theDocument_1//schold:*[@xml:id= $fragment-id ] |
+ id($fragment-id) |
+ $theDocument_1//schold:*[@id= $fragment-id ]" />
+ <xsl:if
+ test=" $theFragment_1/self::schold:schema ">
+ <xsl:message>
+ Schema error: Use include to include
+ fragments, not a whole schema
+ </xsl:message>
+ </xsl:if>
+ <xsl:if test="not($theFragment_1)">
+ <xsl:message terminate="no">
+ <xsl:text>Unable to locate id attribute: </xsl:text>
+ <xsl:value-of select="@href" />
+ </xsl:message>
+ </xsl:if>
+ <xsl:apply-templates
+ select=" $theFragment_1[1]" mode="dsdl:go" />
+ </xsl:for-each>
+ </xsl:when>
+
+ <!-- Case where there is no ID so we include the whole document -->
+ <!-- Experimental addition: include fragments of children -->
+ <xsl:otherwise>
+ <xsl:variable name="theDocument_2"
+ select="document( $document-uri,/ )" />
+ <xsl:variable name="theFragment_2"
+ select="$theDocument_2/iso:*" />
+ <xsl:variable name="theContainedFragments"
+ select="$theDocument_2/*/schold:* | $theDocument_2/*/xsl:* | $theDocument_2/*/xhtml:*" />
+ <xsl:if test="not($theDocument_2)">
+ <xsl:message terminate="no">
+ <xsl:text>Unable to open referenced included file: </xsl:text>
+ <xsl:value-of select="@href" />
+ </xsl:message>
+ </xsl:if>
+
+ <xsl:if
+ test=" $theFragment_2/self::schold:schema or $theContainedFragments/self::schold:schema">
+ <xsl:message>
+ Schema error: Use include to include
+ fragments, not a whole schema
+ </xsl:message>
+ </xsl:if>
+ <xsl:if
+ test="not($theFragment_2) and not ($theContainedFragments)">
+ <xsl:message terminate="no">
+ <xsl:text>Unable to locate id attribute: </xsl:text>
+ <xsl:value-of select="@href" />
+ </xsl:message>
+ </xsl:if>
+ <!-- If this were XSLT 2, we could use
+ if ($theFragment) then $theFragment else $theContainedFragments
+ here (thanks to KN)
+ -->
+ <xsl:choose>
+ <xsl:when test=" $theFragment_2 ">
+ <xsl:apply-templates
+ select="$theFragment_2 " mode="dsdl:go" />
+ </xsl:when>
+ <xsl:otherwise>
+ <!-- WARNING! EXPERIMENTAL! Use at your own risk. This may be discontinued! -->
+ <xsl:apply-templates
+ select=" $theContainedFragments " mode="dsdl:go" />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:otherwise>
+ </xsl:choose>
+
+ </xsl:otherwise>
+ </xsl:choose>
+
+ <xsl:processing-instruction name="DSDL_INCLUDE_END">
+ <xsl:value-of select="@href" />
+ </xsl:processing-instruction>
+ </xsl:template>
+ <!-- =========================================================== -->
+ <!-- ISO/IEC 19757 - DSDL Document Schema Definition Languages -->
+ <!-- Part 5 - DataType Library Language - DTLL -->
+ <!-- Committee Draft Experimental support only -->
+ <!-- The <include> element may well be replaced by XInclude in -->
+ <!-- any final version. -->
+ <!-- =========================================================== -->
+ <xslt:template match="dtll:include" mode="dsdl:go">
+ <!-- Insert subschema -->
+
+ <xsl:variable name="document-uri"
+ select="substring-before(concat(@href,'#'), '#')" />
+ <xsl:variable name="fragment-id"
+ select="substring-after(@href, '#')" />
+ <xsl:processing-instruction name="DSDL_INCLUDE_START">
+ <xsl:value-of select="@href" />
+ </xsl:processing-instruction>
+ <xsl:choose>
+ <xsl:when test="not( $include-dtll = 'true' )">
+ <xslt:copy>
+ <xslt:copy-of select="@*" />
+ <xslt:apply-templates mode="dsdl:go" />
+ </xslt:copy>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:choose>
+
+ <xsl:when
+ test="string-length( $document-uri ) = 0 and string-length( $fragment-id ) = 0">
+ <xsl:message>
+ Error: Impossible URL in DTLL include
+ </xsl:message>
+ </xsl:when>
+
+ <!-- this case is when there is an embedded schema in the same document elsewhere -->
+ <xslt:when
+ test="string-length( $document-uri ) = 0">
+ <xslt:apply-templates mode="dsdl:go"
+ select="//*[@xml:id= $fragment-id ] | id( $fragment-id)
+ | //*[@id= $fragment-id ]" />
+ </xslt:when>
+
+ <xsl:when
+ test="string-length( $fragment-id ) &gt; 0">
+ <xsl:variable name="theDocument_1"
+ select="document( $document-uri,/ )" />
+ <xsl:if test="not($theDocument_1)">
+ <xsl:message terminate="no">
+ <xsl:text>Unable to open referenced included file: </xsl:text>
+ <xsl:value-of select="@href" />
+ </xsl:message>
+ </xsl:if>
+ <!-- use for-each to rebase id() to $theDocument -->
+ <xsl:for-each select="$theDocument_1">
+ <xsl:variable name="theFragment_1"
+ select="$theDocument_1//*[@xml:id= $fragment-id ]
+ | id( $fragment-id )
+ | $theDocument_1//*[@id= $fragment-id ]" />
+ <xsl:if test="not($theFragment_1)">
+ <xsl:message terminate="no">
+ <xsl:text>Unable to locate id attribute: </xsl:text>
+ <xsl:value-of select="@href" />
+ </xsl:message>
+ </xsl:if>
+ <xsl:apply-templates
+ select=" $theFragment_1[1]" mode="dsdl:go" />
+ </xsl:for-each>
+ </xsl:when>
+
+ <xsl:otherwise>
+ <xsl:variable name="theDocument_2"
+ select="document( $document-uri,/ )" />
+ <xsl:variable name="theFragment_2"
+ select="$theDocument_2/*" />
+
+ <xsl:if test="not($theDocument_2)">
+ <xsl:message terminate="no">
+ <xsl:text>Unable to open referenced included file: </xsl:text>
+ <xsl:value-of select="@href" />
+ </xsl:message>
+ </xsl:if>
+
+ <xsl:if test="not($theFragment_2)">
+ <xsl:message terminate="no">
+ <xsl:text>Unable to locate id attribute: </xsl:text>
+ <xsl:value-of select="@href" />
+ </xsl:message>
+ </xsl:if>
+ <xsl:apply-templates select="$theFragment_2 "
+ mode="dsdl:go" />
+ </xsl:otherwise>
+ </xsl:choose>
+
+ </xsl:otherwise>
+ </xsl:choose>
+ <xsl:processing-instruction name="DSDL_INCLUDE_END">
+ <xsl:value-of select="@href" />
+ </xsl:processing-instruction>
+ </xslt:template>
+
+ <!-- =========================================================== -->
+ <!-- ISO/IEC 19757 - DSDL Document Schema Definition Languages -->
+ <!-- Part 7 - Character Repertoire Description Language - CRDL -->
+ <!-- Final Committee Draft 2008-01-11 Experimental support only -->
+ <!-- =========================================================== -->
+ <xslt:template match="crdl:ref" mode="dsdl:go">
+ <!-- Insert subschema -->
+
+ <xsl:variable name="document-uri"
+ select="substring-before(concat(@href,'#'), '#')" />
+ <xsl:variable name="fragment-id"
+ select="substring-after(@href, '#')" />
+ <xsl:processing-instruction name="DSDL_INCLUDE_START">
+ <xsl:value-of select="@href" />
+ </xsl:processing-instruction>
+ <xsl:choose>
+ <xsl:when test="not( $include-crdl = 'true' )">
+ <xslt:copy>
+ <xslt:copy-of select="@*" />
+ <xslt:apply-templates mode="dsdl:go" />
+ </xslt:copy>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:choose>
+
+ <xsl:when
+ test="string-length( $document-uri ) = 0 and string-length( $fragment-id ) = 0">
+ <xsl:message>
+ Error: Impossible URL in CRDL include
+ </xsl:message>
+ </xsl:when>
+
+ <!-- this case is when there is an embedded schema in the same document elsewhere -->
+ <xslt:when
+ test="string-length( $document-uri ) = 0">
+
+ <xslt:apply-templates mode="dsdl:go"
+ select="//*[@xml:id= $fragment-id ] | id( $fragment-id)
+ | //*[@id= $fragment-id ]" />
+ </xslt:when>
+
+ <xsl:when
+ test="string-length( $fragment-id ) &gt; 0">
+ <xsl:variable name="theDocument_1"
+ select="document( $document-uri,/ )" />
+ <xsl:if test="not($theDocument_1)">
+ <xsl:message terminate="no">
+ <xsl:text>Unable to open referenced included file: </xsl:text>
+ <xsl:value-of select="@href" />
+ </xsl:message>
+ </xsl:if>
+ <!-- use for-each to rebase id() to $theDocument -->
+ <xsl:for-each select="$theDocument_1">
+ <xsl:variable name="theFragment_1"
+ select="$theDocument_1//*[@xml:id= $fragment-id ]
+ | id( $fragment-id )
+ | $theDocument_1//*[@id= $fragment-id ]" />
+
+ <xsl:if test="not($theFragment_1)">
+ <xsl:message terminate="no">
+ <xsl:text>Unable to locate id attribute: </xsl:text>
+ <xsl:value-of select="@href" />
+ </xsl:message>
+ </xsl:if>
+ <xsl:apply-templates select=" $theFragment_1 "
+ mode="dsdl:go" />
+ </xsl:for-each>
+ </xsl:when>
+
+ <xsl:otherwise>
+ <xsl:variable name="theDocument_2"
+ select="document( $document-uri,/ )" />
+ <xsl:variable name="theFragment_2"
+ select="$theDocument_2/*" />
+
+ <xsl:if test="not($theDocument_2)">
+ <xsl:message terminate="no">
+ <xsl:text>Unable to open referenced included file: </xsl:text>
+ <xsl:value-of select="@href" />
+ </xsl:message>
+ </xsl:if>
+ <xsl:if test="not($theFragment_2)">
+ <xsl:message terminate="no">
+ <xsl:text>Unable to locate id attribute: </xsl:text>
+ <xsl:value-of select="@href" />
+ </xsl:message>
+ </xsl:if>
+
+ <xsl:apply-templates select="$theFragment_2"
+ mode="dsdl:go" />
+ </xsl:otherwise>
+ </xsl:choose>
+
+ </xsl:otherwise>
+ </xsl:choose>
+ <xsl:processing-instruction name="DSDL_INCLUDE_END">
+ <xsl:value-of select="@href" />
+ </xsl:processing-instruction>
+ </xslt:template>
+
+
+ <!-- =========================================================== -->
+ <!-- ISO/IEC 19757 - DSDL Document Schema Definition Languages -->
+ <!-- Part 4 - Namespace-based Validation Dispatching Language - NVDL -->
+ <!-- Note: This does not include schemas referenced for -->
+ <!-- validation; it merely handles any simple XIncludes -->
+ <!-- =========================================================== -->
+ <!-- ISO/IEC 19757 - DSDL Document Schema Definition Languages -->
+ <!-- Part 8 - Document Schema Renaming Language - DSRL -->
+ <!-- Note: Final? Committee Draft Experimental support only -->
+ <!-- =========================================================== -->
+ <!-- XInclude support for id based references only, with 1 level -->
+ <!-- of fallback. -->
+ <!-- =========================================================== -->
+
+ <xslt:template mode="dsdl:go"
+ match="xi:include[@href][not(@parseType) or @parseType ='xml']">
+ <!-- Simple inclusions only here -->
+ <xsl:processing-instruction name="DSDL_INCLUDE_START">
+ <xsl:value-of select="@href" />
+ </xsl:processing-instruction>
+ <xsl:choose>
+ <xsl:when test="not( $include-xinclude = 'true' )">
+ <xslt:copy>
+ <xslt:copy-of select="@*" />
+ <xslt:apply-templates mode="dsdl:go" />
+ </xslt:copy>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:choose>
+
+ <xsl:when test="contains( @href, '#')">
+ <xsl:message terminate="yes">
+ Fatal error: XInclude href contains fragment
+ identifier #
+ </xsl:message>
+ </xsl:when>
+
+
+ <xsl:when test="contains( @xpointer, '(')">
+ <xsl:message terminate="yes">
+ Fatal error: Sorry, this software only
+ supports simple ids in XInclude xpointers
+ </xsl:message>
+ </xsl:when>
+
+ <xsl:when
+ test="string-length( @href ) = 0 and string-length( @xpointer ) = 0">
+
+ <xsl:message terminate="yes">
+ Fatal Error: Impossible URL in XInclude
+ include
+ </xsl:message>
+ </xsl:when>
+
+ <!-- this case is when there is an embedded schema in the same document elsewhere -->
+ <xslt:when test="string-length( @href ) = 0">
+
+ <xslt:apply-templates mode="dsdl:go"
+ select="//*[@xml:id= current()/@xpointer ] | id( @xpointer)
+ | //*[@id= current()/@xpointer ]" />
+ </xslt:when>
+
+ <xsl:when
+ test="string-length( @xpointer ) &gt; 0">
+ <xsl:variable name="theDocument_1"
+ select="document( @href,/ )" />
+ <xsl:variable name="theFragment_1"
+ select="$theDocument_1//*[@xml:id= current()/@xpointer ]
+
+ | $theDocument_1//*[@id= current()/@xpointer ]" />
+ <!-- removed
+ | $theDocument_1/id( @xpointer)
+ because it requires rebasing in XSLT1 and that would mess up the use of current()
+ -->
+
+
+ <!-- Allow one level of fallback, to another XInclude -->
+ <xsl:if test="not($theDocument_1)">
+ <xsl:choose>
+ <xsl:when test="xi:fallback">
+ <xsl:variable name="theDocument_2"
+ select="document( xi:fallback[1]/xi:include[not(@parseType)
+ or @parseType='xml']/@href,/ )" />
+ <xsl:variable name="theFragment_2"
+ select="$theDocument_2//*[@xml:id= current()/xi:fallback[1]/xi:include/@xpointer ]
+ | $theDocument_2//*[@id= current()/xi:fallback[1]/xi:include/@xpointer ]" />
+ <!-- removed
+ | $theDocument_2/id( xi:fallback[1]/xi:include/@xpointer)
+ because id() would need rebasing in XSLT1 and that would mess up the use of current()
+ -->
+
+ <xsl:if
+ test="not($theDocument_2)">
+
+ <xsl:message terminate="no">
+ <xsl:text>Unable to open referenced included file and fallback
+ file: </xsl:text>
+ <xsl:value-of
+ select="@href" />
+ </xsl:message>
+ </xsl:if>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:message terminate="no">
+ <xsl:text>Unable to open referenced included file: </xsl:text>
+ <xsl:value-of select="@href" />
+ </xsl:message>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:if>
+ <xsl:apply-templates select=" $theFragment_1"
+ mode="dsdl:go" />
+ </xsl:when>
+
+ <!-- Document but no fragment specified -->
+ <xsl:otherwise>
+ <xsl:variable name="theDocument_3"
+ select="document( @href,/ )" />
+ <xsl:variable name="theFragment_3"
+ select="$theDocument_3/*" />
+
+ <xsl:if test="not($theDocument_3)">
+ <xsl:message terminate="no">
+ <xsl:text>Unable to open referenced included file: </xsl:text>
+ <xsl:value-of select="@href" />
+ </xsl:message>
+ </xsl:if>
+
+ <xsl:apply-templates select="$theFragment_3 "
+ mode="dsdl:go" />
+ </xsl:otherwise>
+ </xsl:choose>
+
+ </xsl:otherwise>
+ </xsl:choose>
+ <xsl:processing-instruction name="DSDL_INCLUDE_END">
+ <xsl:value-of select="@href" />
+ </xsl:processing-instruction>
+ </xslt:template>
+
+ <!-- =========================================================== -->
+ <!-- W3C XLink 1.1 embedded simple links -->
+ <!-- =========================================================== -->
+ <xslt:template
+ match="*[@xlink:href][not(parent::*[@xlink:type='complex'])]
+ [not(@xlink:type) or (@xlink:type='simple')]
+ [@xlink:show='embed']
+ [not(@xlink:actuate) or (@xlink:actuate='onLoad')]"
+ mode="dsdl:go" priority="1">
+
+ <xsl:variable name="document-uri"
+ select="substring-before(concat(@xlink:href,'#'), '#')" />
+ <xsl:variable name="fragment-id"
+ select="substring-after(@xlink:href, '#')" />
+ <xsl:processing-instruction name="DSDL_INCLUDE_START">
+ <xsl:value-of select="@xlink:href" />
+ </xsl:processing-instruction>
+ <xsl:choose>
+ <xsl:when test="not( $include-xlink = 'true' )">
+ <xslt:copy>
+ <xslt:copy-of select="@*" />
+ <xslt:apply-templates mode="dsdl:go" />
+ </xslt:copy>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:choose>
+
+ <xsl:when
+ test="string-length( $document-uri ) = 0 and string-length( $fragment-id ) = 0">
+ <xsl:message>
+ Error: Impossible URL in XLink embedding
+ link
+ </xsl:message>
+ </xsl:when>
+
+ <!-- this case is when there is an embedded schema in the same document elsewhere -->
+ <xslt:when
+ test="string-length( $document-uri ) = 0">
+ <xslt:apply-templates mode="dsdl:go"
+ select="//*[@xml:id= $fragment-id ] | id( $fragment-id)
+ | //*[@id= $fragment-id ]" />
+ </xslt:when>
+
+ <xsl:when
+ test="string-length( $fragment-id ) &gt; 0">
+ <xsl:variable name="theDocument_1"
+ select="document( $document-uri,/ )" />
+ <xsl:if test="not($theDocument_1)">
+ <xsl:message terminate="no">
+ <xsl:text>Unable to open referenced included file: </xsl:text>
+ <xsl:value-of select="@xlink:href" />
+ </xsl:message>
+ </xsl:if>
+ <!-- use for-each to rebase id() to $theDocument -->
+ <xsl:for-each select="$theDocument_1">
+ <xsl:variable name="theFragment_1"
+ select="$theDocument_1//*[@xml:id= $fragment-id ]
+ | id( $fragment-id )
+ | $theDocument_1//*[@id= $fragment-id ]" />
+ <xsl:if test="not($theFragment_1)">
+ <xsl:message terminate="no">
+ <xsl:text>Unable to locate id attribute: </xsl:text>
+ <xsl:value-of select="@xlink:href" />
+ </xsl:message>
+ </xsl:if>
+ <xsl:apply-templates
+ select=" $theFragment_1[1]" mode="dsdl:go" />
+ </xsl:for-each>
+ </xsl:when>
+
+ <xsl:otherwise>
+ <xsl:variable name="theDocument_2"
+ select="document( $document-uri,/ )" />
+ <xsl:variable name="theFragment_2"
+ select="$theDocument_2/*" />
+
+ <xsl:if test="not($theDocument_2)">
+ <xsl:message terminate="no">
+ <xsl:text>Unable to open referenced included file: </xsl:text>
+ <xsl:value-of select="@xlink:href" />
+ </xsl:message>
+ </xsl:if>
+
+ <xsl:if test="not($theFragment_2)">
+ <xsl:message terminate="no">
+ <xsl:text>Unable to locate id attribute: </xsl:text>
+ <xsl:value-of select="@xlink:href" />
+ </xsl:message>
+ </xsl:if>
+ <xsl:apply-templates select="$theFragment_2 "
+ mode="dsdl:go" />
+ </xsl:otherwise>
+ </xsl:choose>
+
+ </xsl:otherwise>
+ </xsl:choose>
+
+ <xsl:processing-instruction name="DSDL_INCLUDE_END">
+ <xsl:value-of select="@xlink:href" />
+ </xsl:processing-instruction>
+ </xslt:template>
+
+
+</xslt:stylesheet> \ No newline at end of file
diff --git a/src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_message.xsl b/src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_message.xsl
new file mode 100644
index 0000000..d59b8f3
--- /dev/null
+++ b/src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_message.xsl
@@ -0,0 +1,55 @@
+<?xml version="1.0" ?><?xar XSLT?>
+<!-- Implementation for the Schematron XML Schema Language.
+ http://www.ascc.net/xml/resource/schematron/schematron.html
+
+ Copyright (c) 2000,2001 Rick Jelliffe and Academia Sinica Computing Center, Taiwan
+
+ This software is provided 'as-is', without any express or implied warranty.
+ In no event will the authors be held liable for any damages arising from
+ the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it freely,
+ subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not claim
+ that you wrote the original software. If you use this software in a product,
+ an acknowledgment in the product documentation would be appreciated but is
+ not required.
+
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+
+ 3. This notice may not be removed or altered from any source distribution.
+-->
+
+<!-- Schematron message -->
+
+<xsl:stylesheet
+ version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:axsl="http://www.w3.org/1999/XSL/TransformAlias">
+
+<xsl:import href="iso_schematron_skeleton_for_xslt1.xsl"/>
+
+<xsl:template name="process-prolog">
+ <axsl:output method="text" />
+</xsl:template>
+
+<!-- use default rule for process-root: copy contents / ignore title -->
+<!-- use default rule for process-pattern: ignore name and see -->
+<!-- use default rule for process-name: output name -->
+<!-- use default rule for process-assert and process-report:
+ call process-message -->
+
+<xsl:template name="process-message">
+ <xsl:param name="pattern" />
+ <xsl:param name="role" />
+ <axsl:message>
+ <xsl:apply-templates mode="text"
+ /> (<xsl:value-of select="$pattern" />
+ <xsl:if test="$role"> / <xsl:value-of select="$role" />
+ </xsl:if>)</axsl:message>
+</xsl:template>
+
+</xsl:stylesheet> \ No newline at end of file
diff --git a/src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_skeleton_for_xslt1.xsl b/src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_skeleton_for_xslt1.xsl
new file mode 100644
index 0000000..b0e7175
--- /dev/null
+++ b/src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_schematron_skeleton_for_xslt1.xsl
@@ -0,0 +1,1796 @@
+<?xml version="1.0"?><?xar XSLT?>
+
+<!--
+ OVERVIEW
+
+ ASCC/Schematron.com Skeleton Module for ISO Schematron (for XSLT1 systems)
+
+ ISO Schematron is a language for making assertions about the presence or absence
+ of patterns in XML documents. It is typically used as a schema language, or
+ to augment existing schema languages, and to check business rules. It is very
+ powerful, yet quite simple: a developer only needs to know XPath and about five other
+ elements.
+
+ This is an open source implementation of ISO Schematron in XSLT. Although ISO does
+ not allow reference implementations which might compete with the text of the
+ standard, this code has been compiled by Rick Jelliffe, inventor of Schematron
+ and editor of the ISO standard; so developers can certainly use it as an
+ unofficial reference implementation for clarification.
+
+ This implementation is based on one by Oliver Becker. API documentation is
+ available separately; try www.schematron.com for this. Funding for this
+ stylesheet over the years has come from Topologi Pty. Ltd., Geotempo Ltd.,
+ and ASCC, Taipei.
+
+ There are two versions of this skeleton: one is tailored for XSLT1 processors
+ and the other is tailored for XSLT2 processors. Future versions of the
+ XSLT2 skeleton may support more features than the XSLT 1 skeleton.
+-->
+<!--
+ TIPS
+
+ A tip for new users of Schematron: make your assertions contain positive messages
+ about what is expected, rather than error messages. For example, use the form
+ "An X should have a Y, because Z".
+
+ Another tip is that Schematron provides an
+ element <sch:ns> for declaring the namespaces and prefixes used in XPaths in
+ attribute values; it does not extend the XML Namespaces mechanism: if a name
+ in an XPath has a prefix, there must be an <sch:ns> element for that prefix; if
+ a name in an XPath does not have a prefix, it is always in no namespace.
+
+ A tip for implementers of Schematron, either using this API or re-implementing it:
+ make the value of the diagnostics, flags and richer features available if possible;
+ Schematron has many of the optional richer features which, if implemented, provide
+ a compelling alternative approach to validation and business-rules checking compared
+ to other schema languages and programs.
+
+ If you create your own meta-stylesheet to override this one, it is a
+ good idea to have both in the same directory and to run the stylesheet
+ from that directory, as many XSLT implementations have idiosyncratic
+ handling of URLs: keep it simple.
+-->
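+
+<!--
+  EXAMPLE (illustrative sketch only)
+
+  A minimal schema applying both tips above: a positively worded assertion and an
+  explicit namespace declaration. The "po" prefix, its URI and the element names
+  are hypothetical and exist only for illustration.
+
+    <iso:schema xmlns:iso="http://purl.oclc.org/dsdl/schematron">
+      <iso:ns prefix="po" uri="http://example.com/purchase-order"/>
+      <iso:pattern>
+        <iso:rule context="po:order">
+          <iso:assert test="po:delivery-date">
+            An order should have a delivery date, because dispatch is scheduled from it.
+          </iso:assert>
+        </iso:rule>
+      </iso:pattern>
+    </iso:schema>
+-->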
+
+
+<!--
+ INVOCATION INFORMATION
+
+ The following parameters are available
+
+ phase NMTOKEN | "#ALL" (default) Select the phase for validation
+ allow-foreign "true" | "false" (default) Pass non-Schematron elements to the generated stylesheet
+ sch.exslt.imports semi-colon delimited string of filenames for some EXSLT implementations
+ message-newline "true" (default) | "false" Generate an extra newline at the end of messages
+ optimize "visit-no-attributes"
+ debug "true" | "false" (default) Debug mode lets compilation continue despite problems
+ attributes "true" | "false" (Autodetecting) Use only when the schema has no attributes as the context nodes
+ only-child-elements "true" | "false" (Autodetecting) Use only when the schema has no comments
+ or PI as the context nodes
+
+ The following parameters can be specified as Schematron variables in diagnostics, assertions and so on.
+ fileNameParameter string
+ fileDirParameter string
+ archiveNameParameter string In case of ZIP files
+ archiveDirParameter string In case of ZIP files
+ output-encoding Use when outputting to XML
+
+ Experimental: USE AT YOUR OWN RISK
+ visit-text "true" "false" Also visit text nodes for context. WARNING: NON_STANDARD.
+ select-contents '' | 'key' | '//' Select different implementation strategies
+
+ Conventions: Meta-stylesheets that override this may use the following parameters
+ generate-paths=true|false generate the @location attribute with XPaths
+ diagnose= yes | no Add the diagnostics to the assertion test in reports
+ terminate= yes | no Terminate on the first failed assertion or successful report
+-->
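+
+<!--
+  EXAMPLE (illustrative sketch only)
+
+  A hypothetical compilation run setting a few of the parameters above, written in
+  the same pseudo command-line style as the PROCESS INFORMATION section below. The
+  exact syntax for passing parameters depends on the XSLT processor, and the file
+  names and the phase name "delivery" are invented for illustration:
+
+    XSLT -input=purchase-order.sch -output=purchase-order.xsl
+         -stylesheet=iso_schematron_skeleton_for_xslt1.xsl
+         phase=delivery allow-foreign=true message-newline=false
+-->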
+
+<!--
+ XSLT VERSION SUPPORT
+
+ XSLT 1:
+ A schema using the standard XSLT 1 query binding will have a /schema/@queryBinding='xslt' or
+ nothing.
+
+ * Note: XT does not implement key() and will die if given it.
+ * Add all formal parameters to default templates
+ * Fix missing apply-templates from process-ns and add params back
+
+ EXSLT: Experimental support
+ A schema using the EXSLT query binding will have a /schema/@queryBinding='exslt'.
+ It is built on XSLT 1. After experience is gained, this binding is expected to be
+ formalized as part of ISO Schematron, which currently reserves the "exslt" name for this purpose.
+
+ Some EXSLT engines have the extra functions built-in. For these, there is no need to
+ provide library locations. For engines that require the functions, either hard code
+ them in this script or provide them via the command-line argument.
+
+-->
+<!--
+ PROCESS INFORMATION
+
+ This stylesheet compiles a Schematron schema (*.sch) into XSLT code (*.xsl).
+ The generated XSLT code can then be run against an XML file (*.xml, etc) and
+ will produce validation results.
+
+ The output of validation results is performed using named templates (process-*).
+ These can be overridden easily by making a new XSLT stylesheet that imports this
+ stylesheet but has its own version of the relevant process-* templates. Several
+ of these invoking stylesheets are available: "iso_svrl.xsl", for example generates
+ ISO Schematron Validation Report Language format results.
+
+ In this version of the stylesheet, the ISO feature called "abstract patterns" is
+ implemented using macro processing: a prior XSLT stage which converts uses
+ of abstract patterns into normal patterns. If you do not use abstract patterns,
+ it is not necessary to preprocess the schema.
+
+ To summarize, a basic process flow for some commandline processor is like this:
+ XSLT -input=xxx.sch -output=xxx.xsl -stylesheet=iso_schematron_skeleton.xsl
+ XSLT -input=document.xml -output=xxx-document.results -stylesheet=xxx.xsl
+
+ iso_svrl.xslt is an implementation of Schematron that can use this skeleton and
+ generate ISO SVRL reports. A process flow for some commandline processor would
+ be like this:
+ XSLT -input=xxx.sch -output=xxx.xsl -stylesheet=iso_svrl.xsl
+ XSLT -input=document.xml -output=xxx-document.results -stylesheet=xxx.xsl
+
+ It is not impossible that ultimately a third stage, to handle macro-preprocessing
+ and inclusion, might be necessary. (The trade-off is in making this XSLT more
+ complex compared to making the outer process more complex.)
+
+ This version has so far been tested with
+ Saxon 8
+ MSXML 4 (or 6?)
+
+ Please note that if you are using SAXON and JAXP, then you should use
+ System.setProperty("javax.xml.transform.TransformerFactory",
+ "net.sf.saxon.TransformerFactoryImpl");
+ rather than
+ System.setProperty("javax.xml.xpath.TransformerFactory",
+ "net.sf.saxon.TransformerFactoryImpl");
+ which does not work, at least for the versions of SAXON we tried.
+-->
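+
+<!--
+  EXAMPLE (illustrative sketch only)
+
+  A minimal invoking meta-stylesheet of the kind described above: it imports this
+  skeleton and overrides two process-* templates. It follows the same pattern as
+  iso_schematron_message.xsl in this directory; the "FAILED:" wording is invented
+  for illustration.
+
+    <xsl:stylesheet version="1.0"
+        xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+        xmlns:axsl="http://www.w3.org/1999/XSL/TransformAlias">
+      <xsl:import href="iso_schematron_skeleton_for_xslt1.xsl"/>
+      <xsl:template name="process-prolog">
+        <axsl:output method="text"/>
+      </xsl:template>
+      <xsl:template name="process-message">
+        <xsl:param name="pattern"/>
+        <xsl:param name="role"/>
+        <axsl:message>FAILED: <xsl:apply-templates mode="text"/> (<xsl:value-of select="$pattern"/>)</axsl:message>
+      </xsl:template>
+    </xsl:stylesheet>
+-->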
+<!--
+ LEGAL INFORMATION
+
+ Copyright (c) 2000-2008 Rick Jelliffe and Academia Sinica Computing Center, Taiwan
+
+ This software is provided 'as-is', without any express or implied warranty.
+ In no event will the authors be held liable for any damages arising from
+ the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it freely,
+ subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not claim
+ that you wrote the original software. If you use this software in a product,
+ an acknowledgment in the product documentation would be appreciated but is
+ not required.
+
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+
+ 3. This notice may not be removed or altered from any source distribution.
+-->
+<!--
+ NOTE: Compared to the iso_schematron_skeleton_for_saxon.xsl code, this version is currently missing
+ 1) localization
+ 2) properties
+ 3) pattern/@documents
+
+ VERSION INFORMATION
+ 2009-02-25 RJ
+ * Fix up variable names so none are used twice in same template
+ * Tested on SAXON 9, Xalan 2.7.1. Partly tested MSXML.
+ 2008-09-19 RJ
+ * Add mode schematron-select-full-path and param full-path-notation
+
+ 2008-08-11
+ * TT report/@flag was missing
+ 2008-08-06
+ * TT Top-level lets need to be implemented using xsl:param not xsl:variable
+ * TT xsl:param/@select must have XPath or not be specified
+
+ Version: 2008-07-28
+ * KH schematron-get-full-path-3 has [index] even on top step
+ * RJ fix schematron-get-full-path to have namespace predicate, I don't know why this was removed
+
+ Version: 2008-07-24
+ * RJ clean out commented out namespace handling code
+ * RJ add support for experimental non-standard attribute report/@action
+ and assert/@action, and add parameter not in the published API (should
+ not break anything, it is XSLT1)
+ * RJ Remove remaining XSLT2 code for ease of reading
+
+ Version: 2008-07-14 minor update for inclusion experiments
+ * RJ Clean up zero-length fragment test on include
+ * RJ Add experimental support for include containers
+ * RJ For path generation, test for //iso:schema not just /iso:schema, for potential embedded Schematron support
+ * RJ Don't generate double error messages for old namespace elements
+ * RJ Experimental iso:rule/iso:title just kept as comment (bigger request Uche Ogbuji)
+ * RJ Remove spurious debug messages
+ * RJ Fix bug that prevented including patterns in this (report Roger
+ Costello)
+
+ Version: 2007-10-17
+ From this version on I am forking XSLT2 support to a different version of the script.
+ This is due to the increasingly horrible state of the namespace handling code as well
+ as other inconsistencies between the major implementations of different versions.
+ The intent is that future versions of this will have XSLT2 isms removed and be simplified
+ to cope with only XSLT1 and EXSLT. Note that though this version is called
+ iso_schematron_skeleton_for_xslt1, the various meta-stylesheets will continue to just call
+ iso_schematron_skeleton: it is up to you to rename the stylesheet to the one you want to
+ use.
+
+ * RJ fix FULL-PATH problem with attribute names
+
+
+ Version: 2007-07-19
+ Accept most changes in David Carlisle's fork, but continue as XSLT1 script:
+ http://dpcarlisle.blogspot.com/search/label/schematron
+ * DPC Remove "optimize" parameter
+ * DPC Add autodetecting optimize parameter attribute to skip checking attribute
+ context
+ * DPC Add autodetecting optimize parameter only-child-elements to turn off checking for
+ comments and PIs
+ * DPC (Experimental: NON_STANDARD DANGER!) Add param visit-text to visit text
+ nodes too for context
+ * DPC Fix inclusion syntax to allow #
+ * DPC Priorities count up from 1000 not down from 4000 to allow more rules
+ * RJ Add new template for titles of schemas, with existing behaviour.
+ Override process-schema-title for custom processing of title
+
+
+ Version: 2007-04-04
+ * RJ debug mode param
+ * RJ alter mixed test to only test mixed branches, so the same document
+ could have old and new namespaces schemas in it, but each schema must
+ be distinct, just so as not to overconstrain things.
+ * KH zero-length include/@href is fatal error, but allow debug mode
+ * SB add hint on SAXON and JAXP
+ * DC generate-full-path-1 generates XLST1 code by default
+ Version: 2007-03-05
+ * AS Typo for EXSLT randome, improve comment
+ * KH get-schematron-full-path-2 needs to apply to attributes too
+ * DP document policy on extensions better
+ * DC use copy-of not copy for foreign elements
+ * DC add generate-path-2
+ * DC don't try to apply templates to attribute axis on attribute nodes, to
+ stop SAXON warning.
+ * RJ improve reporting of typos
+
+ Version: 2007-02-08
+ * KH Schematron fullpath implementation: @* handled twice and / missing
+ * KH Change stylesheetbody from named template to mode to allow implementers more flexibility.
+ Move process-ns to outside the stylesheet body.
+ * DP, FG, fix handling of xslt:key
+ * FG no iso:title/@class
+ * Experimental optimization 'visit-no-attributes'
+ * KH Experimental added schematron-get-full-path-2 which gives prefixed version for humans
+ * DC Move stylesheet/@version generation to after namespace handling
+ * DC, FG EXSLT namespace handling code
+ * FG add ref and commented code from FG's page on namespaces
+ * Start adding normalize-space() to parameter code
+ * Add a space between diagnostics
+
+ Version: 2007-01-22
+ * DP change = ($start) to = $start and =($phase) to =$phase
+ to run under Saxon 8.8j
+ * FG better title section using ( @id | sch:title)[last()]
+ * Default query language binding is "xslt" not "xslt1"
+
+ Version: 2007-01-19
+ * Simplify message newline code
+ * Remove termination and xpath appending to message options:
+ factor out as iso_schematron_terminator.xsl
+ * Comment out XSLT2 namespace fix temporarily
+
+ Version: 2007-01-18 (First beta candidate for comment)
+ * DC remove xml:space="preserve"
+ * FG improve comment on import statement
+ * DC improve comments on invocation section
+ * Add exploratory support for sch:schema[@queryBinding='xpath']
+ by allowing it and warning as lets are found
+ * Be strict about queryBinding spelling errors
+ * Extra comments on the different queryBindings
+ * KH Add option "message-paths" to generate XPath from output
+ * KH Add option "terminate" to halt with an error after the first assertion
+ * KH refactor paths in schematron-full-path
+ * Improve (?) namespace handling: no dummy attributes for prefix "xsl" generated
+
+ Version: 2007-01-15
+ * FG fix for calling templates
+ * Add formal parameters to default templates: may help XSLT 2
+ * Fix get-schematron-full-path
+ * Include skeleton1-6 is commented out by default
+
+ Version:2007-01-12 (Pre-beta release to Schematron-love-in maillist)
+ * Add many extra parameters to the process-* calls, so that almost
+ all the information in the schema can be provided to client programs.
+ Also, rearrange the parameters to fit in with the ISO schema, which
+ has "rich" and "linkable" attribute groups.
+ * Warn on diagnostics with no ID once only
+ * Improved path reporting, to handle namespaces
+ * Add process-title dummy template for API
+ * Add command-line parameter allow-foreign (true|false) to suppress
+ warnings on foreign elements and pass them through to the generated
+ stylesheet
+ * remove legacy templates for the old ASCC namespace and no namespace,
+ and use an import statement instead. Much cleaner now!
+ * patterns use @id not @name
+ * titles can contain sub-elements
+ * start change sch:rule to allow attributes, PIs and comments
+ * the default process-* for inline elements add a leading and trailing
+ space, to reduce the chance of concatenation.
+ * add comments to make the generated code clearer
+
+ Version:2006-11-07 (ISO: first release private to schematron-love-in maillist for review)
+ * Duplicate pattern templates, for handling ISO namespace
+ * Add priority onto default and paragraph templates
+ * Add namespace checks
+ * Handle key in xsl namespace not iso
+ * Add include
+ * Improve namespace handling
+ * Preliminary XSLT2 and EXSLT support
+ * Refactor iso:schema for clarity
+
+ Version: 2003-05-26
+ * Fix bug with key
+ Version: 2003-04-16
+ * handle 1.6 let expressions
+ * make key use XSLT names, and allow anywhere
+ Version: 2001-06-13
+ * same skeleton now supports namespace or no namespace
+ * parameters to handlers updated for all 1.5 attributes
+ * diagnostic hints supported: command-line option diagnose=yes|no
+ * phases supported: command-line option phase=#ALL|...
+ * abstract rules
+ * compile-time error messages
+ * add utility routine generate-id-from-path
+
+ Contributors: Rick Jelliffe (original), Oliver Becker (architecture, XSLT2),
+ Miloslav Nic (diagnostic, phase, options), Ludwig Svenonius (abstract)
+ Uche Ogbuji (misc. bug fixes), Jim Ancona (SAXON workaround),
+ Francis Norton (generate-id-from-path), Robert Leftwich, Bryan Rasmussen,
+ Dave Pawson (include, fallback), Florent Georges (namespaces, exslt, attribute
+ context), Benoit Maisonny (attribute context), John Dumps (process-message newline),
+ Cliff Stanford (diagnostics and other newlines)
+
+
+ KNOWN TYPICAL LIMITATIONS:
+ * Don't use <sch:ns prefix="xsl" .../> with a namespace other than the standard
+ XSLT one. This would be a bizarre thing to do anyway.
+ * Don't use other prefixes for the XSLT namespace either; some implementations will
+ not handle it correctly.
+
+ EXTENSIONS:
+ ISO Schematron is designed as a framework with some standard query language
+ bindings. If you need to support other features, please do so safely by making
+ up your own @queryBinding name: this makes it clear that your schema requires
+ special features. For example, default ISO Schematron does not support user
+ defined functions; so if you want to use the user defined function feature
+ in XSLT, you need to have a schema with some queryBinding attribute name like
+ "XSLT-with-my-functions" or whatever.
+-->
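+
+<!--
+  EXAMPLE (illustrative sketch only)
+
+  A schema opting into a non-standard binding, using the example name given above.
+  The query-language extensions are assumed to be handled by a stylesheet that
+  accepts this binding, and the schema body is elided:
+
+    <iso:schema xmlns:iso="http://purl.oclc.org/dsdl/schematron"
+                queryBinding="XSLT-with-my-functions">
+      ...
+    </iso:schema>
+-->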
+
+
+
+
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:axsl="http://www.w3.org/1999/XSL/TransformAlias"
+ xmlns:sch="http://www.ascc.net/xml/schematron"
+ xmlns:iso="http://purl.oclc.org/dsdl/schematron"
+ xmlns:exsl="http://exslt.org/common"
+ xmlns:msxsl="urn:schemas-microsoft-com:xslt"
+ extension-element-prefixes="exsl msxsl"
+ >
+<!-- This program implements ISO Schematron, except for abstract patterns, which require a preprocessing step. -->
+
+
+<xsl:namespace-alias stylesheet-prefix="axsl" result-prefix="xsl"/>
+
+
+<!-- Category: top-level-element -->
+<xsl:output method="xml" omit-xml-declaration="no" standalone="yes" indent="yes"/>
+
+
+<xsl:param name="phase">
+ <xsl:choose>
+ <xsl:when test="//sch:schema/@defaultPhase">
+ <xsl:value-of select="//sch:schema/@defaultPhase"/>
+ </xsl:when>
+ <xsl:when test="//iso:schema/@defaultPhase">
+ <xsl:value-of select="//iso:schema/@defaultPhase"/>
+ </xsl:when>
+ <xsl:otherwise>#ALL</xsl:otherwise>
+ </xsl:choose>
+</xsl:param>
+
+<xsl:param name="allow-foreign">false</xsl:param>
+
+<xsl:param name="message-newline">true</xsl:param>
+
+<!-- DPC set to true if contexts should be checked on attribute nodes
+ defaults to true if there is any possibility that a context could match an attribute,
+ err on the side of caution, a context of *[.='@'] would cause this param to default to true
+ even though @ is in a string
+-->
+<xsl:param name="attributes">
+ <xsl:choose>
+ <xsl:when test="//iso:rule[contains(@context,'@') or contains(@context,'attribute')]">true</xsl:when>
+ <xsl:otherwise>false</xsl:otherwise>
+ </xsl:choose>
+</xsl:param>
+
+<!-- DPC set to true if contexts should be checked on just elements in the child axis
+ defaults to true if there is any possibility that a context could match a comment or PI
+ err on the side of caution, a context of *[.='('] would cause this param to default to true
+ even though ( is in a string, but node() comment() and processing-instruction() all have a (
+-->
+<xsl:param name="only-child-elements">
+ <xsl:choose>
+ <xsl:when test="//iso:rule[contains(@context,'(')]">true</xsl:when>
+ <xsl:otherwise>false</xsl:otherwise>
+ </xsl:choose>
+</xsl:param>
+
+<!-- DPC set to true if contexts should be checked on text nodes (if only-child-elements is false)
+ THIS IS NON CONFORMANT BEHAVIOUR JUST FOR DISCUSSION OF A POSSIBLE CHANGE TO THE
+ SPECIFICATION. THIS PARAM SHOULD GO IF THE FINAL DECISION IS THAT THE SPEC DOES NOT CHANGE.
+ Always defaults to false
+-->
+<xsl:param name="visit-text" select="'false'"/>
+
+<!-- DPC
+ When selecting contexts the specified behaviour is
+ @*|node()[not(self::text())]
+ The automatic settings may use
+ node()[not(self::text())]
+ @*|*
+ *
+ instead for schema for which they are equivalent.
+ If the params are set explicitly the above may be used, and also either of
+ @*
+ @*|node()
+ in all cases the result may not be equivalent, for example if you specify no attributes and the schema
+ does have attribute contexts they will be silently ignored.
+
+ after testing it turns out that
+ node()[not(self::text())] is slower in saxon than *|comment()|processing-instruction()
+ which I find a bit surprising but anyway I'll use the longer, faster version.
+-->
+<xsl:variable name="context-xpath">
+ <xsl:if test="$attributes='true'">@*|</xsl:if>
+ <xsl:choose>
+ <xsl:when test="$only-child-elements='true'">*</xsl:when>
+ <xsl:when test="$visit-text='true'">node()</xsl:when>
+ <xsl:otherwise>*|comment()|processing-instruction()</xsl:otherwise>
+ </xsl:choose>
+</xsl:variable>
+
+<!-- DPC if this is set to
+ '' use recursive templates to iterate over document tree,
+ 'key' select all contexts with a key rather than walking the tree explicitly in each mode
+ '//' select all contexts with // rather than walking the tree explicitly in each mode (XSLT2 only)
+-->
+<xsl:param name="select-contexts" select="''"/>
+
+
+<xsl:param name="output-encoding"/>
+<!-- e.g. saxon file.xml file.xsl "sch.exslt.imports=.../string.xsl;.../math.xsl" -->
+<xsl:param name="sch.exslt.imports"/>
+
+<!-- Set the language code for messages -->
+<xsl:param name="langCode">default</xsl:param>
+
+<xsl:param name="debug">false</xsl:param>
+
+
+<!-- Set the default for schematron-select-full-path, i.e. the notation for svrl's @location-->
+<xsl:param name="full-path-notation">1</xsl:param>
+
+<!-- Simple namespace check -->
+<xsl:template match="/">
+ <xsl:if test="//sch:*[ancestor::iso:* or descendant::iso:*]">
+ <xsl:message>Schema error: Schematron elements in old and new namespaces found</xsl:message>
+ <xsl:if test=" $debug = 'false' " />
+ </xsl:if>
+
+ <xsl:apply-templates />
+</xsl:template>
+
+
+<!-- ============================================================== -->
+<!-- ISO SCHEMATRON SCHEMA ELEMENT -->
+<!-- Not handled: Abstract patterns. A pre-processor is assumed. -->
+<!-- ============================================================== -->
+
+<!-- SCHEMA -->
+<!-- Default uses XSLT 1 -->
+<xsl:template match="iso:schema[not(@queryBinding) or @queryBinding='xslt'
+ or @queryBinding='xslt1' or @queryBinding='XSLT' or @queryBinding='XSLT1'
+ or @queryBinding='xpath']">
+ <xsl:if test="
+ @queryBinding='xslt1' or @queryBinding='XSLT' or @queryBinding='XSLT1'">
+ <xsl:message>Schema error: in the queryBinding attribute, use 'xslt'</xsl:message>
+ </xsl:if>
+ <axsl:stylesheet>
+ <xsl:apply-templates select="iso:ns"/>
+ <!-- Handle the namespaces before the version attribute: reported to help SAXON -->
+ <xsl:attribute name="version">1.0</xsl:attribute>
+
+ <xsl:apply-templates select="." mode="stylesheetbody"/>
+ <!-- was xsl:call-template name="stylesheetbody"/ -->
+ </axsl:stylesheet>
+</xsl:template>
+
+<!-- Using EXSLT with all modules (except function module: not applicable) -->
+<xsl:template match="iso:schema[@queryBinding='exslt']" priority="10">
+ <xsl:comment>This XSLT was automatically generated from a Schematron schema.</xsl:comment>
+ <axsl:stylesheet
+ xmlns:date="http://exslt.org/dates-and-times"
+ xmlns:dyn="http://exslt.org/dynamic"
+ xmlns:exsl="http://exslt.org/common"
+ xmlns:math="http://exslt.org/math"
+ xmlns:random="http://exslt.org/random"
+ xmlns:regexp="http://exslt.org/regular-expressions"
+ xmlns:set="http://exslt.org/sets"
+ xmlns:str="http://exslt.org/strings"
+ extension-element-prefixes="date dyn exsl math random regexp set str" >
+
+ <xsl:apply-templates select="iso:ns"/>
+ <!-- Handle the namespaces before the version attribute: reported to help SAXON -->
+ <xsl:attribute name="version">1.0</xsl:attribute>
+
+ <xsl:apply-templates select="." mode="stylesheetbody"/>
+ <!-- was xsl:call-template name="stylesheetbody"/ -->
+ </axsl:stylesheet>
+</xsl:template>
+
+
+<!-- Default uses XSLT 1 -->
+<xsl:template match="iso:schema" priority="-1">
+ <xsl:message terminate="yes" >Fail: This implementation of ISO Schematron does not work with
+ schemas using the "<xsl:value-of select="@queryBinding"/>" query language.</xsl:message>
+</xsl:template>
+
+<xsl:template match="*" mode="stylesheetbody">
+ <!--xsl:template name="stylesheetbody"-->
+ <xsl:comment>Implementers: please note that overriding process-prolog or process-root is
+ the preferred method for meta-stylesheets to use where possible. </xsl:comment><xsl:text>&#10;</xsl:text>
+
+ <!-- These parameters may contain strings with the name and directory of the file being
+ validated. For convenience, if the caller only has the information in a single string,
+ that string could be put in fileDirParameter. The archives parameters are available
+ for ZIP archives.
+ -->
+
+ <axsl:param name="archiveDirParameter" />
+ <axsl:param name="archiveNameParameter" />
+ <axsl:param name="fileNameParameter" />
+ <axsl:param name="fileDirParameter" />
+
+ <xsl:call-template name="iso:exslt.add.imports" />
+ <xsl:text>&#10;&#10;</xsl:text><xsl:comment>PHASES</xsl:comment><xsl:text>&#10;</xsl:text>
+ <xsl:call-template name="handle-phase"/>
+ <xsl:text>&#10;&#10;</xsl:text><xsl:comment>PROLOG</xsl:comment><xsl:text>&#10;</xsl:text>
+ <xsl:call-template name="process-prolog"/>
+ <xsl:text>&#10;&#10;</xsl:text><xsl:comment>KEYS</xsl:comment><xsl:text>&#10;</xsl:text>
+ <xsl:apply-templates mode="do-keys" select="xsl:key "/>
+ <xsl:text>&#10;&#10;</xsl:text><xsl:comment>DEFAULT RULES</xsl:comment><xsl:text>&#10;</xsl:text>
+ <xsl:call-template name="generate-default-rules" />
+ <xsl:text>&#10;&#10;</xsl:text><xsl:comment>SCHEMA METADATA</xsl:comment><xsl:text>&#10;</xsl:text>
+ <xsl:call-template name="handle-root"/>
+ <xsl:text>&#10;&#10;</xsl:text><xsl:comment>SCHEMATRON PATTERNS</xsl:comment><xsl:text>&#10;</xsl:text>
+
+ <xsl:apply-templates select="*[not(self::iso:ns)] " />
+</xsl:template>
+
+ <xsl:template name="iso:exslt.add.imports">
+ <xsl:param name="imports" select="$sch.exslt.imports"/>
+ <xsl:choose>
+ <xsl:when test="contains($imports, ';')">
+ <axsl:import href="{ substring-before($imports, ';') }"/>
+ <xsl:call-template name="iso:exslt.add.imports">
+ <xsl:with-param name="imports" select="substring-after($imports, ';')"/>
+ </xsl:call-template>
+ </xsl:when>
+ <xsl:when test="$imports">
+ <axsl:import href="{ $imports }"/>
+ </xsl:when>
+ </xsl:choose>
+ </xsl:template>
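+ <!-- Illustration (hypothetical file names): passing sch.exslt.imports=string.xsl;math.xsl
+      makes this template emit <xsl:import href="string.xsl"/> followed by
+      <xsl:import href="math.xsl"/> into the generated stylesheet. -->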
+
+<xsl:template name="handle-phase" >
+ <xsl:if test="not(normalize-space( $phase ) = '#ALL')">
+ <xsl:if test="not(iso:phase[@id = normalize-space( $phase )])">
+ <xsl:message>Phase Error: no phase with name <xsl:value-of select="normalize-space( $phase )"
+ /> has been defined.</xsl:message>
+ </xsl:if>
+ </xsl:if>
+</xsl:template>
+
+<xsl:template name="generate-default-rules">
+ <xsl:text>&#10;&#10;</xsl:text>
+ <xsl:comment>MODE: SCHEMATRON-SELECT-FULL-PATH</xsl:comment><xsl:text>&#10;</xsl:text>
+ <xsl:comment>This mode can be used to generate an ugly though full XPath for locators</xsl:comment><xsl:text>&#10;</xsl:text>
+ <axsl:template match="*" mode="schematron-select-full-path">
+ <xsl:choose>
+ <xsl:when test=" $full-path-notation = '1' ">
+ <!-- Use for computers, but rather unreadable for humans -->
+ <axsl:apply-templates select="." mode="schematron-get-full-path"/>
+ </xsl:when>
+ <xsl:when test=" $full-path-notation = '2' ">
+ <!-- Use for humans, but no good for paths unless namespaces are known out-of-band -->
+ <axsl:apply-templates select="." mode="schematron-get-full-path-2"/>
+ </xsl:when>
+ <xsl:when test=" $full-path-notation = '3' ">
+ <!-- Obsolescent. Use for humans, but no good for paths unless namespaces are known out-of-band -->
+ <axsl:apply-templates select="." mode="schematron-get-full-path-3"/>
+ </xsl:when>
+
+ <xsl:otherwise >
+ <!-- Use for computers, but rather unreadable for humans -->
+ <axsl:apply-templates select="." mode="schematron-get-full-path"/>
+ </xsl:otherwise>
+ </xsl:choose>
+ </axsl:template>
+
+
+ <xsl:text>&#10;&#10;</xsl:text>
+ <xsl:comment>MODE: SCHEMATRON-FULL-PATH</xsl:comment><xsl:text>&#10;</xsl:text>
+ <xsl:comment>This mode can be used to generate an ugly though full XPath for locators</xsl:comment><xsl:text>&#10;</xsl:text>
+ <axsl:template match="*" mode="schematron-get-full-path">
+ <axsl:apply-templates select="parent::*" mode="schematron-get-full-path"/>
+
+ <!-- XSLT1 syntax -->
+
+ <axsl:text>/</axsl:text>
+ <axsl:choose>
+ <axsl:when test="namespace-uri()=''">
+ <axsl:value-of select="name()"/>
+ <axsl:variable name="p_1" select="1+
+ count(preceding-sibling::*[name()=name(current())])" />
+ <axsl:if test="$p_1&gt;1 or following-sibling::*[name()=name(current())]">
+ <xsl:text/>[<axsl:value-of select="$p_1"/>]<xsl:text/>
+ </axsl:if>
+ </axsl:when>
+ <axsl:otherwise>
+ <axsl:text>*[local-name()='</axsl:text>
+ <axsl:value-of select="local-name()"/><axsl:text>' and namespace-uri()='</axsl:text>
+ <axsl:value-of select="namespace-uri()"/>
+ <axsl:text>']</axsl:text>
+ <axsl:variable name="p_2" select="1+
+ count(preceding-sibling::*[local-name()=local-name(current())])" />
+ <axsl:if test="$p_2&gt;1 or following-sibling::*[local-name()=local-name(current())]">
+ <xsl:text/>[<axsl:value-of select="$p_2"/>]<xsl:text/>
+ </axsl:if>
+ </axsl:otherwise>
+ </axsl:choose>
+ </axsl:template>
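+ <!-- Example of the locator this mode generates (hypothetical instance document):
+      the second <para> child of a no-namespace <chapter> yields /chapter/para[2],
+      while a namespaced element yields the longer
+      /*[local-name()='chapter' and namespace-uri()='http://example.com/ns'] form. -->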
+
+
+ <axsl:template match="@*" mode="schematron-get-full-path">
+
+ <!-- XSLT1 syntax -->
+ <axsl:text>/</axsl:text>
+ <axsl:choose>
+ <axsl:when test="namespace-uri()=''">@<axsl:value-of
+ select="name()"/></axsl:when>
+ <axsl:otherwise>
+ <axsl:text>@*[local-name()='</axsl:text>
+ <axsl:value-of select="local-name()"/>
+ <axsl:text>' and namespace-uri()='</axsl:text>
+ <axsl:value-of select="namespace-uri()"/>
+ <axsl:text>']</axsl:text>
+ </axsl:otherwise>
+ </axsl:choose>
+
+ </axsl:template>
+
+
+ <xsl:text>&#10;&#10;</xsl:text>
+
+ <xsl:comment>MODE: SCHEMATRON-FULL-PATH-2</xsl:comment>
+ <xsl:text>&#10;</xsl:text>
+ <xsl:comment>This mode can be used to generate prefixed XPath for humans</xsl:comment>
+ <xsl:text>&#10;</xsl:text>
+ <!--simplify the error messages by using the namespace prefixes of the
+ instance rather than the generic namespace-uri-styled qualification-->
+ <axsl:template match="node() | @*" mode="schematron-get-full-path-2">
+ <!--report the element hierarchy-->
+ <axsl:for-each select="ancestor-or-self::*">
+ <axsl:text>/</axsl:text>
+ <axsl:value-of select="name(.)"/>
+ <axsl:if test="preceding-sibling::*[name(.)=name(current())]">
+ <axsl:text>[</axsl:text>
+ <axsl:value-of
+ select="count(preceding-sibling::*[name(.)=name(current())])+1"/>
+ <axsl:text>]</axsl:text>
+ </axsl:if>
+ </axsl:for-each>
+ <!--report the attribute-->
+ <axsl:if test="not(self::*)">
+ <axsl:text/>/@<axsl:value-of select="name(.)"/>
+ </axsl:if>
+ </axsl:template>
+
+ <xsl:text>&#10;&#10;</xsl:text>
+ <xsl:comment>MODE: GENERATE-ID-FROM-PATH </xsl:comment><xsl:text>&#10;</xsl:text>
+ <!-- repeatable-id maker derived from Francis Norton's. -->
+ <!-- use this if you need to generate ids in separate passes,
+ because generate-id() is not guaranteed to produce the same
+ results each time. These ids are not XML names but closer to paths. -->
+ <axsl:template match="/" mode="generate-id-from-path"/>
+ <axsl:template match="text()" mode="generate-id-from-path">
+ <axsl:apply-templates select="parent::*" mode="generate-id-from-path"/>
+ <axsl:value-of select="concat('.text-', 1+count(preceding-sibling::text()), '-')"/>
+ </axsl:template>
+ <axsl:template match="comment()" mode="generate-id-from-path">
+ <axsl:apply-templates select="parent::*" mode="generate-id-from-path"/>
+ <axsl:value-of select="concat('.comment-', 1+count(preceding-sibling::comment()), '-')"/>
+ </axsl:template>
+ <axsl:template match="processing-instruction()" mode="generate-id-from-path">
+ <axsl:apply-templates select="parent::*" mode="generate-id-from-path"/>
+ <axsl:value-of
+ select="concat('.processing-instruction-', 1+count(preceding-sibling::processing-instruction()), '-')"/>
+ </axsl:template>
+ <axsl:template match="@*" mode="generate-id-from-path">
+ <axsl:apply-templates select="parent::*" mode="generate-id-from-path"/>
+ <axsl:value-of select="concat('.@', name())"/>
+ </axsl:template>
+ <axsl:template match="*" mode="generate-id-from-path" priority="-0.5">
+ <axsl:apply-templates select="parent::*" mode="generate-id-from-path"/>
+ <axsl:text>.</axsl:text>
+<!--
+ <axsl:choose>
+ <axsl:when test="count(. | ../namespace::*) = count(../namespace::*)">
+ <axsl:value-of select="concat('.namespace::-',1+count(namespace::*),'-')"/>
+ </axsl:when>
+ <axsl:otherwise>
+-->
+ <axsl:value-of
+ select="concat('.',name(),'-',1+count(preceding-sibling::*[name()=name(current())]),'-')"/>
+<!--
+ </axsl:otherwise>
+ </axsl:choose>
+-->
+ </axsl:template>
+
+
+ <xsl:comment>MODE: SCHEMATRON-FULL-PATH-3</xsl:comment>
+
+ <xsl:text>&#10;</xsl:text>
+ <xsl:comment>This mode can be used to generate prefixed XPath for humans
+ (Top-level element has index)</xsl:comment>
+ <xsl:text>&#10;</xsl:text>
+ <!--simplify the error messages by using the namespace prefixes of the
+ instance rather than the generic namespace-uri-styled qualification-->
+ <axsl:template match="node() | @*" mode="schematron-get-full-path-3">
+ <!--report the element hierarchy-->
+ <axsl:for-each select="ancestor-or-self::*">
+ <axsl:text>/</axsl:text>
+ <axsl:value-of select="name(.)"/>
+ <axsl:if test="parent::*">
+ <axsl:text>[</axsl:text>
+ <axsl:value-of
+ select="count(preceding-sibling::*[name(.)=name(current())])+1"/>
+ <axsl:text>]</axsl:text>
+ </axsl:if>
+ </axsl:for-each>
+ <!--report the attribute-->
+ <axsl:if test="not(self::*)">
+ <axsl:text/>/@<axsl:value-of select="name(.)"/>
+ </axsl:if>
+ </axsl:template>
+
+ <xsl:text>&#10;&#10;</xsl:text>
+ <xsl:comment>MODE: GENERATE-ID-2 </xsl:comment><xsl:text>&#10;</xsl:text>
+ <!-- repeatable-id maker from David Carlisle. -->
+ <!-- use this if you need to generate IDs in separate passes,
+ because generate-id() is not guaranteed to produce the same
+ results each time. These IDs are well-formed XML NMTOKENS -->
+ <axsl:template match="/" mode="generate-id-2">U</axsl:template>
+
+ <axsl:template match="*" mode="generate-id-2" priority="2">
+ <axsl:text>U</axsl:text>
+ <axsl:number level="multiple" count="*"/>
+ </axsl:template>
+
+ <axsl:template match="node()" mode="generate-id-2">
+ <axsl:text>U.</axsl:text>
+ <axsl:number level="multiple" count="*"/>
+ <axsl:text>n</axsl:text>
+ <axsl:number count="node()"/>
+ </axsl:template>
+
+ <axsl:template match="@*" mode="generate-id-2">
+ <axsl:text>U.</axsl:text>
+ <axsl:number level="multiple" count="*"/>
+ <axsl:text>_</axsl:text>
+ <axsl:value-of select="string-length(local-name(.))"/>
+ <axsl:text>_</axsl:text>
+ <axsl:value-of select="translate(name(),':','.')"/>
+ </axsl:template>
+
+
+ <xsl:comment>Strip characters</xsl:comment>
+ <axsl:template match="text()" priority="-1" />
+
+ </xsl:template>
+
+ <xsl:template name="handle-root">
+ <!-- Process the top-level element -->
+ <axsl:template match="/">
+ <xsl:call-template name="process-root">
+ <xsl:with-param
+ name="title" select="(@id | iso:title)[last()]"/>
+ <xsl:with-param name="version" select="'iso'" />
+ <xsl:with-param name="schemaVersion" select="@schemaVersion" />
+ <xsl:with-param name="queryBinding" select="@queryBinding" />
+ <xsl:with-param name="contents">
+ <xsl:apply-templates mode="do-all-patterns"/>
+ </xsl:with-param>
+
+ <!-- "Rich" properties -->
+ <xsl:with-param name="fpi" select="@fpi"/>
+ <xsl:with-param name="icon" select="@icon"/>
+ <xsl:with-param name="id" select="@id"/>
+ <xsl:with-param name="lang" select="@xml:lang"/>
+ <xsl:with-param name="see" select="@see" />
+ <xsl:with-param name="space" select="@xml:space" />
+
+
+ <!-- Non-standard extensions not part of the API yet -->
+ <xsl:with-param name="action" select="@action" />
+ </xsl:call-template>
+ </axsl:template>
+
+
+</xsl:template>
+
+<!-- ============================================================== -->
+<!-- ISO SCHEMATRON ELEMENTS -->
+<!-- ============================================================== -->
+
+ <!-- ISO ACTIVE -->
+ <xsl:template match="iso:active">
+ <xsl:if test="not(@pattern)">
+ <xsl:message>Markup Error: no pattern attribute in &lt;active></xsl:message>
+ </xsl:if>
+
+ <xsl:if test="not(../../iso:pattern[@id = current()/@pattern])
+ and not(../../iso:include)">
+ <xsl:message>Reference Error: the pattern "<xsl:value-of select="@pattern"
+ />" has been activated but is not declared</xsl:message>
+ </xsl:if>
+ </xsl:template>
+
+ <!-- ISO ASSERT and REPORT -->
+ <xsl:template match="iso:assert">
+
+ <xsl:if test="not(@test)">
+ <xsl:message>Markup Error: no test attribute in &lt;assert></xsl:message>
+ </xsl:if>
+ <xsl:text>&#10;&#10; </xsl:text>
+ <xsl:comment>ASSERT <xsl:value-of select="@role" /> </xsl:comment><xsl:text>&#10;</xsl:text>
+
+ <axsl:choose>
+ <axsl:when test="{@test}"/>
+ <axsl:otherwise>
+ <xsl:call-template name="process-assert">
+ <xsl:with-param name="test" select="normalize-space(@test)" />
+ <xsl:with-param name="diagnostics" select="@diagnostics"/>
+ <xsl:with-param name="flag" select="@flag"/>
+
+ <!-- "Rich" properties -->
+ <xsl:with-param name="fpi" select="@fpi"/>
+ <xsl:with-param name="icon" select="@icon"/>
+ <xsl:with-param name="id" select="@id"/>
+ <xsl:with-param name="lang" select="@xml:lang"/>
+ <xsl:with-param name="see" select="@see" />
+ <xsl:with-param name="space" select="@xml:space" />
+
+ <!-- "Linking" properties -->
+ <xsl:with-param name="role" select="@role" />
+ <xsl:with-param name="subject" select="@subject" />
+ </xsl:call-template>
+
+ </axsl:otherwise>
+ </axsl:choose>
+ </xsl:template>
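+ <!-- Illustration (hypothetical schema fragment): <assert test="count(item) > 0">No items found</assert>
+      compiles into an xsl:choose whose empty xsl:when fires when the test succeeds, and whose
+      xsl:otherwise calls process-assert to report the failed assertion. -->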
+ <xsl:template match="iso:report">
+
+ <xsl:if test="not(@test)">
+ <xsl:message>Markup Error: no test attribute in &lt;report></xsl:message>
+ </xsl:if>
+
+ <xsl:text>&#10;&#10; </xsl:text>
+ <xsl:comment>REPORT <xsl:value-of select="@role" /> </xsl:comment><xsl:text>&#10;</xsl:text>
+
+ <axsl:if test="{@test}">
+
+ <xsl:call-template name="process-report">
+ <xsl:with-param name="test" select="normalize-space(@test)" />
+ <xsl:with-param name="diagnostics" select="@diagnostics"/>
+ <xsl:with-param name="flag" select="@flag"/>
+
+ <!-- "Rich" properties -->
+ <xsl:with-param name="fpi" select="@fpi"/>
+ <xsl:with-param name="icon" select="@icon"/>
+ <xsl:with-param name="id" select="@id"/>
+ <xsl:with-param name="lang" select="@xml:lang"/>
+ <xsl:with-param name="see" select="@see" />
+ <xsl:with-param name="space" select="@xml:space" />
+
+ <!-- "Linking" properties -->
+ <xsl:with-param name="role" select="@role" />
+ <xsl:with-param name="subject" select="@subject" />
+ </xsl:call-template>
+
+ </axsl:if>
+ </xsl:template>
+
+
+ <!-- ISO DIAGNOSTIC -->
+ <!-- We use a mode here to maintain backwards compatibility, instead of adding it
+ to the other mode.
+ -->
+ <xsl:template match="iso:diagnostic" mode="check-diagnostics">
+ <xsl:if test="not(@id)">
+ <xsl:message>Markup Error: no id attribute in &lt;diagnostic></xsl:message>
+ </xsl:if>
+ </xsl:template>
+
+ <xsl:template match="iso:diagnostic" >
+ <xsl:call-template name="process-diagnostic">
+
+ <!-- "Rich" properties -->
+ <xsl:with-param name="fpi" select="@fpi"/>
+ <xsl:with-param name="icon" select="@icon"/>
+ <xsl:with-param name="id" select="@id"/>
+ <xsl:with-param name="lang" select="@xml:lang"/>
+ <xsl:with-param name="see" select="@see" />
+ <xsl:with-param name="space" select="@xml:space" />
+ </xsl:call-template>
+ </xsl:template>
+
+ <!-- ISO DIAGNOSTICS -->
+ <xsl:template match="iso:diagnostics" >
+ <xsl:apply-templates mode="check-diagnostics" select="*" />
+ </xsl:template>
+
+ <!-- ISO DIR -->
+ <xsl:template match="iso:dir" mode="text" >
+ <xsl:call-template name="process-dir">
+ <xsl:with-param name="value" select="@value"/>
+ </xsl:call-template>
+ </xsl:template>
+
+ <!-- ISO EMPH -->
+ <xsl:template match="iso:emph" mode="text">
+
+ <xsl:call-template name="process-emph"/>
+
+ </xsl:template>
+
+ <!-- ISO EXTENDS -->
+ <xsl:template match="iso:extends">
+ <xsl:if test="not(@rule)">
+ <xsl:message>Markup Error: no rule attribute in &lt;extends></xsl:message>
+ </xsl:if>
+ <xsl:if test="not(//iso:rule[@abstract='true'][@id= current()/@rule] )">
+ <xsl:message>Reference Error: the abstract rule "<xsl:value-of select="@rule"
+ />" has been referenced but is not declared</xsl:message>
+ </xsl:if>
+ <xsl:call-template name="IamEmpty" />
+
+ <xsl:if test="//iso:rule[@id=current()/@rule]">
+ <xsl:apply-templates select="//iso:rule[@id=current()/@rule]"
+ mode="extends"/>
+ </xsl:if>
+
+ </xsl:template>
+
+ <!-- KEY: ISO has no KEY -->
+ <!-- NOTE:
+ Key has had a checkered history. Schematron 1.0 allowed it in certain places, but
+ users came up with a different location, which has now been adopted.
+
+ XT, the early XSLT processor, did not implement key and died when it was present.
+ So there are some versions of the Schematron skeleton for XT that strip out all
+ key elements.
+
+ Xalan (e.g. Xalan4C 1.0 and a Xalan4J) also had a quirk. A fix involved making
+ a top-level parameter called $hiddenKey and then using that instead of matching
+ "key". This has been removed.
+ -->
+ <xsl:template match="xsl:key" mode="do-keys" >
+ <xsl:if test="not(@name)">
+ <xsl:message>Markup Error: no name attribute in &lt;key></xsl:message>
+ </xsl:if>
+ <xsl:if test="not(@path) and not(@use)">
+ <xsl:message>Markup Error: no path or use attribute in &lt;key></xsl:message>
+ </xsl:if>
+ <xsl:choose>
+ <xsl:when test="parent::iso:rule ">
+ <xsl:call-template name="IamEmpty" />
+ <xsl:choose>
+ <xsl:when test="@path">
+ <axsl:key match="{../@context}" name="{@name}" use="{@path}"/>
+ </xsl:when>
+ <xsl:otherwise>
+ <axsl:key match="{../@context}" name="{@name}" use="{@use}"/>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:if test="not(@match) ">
+ <xsl:message>Markup Error: no match attribute in &lt;key></xsl:message>
+ </xsl:if>
+ <axsl:key>
+ <xsl:copy-of select="@*"/>
+ </axsl:key>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:template>
+
+ <xsl:template match="xsl:key " /><!-- swallow -->
+
+ <xsl:template match="iso:key " >
+ <xsl:message>Schema error: The key element is not in the ISO Schematron namespace. Use the XSLT namespace.</xsl:message>
+ </xsl:template>
+
+ <!-- ISO INCLUDE -->
+ <!-- This is only a fallback. Include really needs to have been done before this as a separate pass.-->
+
+ <xsl:template match="iso:include[not(normalize-space(@href))]"
+ priority="1">
+ <xsl:if test=" $debug = 'false' ">
+ <xsl:message terminate="yes">Schema error: Empty href= attribute for include directive.</xsl:message>
+ </xsl:if>
+
+ </xsl:template>
+
+ <!-- Extend the URI syntax to allow # references -->
+ <!-- Add experimental support for simple containers like /xxx:xxx/iso:pattern to allow better includes -->
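+ <!-- Illustration (hypothetical URIs): <include href="shared-rules.sch#name-checks"/> pulls in
+      only the Schematron element whose @id is "name-checks" from shared-rules.sch, while
+      <include href="shared-rules.sch"/> pulls in the document element (or its Schematron
+      children when the document element is merely a container). -->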
+ <xsl:template match="iso:include">
+ <xsl:variable name="document-uri" select="substring-before(concat(@href,'#'), '#')"/>
+ <xsl:variable name="fragment-id" select="substring-after(@href, '#')"/>
+
+ <xsl:choose>
+
+ <xsl:when test="string-length( $document-uri ) = 0 and string-length( $fragment-id ) = 0" >
+ <xsl:message>Error: Impossible URL in Schematron include</xsl:message>
+ </xsl:when>
+
+ <xsl:when test="string-length( $fragment-id ) &gt; 0">
+ <xsl:variable name="theDocument_1" select="document( $document-uri,/ )" />
+ <xsl:variable name="theFragment_1" select="$theDocument_1//iso:*[@id= $fragment-id ]" />
+ <xsl:if test=" $theFragment_1/self::iso:schema ">
+ <xsl:message>Schema error: Use include to include fragments, not a whole schema</xsl:message>
+ </xsl:if>
+ <xsl:apply-templates select=" $theFragment_1"/>
+ </xsl:when>
+
+ <xsl:otherwise>
+ <xsl:variable name="theDocument_2" select="document( $document-uri,/ )" />
+ <xsl:variable name="theFragment_2" select="$theDocument_2/iso:*" />
+ <xsl:variable name="theContainedFragments" select="$theDocument_2/*/iso:*" />
+ <xsl:if test=" $theFragment_2/self::iso:schema or $theContainedFragments/self::iso:schema">
+ <xsl:message>Schema error: Use include to include fragments, not a whole schema</xsl:message>
+ </xsl:if>
+ <xsl:apply-templates select="$theFragment_2 | $theContainedFragments "/>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:template>
+
+ <!-- This is to handle the particular case of including patterns -->
+ <xsl:template match="iso:include" mode="do-all-patterns">
+ <xsl:variable name="document-uri" select="substring-before(concat(@href,'#'), '#')"/>
+ <xsl:variable name="fragment-id" select="substring-after(@href, '#')"/>
+
+ <xsl:choose>
+
+ <xsl:when test="string-length( $document-uri ) = 0 and string-length( $fragment-id ) = 0" >
+ <xsl:message>Error: Impossible URL in Schematron include</xsl:message>
+ </xsl:when>
+
+ <xsl:when test="string-length( $fragment-id ) &gt; 0">
+ <xsl:variable name="theDocument_1" select="document( $document-uri,/ )" />
+ <xsl:variable name="theFragment_1" select="$theDocument_1//iso:*[@id= $fragment-id ]" />
+ <xsl:if test=" $theFragment_1/self::iso:schema ">
+ <xsl:message>Schema error: Use include to include fragments, not a whole schema</xsl:message>
+ </xsl:if>
+ <xsl:apply-templates select=" $theFragment_1" mode="do-all-patterns"/>
+ </xsl:when>
+
+ <xsl:otherwise>
+ <!-- Import the top-level element if it is in schematron namespace,
+ or its children otherwise, to allow a simple containment mechanism. -->
+ <xsl:variable name="theDocument_2" select="document( $document-uri,/ )" />
+ <xsl:variable name="theFragment_2" select="$theDocument_2/iso:*" />
+ <xsl:variable name="theContainedFragments" select="$theDocument_2/*/iso:*" />
+ <xsl:if test=" $theFragment_2/self::iso:schema or $theContainedFragments/self::iso:schema">
+ <xsl:message>Schema error: Use include to include fragments, not a whole schema</xsl:message>
+ </xsl:if>
+ <xsl:apply-templates select="$theFragment_2 | $theContainedFragments "
+ mode="do-all-patterns" />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:template>
+
+ <!-- ISO LET -->
+ <xsl:template match="iso:let" >
+ <xsl:if test="ancestor::iso:schema[@queryBinding='xpath']">
+ <xsl:message>Warning: Variables should not be used with the "xpath" query language binding.</xsl:message>
+ </xsl:if>
+
+ <!-- lets at the top-level are implemented as parameters -->
+
+ <xsl:choose>
+ <xsl:when test="parent::iso:schema">
+ <!-- it is an error to have an empty param/@select because an XPath is expected -->
+ <axsl:param name="{@name}">
+ <xsl:if test="string-length(@value) &gt; 0">
+ <xsl:attribute name="select"><xsl:value-of select="@value"/></xsl:attribute>
+ </xsl:if>
+ </axsl:param>
+ </xsl:when>
+ <xsl:otherwise>
+ <axsl:variable name="{@name}" select="{@value}"/>
+ </xsl:otherwise>
+ </xsl:choose>
+
+ </xsl:template>
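+ <!-- Illustration (hypothetical names): <let name="maxLen" value="20"/> directly under <schema>
+      becomes <xsl:param name="maxLen" select="20"/> in the generated stylesheet, so it can be
+      overridden at invocation time, whereas the same <let> inside a rule becomes
+      <xsl:variable name="maxLen" select="20"/>. -->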
+
+ <!-- ISO NAME -->
+ <xsl:template match="iso:name" mode="text">
+
+ <xsl:if test="@path">
+ <xsl:call-template name="process-name">
+ <xsl:with-param name="name" select="concat('name(',@path,')')"/>
+ </xsl:call-template>
+ </xsl:if>
+ <xsl:if test="not(@path)">
+ <xsl:call-template name="process-name">
+ <xsl:with-param name="name" select="'name(.)'"/>
+ </xsl:call-template>
+ </xsl:if>
+ <xsl:call-template name="IamEmpty" />
+ </xsl:template>
+
+ <!-- ISO NS -->
+ <!-- Namespace handling in XSLT is quite tricky and implementation dependent -->
+ <xsl:template match="iso:ns">
+ <xsl:call-template name="handle-namespace" />
+ </xsl:template>
+
+ <!-- This template is just to provide the API hook -->
+ <xsl:template match="iso:ns" mode="do-all-patterns" >
+ <xsl:if test="not(@uri)">
+ <xsl:message>Markup Error: no uri attribute in &lt;ns></xsl:message>
+ </xsl:if>
+ <xsl:if test="not(@prefix)">
+ <xsl:message>Markup Error: no prefix attribute in &lt;ns></xsl:message>
+ </xsl:if>
+ <xsl:call-template name="IamEmpty" />
+ <xsl:call-template name="process-ns" >
+ <xsl:with-param name="prefix" select="@prefix"/>
+ <xsl:with-param name="uri" select="@uri"/>
+ </xsl:call-template>
+ </xsl:template>
+
+ <!-- ISO P -->
+ <xsl:template match="iso:schema/iso:p " mode="do-schema-p" >
+ <xsl:call-template name="process-p">
+ <xsl:with-param name="class" select="@class"/>
+ <xsl:with-param name="icon" select="@icon"/>
+ <xsl:with-param name="id" select="@id"/>
+ <xsl:with-param name="lang" select="@xml:lang"/>
+ </xsl:call-template>
+ </xsl:template>
+ <xsl:template match="iso:pattern/iso:p " mode="do-pattern-p" >
+ <xsl:call-template name="process-p">
+ <xsl:with-param name="class" select="@class"/>
+ <xsl:with-param name="icon" select="@icon"/>
+ <xsl:with-param name="id" select="@id"/>
+ <xsl:with-param name="lang" select="@xml:lang"/>
+ </xsl:call-template>
+ </xsl:template>
+
+ <!-- Currently, iso:p elements in other positions are not passed through to the API -->
+ <xsl:template match="iso:phase/iso:p" />
+ <xsl:template match="iso:p " priority="-1" />
+
+ <!-- ISO PATTERN -->
+ <xsl:template match="iso:pattern" mode="do-all-patterns">
+ <xsl:if test="($phase = '#ALL')
+ or (../iso:phase[@id= $phase]/iso:active[@pattern= current()/@id])">
+ <xsl:call-template name="process-pattern">
+ <!-- the following select statement assumes that
+ @id | iso:title returns a node-set in document order:
+ we want the title if it is there, otherwise the @id attribute -->
+ <xsl:with-param name="name" select="(@id | iso:title )[last()]"/>
+ <xsl:with-param name="is-a" select="''"/>
+
+ <!-- "Rich" properties -->
+ <xsl:with-param name="fpi" select="@fpi"/>
+ <xsl:with-param name="icon" select="@icon"/>
+ <xsl:with-param name="id" select="@id"/>
+ <xsl:with-param name="lang" select="@xml:lang"/>
+ <xsl:with-param name="see" select="@see" />
+ <xsl:with-param name="space" select="@xml:space" />
+ </xsl:call-template>
+ <xsl:choose>
+ <xsl:when test="$select-contexts='key'">
+ <axsl:apply-templates select="key('M','M{count(preceding-sibling::*)}')" mode="M{count(preceding-sibling::*)}"/>
+ </xsl:when>
+ <xsl:when test="$select-contexts='//'">
+ <axsl:apply-templates mode="M{count(preceding-sibling::*)}">
+ <xsl:attribute name="select">
+ <xsl:text>//(</xsl:text>
+ <xsl:for-each select="iso:rule/@context">
+ <xsl:text>(</xsl:text>
+ <xsl:value-of select="."/>
+ <xsl:text>)</xsl:text>
+ <xsl:if test="position()!=last()">|</xsl:if>
+ </xsl:for-each>
+ <xsl:text>)</xsl:text>
+ <xsl:if test="$visit-text='false'">[not(self::text())]</xsl:if>
+ </xsl:attribute>
+ </axsl:apply-templates>
+ </xsl:when>
+ <xsl:otherwise>
+ <axsl:apply-templates select="/" mode="M{count(preceding-sibling::*)}"/>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:if>
+ </xsl:template>
+
+ <xsl:template match="iso:pattern[@abstract='true']">
+
+ <xsl:message>Schema implementation error: This schema has abstract patterns, yet they are supposed to be preprocessed out already
+ </xsl:message>
+ </xsl:template>
+
+ <!-- Here is the template for the normal case of patterns -->
+ <xsl:template match="iso:pattern[not(@abstract='true')]">
+
+ <xsl:if test="($phase = '#ALL')
+ or (../iso:phase[@id= $phase]/iso:active[@pattern= current()/@id])">
+
+ <xsl:text>&#10;&#10;</xsl:text>
+ <xsl:comment>PATTERN <xsl:value-of select="@id" /> <xsl:value-of select="iso:title" /> </xsl:comment><xsl:text>&#10;</xsl:text>
+ <xsl:apply-templates />
+
+ <!-- DPC select-contexts test -->
+ <xsl:if test="not($select-contexts)">
+ <axsl:template match="text()" priority="-1" mode="M{count(preceding-sibling::*)}">
+ <!-- strip characters -->
+ </axsl:template>
+
+ <!-- DPC introduce context-xpath variable -->
+ <axsl:template match="@*|node()"
+ priority="-2"
+ mode="M{ count(preceding-sibling::*) }">
+ <axsl:apply-templates select="{$context-xpath}" mode="M{count(preceding-sibling::*)}"/>
+ </axsl:template>
+ </xsl:if>
+ </xsl:if>
+ </xsl:template>
+
+ <!-- ISO PHASE -->
+ <xsl:template match="iso:phase" >
+ <xsl:if test="not(@id)">
+ <xsl:message>Markup Error: no id attribute in &lt;phase></xsl:message>
+ </xsl:if>
+ <xsl:apply-templates/>
+ </xsl:template>
+
+ <!-- ISO RULE -->
+ <xsl:template match="iso:rule[not(@abstract='true')] ">
+ <xsl:if test="not(@context)">
+ <xsl:message>Markup Error: no context attribute in &lt;rule></xsl:message>
+ </xsl:if>
+ <xsl:text>&#10;&#10; </xsl:text>
+ <xsl:comment>RULE <xsl:value-of select="@id" /> </xsl:comment><xsl:text>&#10;</xsl:text>
+ <xsl:if test="iso:title">
+ <xsl:comment><xsl:value-of select="iso:title" /></xsl:comment>
+ </xsl:if>
+ <!-- DPC select-contexts -->
+ <xsl:if test="$select-contexts='key'">
+ <axsl:key name="M"
+ match="{@context}"
+ use="'M{count(../preceding-sibling::*)}'"/>
+ </xsl:if>
+
+
+<!-- DPC priorities count up from 1000 not down from 4000 (templates in same priority order as before) -->
+ <axsl:template match="{@context}"
+ priority="{1000 + count(following-sibling::*)}" mode="M{count(../preceding-sibling::*)}">
+ <xsl:call-template name="process-rule">
+ <xsl:with-param name="context" select="@context"/>
+
+ <!-- "Rich" properties -->
+ <xsl:with-param name="fpi" select="@fpi"/>
+ <xsl:with-param name="icon" select="@icon"/>
+ <xsl:with-param name="id" select="@id"/>
+ <xsl:with-param name="lang" select="@xml:lang"/>
+ <xsl:with-param name="see" select="@see" />
+ <xsl:with-param name="space" select="@xml:space" />
+
+ <!-- "Linking" properties -->
+ <xsl:with-param name="role" select="@role" />
+ <xsl:with-param name="subject" select="@subject" />
+ </xsl:call-template>
+ <xsl:apply-templates/>
+ <!-- DPC introduce context-xpath and select-contexts variables -->
+ <xsl:if test="not($select-contexts)">
+ <axsl:apply-templates select="{$context-xpath}" mode="M{count(../preceding-sibling::*)}"/>
+ </xsl:if>
+ </axsl:template>
+ </xsl:template>
+
+
+ <!-- ISO ABSTRACT RULE -->
+ <xsl:template match="iso:rule[@abstract='true'] " >
+ <xsl:if test=" not(@id)">
+ <xsl:message>Markup Error: no id attribute on abstract &lt;rule></xsl:message>
+ </xsl:if>
+ <xsl:if test="@context">
+ <xsl:message>Markup Error: (2) context attribute on abstract &lt;rule></xsl:message>
+ </xsl:if>
+ </xsl:template>
+
+ <xsl:template match="iso:rule[@abstract='true']"
+ mode="extends" >
+ <xsl:if test="@context">
+ <xsl:message>Markup Error: context attribute on abstract &lt;rule></xsl:message>
+ </xsl:if>
+ <xsl:apply-templates/>
+ </xsl:template>
+
+ <!-- ISO SPAN -->
+ <xsl:template match="iso:span" mode="text">
+ <xsl:call-template name="process-span">
+ <xsl:with-param name="class" select="@class"/>
+ </xsl:call-template>
+ </xsl:template>
+
+ <!-- ISO TITLE -->
+
+ <xsl:template match="iso:schema/iso:title" priority="1">
+ <xsl:call-template name="process-schema-title" />
+ </xsl:template>
+
+
+ <xsl:template match="iso:title" >
+ <xsl:call-template name="process-title" />
+ </xsl:template>
+
+
+ <!-- ISO VALUE-OF -->
+ <xsl:template match="iso:value-of" mode="text" >
+ <xsl:if test="not(@select)">
+ <xsl:message>Markup Error: no select attribute in &lt;value-of></xsl:message>
+ </xsl:if>
+ <xsl:call-template name="IamEmpty" />
+
+ <xsl:choose>
+ <xsl:when test="@select">
+ <xsl:call-template name="process-value-of">
+ <xsl:with-param name="select" select="@select"/>
+ </xsl:call-template>
+ </xsl:when>
+ <xsl:otherwise >
+ <xsl:call-template name="process-value-of">
+ <xsl:with-param name="select" select="'.'"/>
+ </xsl:call-template>
+ </xsl:otherwise>
+ </xsl:choose>
+
+ </xsl:template>
+
+
+<!-- ============================================================== -->
+<!-- DEFAULT TEXT HANDLING -->
+<!-- ============================================================== -->
+ <xsl:template match="text()" priority="-1" mode="do-keys">
+ <!-- strip characters -->
+ </xsl:template>
+ <xsl:template match="text()" priority="-1" mode="do-all-patterns">
+ <!-- strip characters -->
+ </xsl:template>
+ <xsl:template match="text()" priority="-1" mode="do-schema-p">
+ <!-- strip characters -->
+ </xsl:template>
+ <xsl:template match="text()" priority="-1" mode="do-pattern-p">
+ <!-- strip characters -->
+ </xsl:template>
+
+ <xsl:template match="text()" priority="-1">
+ <!-- Strip characters -->
+ </xsl:template>
+
+ <xsl:template match="text()" mode="text">
+ <xsl:value-of select="."/>
+ </xsl:template>
+
+ <xsl:template match="text()" mode="inline-text">
+ <xsl:value-of select="."/>
+ </xsl:template>
+
+<!-- ============================================================== -->
+<!-- UTILITY TEMPLATES -->
+<!-- ============================================================== -->
+<xsl:template name="IamEmpty">
+ <xsl:if test="count( * )">
+ <xsl:message>
+ <xsl:text>Warning: </xsl:text>
+ <xsl:value-of select="name(.)"/>
+ <xsl:text> must not contain any child elements</xsl:text>
+ </xsl:message>
+ </xsl:if>
+</xsl:template>
+
+<xsl:template name="diagnosticsSplit">
+ <!-- Process, at the current point, the first of the <diagnostic> elements
+ referred to by the parameter str, and then recurse -->
+ <xsl:param name="str"/>
+ <xsl:variable name="start">
+ <xsl:choose>
+ <xsl:when test="contains($str,' ')">
+ <xsl:value-of select="substring-before($str,' ')"/>
+ </xsl:when>
+ <xsl:otherwise><xsl:value-of select="$str"/></xsl:otherwise>
+ </xsl:choose>
+ </xsl:variable>
+
+ <xsl:variable name="end">
+ <xsl:if test="contains($str,' ')">
+ <xsl:value-of select="substring-after($str,' ')"/>
+ </xsl:if>
+ </xsl:variable>
+
+ <!-- This works with all namespaces -->
+ <xsl:if test="not(string-length(normalize-space($start)) = 0)
+ and not(//iso:diagnostic[@id = $start])
+ and not(//sch:diagnostic[@id = $start])
+ and not(//diagnostic[@id = $start])">
+ <xsl:message>Reference error: A diagnostic "<xsl:value-of select="string($start)"
+ />" has been referenced but is not declared</xsl:message>
+ </xsl:if>
+
+ <xsl:if test="string-length(normalize-space($start)) > 0">
+ <xsl:text> </xsl:text>
+ <xsl:apply-templates
+ select="//iso:diagnostic[@id = $start ]
+ | //sch:diagnostic[@id = $start ]
+ | //diagnostic[@id= $start ]"/>
+ </xsl:if>
+
+ <xsl:if test="not($end='')">
+ <xsl:call-template name="diagnosticsSplit">
+ <xsl:with-param name="str" select="$end"/>
+ </xsl:call-template>
+ </xsl:if>
+</xsl:template>
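+<!-- Illustration (hypothetical ids): calling diagnosticsSplit with str="d1 d2" first processes the
+     <diagnostic> element whose @id is "d1", then recurses on "d2"; an id with no matching
+     declaration triggers the Reference error message above. -->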
+
+<!-- It would be nice to use this but xsl:namespace does not
+ allow a fallback -->
+<!--xsl:template name="handle-namespace" version="2.0">
+ <xsl:namespace name="{@prefix}" select="@uri">
+</xsl:template-->
+
+<xsl:template name="handle-namespace">
+ <!-- experimental code from http://eccnet.eccnet.com/pipermail/schematron-love-in/2006-June/000104.html -->
+ <!-- Handle namespaces differently for exslt systems, msxml, and default, only using XSLT1 syntax -->
+ <!-- For more info see http://fgeorges.blogspot.com/2007/01/creating-namespace-nodes-in-xslt-10.html -->
+ <xsl:choose>
+ <!-- The following code works for XSLT1 -->
+ <xsl:when test="function-available('exsl:node-set')">
+ <xsl:variable name="ns-dummy-elements">
+ <xsl:element name="{@prefix}:dummy" namespace="{@uri}"/>
+ </xsl:variable>
+ <xsl:variable name="p" select="@prefix"/>
+ <xsl:copy-of select="exsl:node-set($ns-dummy-elements)
+ /*/namespace::*[local-name()=$p]"/>
+ </xsl:when>
+
+ <!-- End XSLT1 code -->
+
+ <!-- Not tested yet
+ <xsl:when test="function-available('msxsl:node-set')">
+ <xsl:variable name="ns-dummy-elements">
+ <xsl:element name="{ $prefix }:e" namespace="{ $uri }"/>
+ </xsl:variable>
+ <xsl:copy-of select="msxsl:node-set($ns-dummy-elements)/*/namespace::*"/>
+ </xsl:when>
+ -->
+
+ <xsl:when test="@prefix = 'xsl' ">
+ <!-- Do not generate dummy attributes with the xsl: prefix, as these
+ are errors against XSLT, because we presume that the output
+ stylesheet uses the xsl prefix. In any case, there would already
+ be a namespace declaration for the XSLT namespace generated
+ automatically, presumably using "xsl:".
+ -->
+ </xsl:when>
+
+ <xsl:when test="@uri = 'http://www.w3.org/1999/XSL/Transform'">
+ <xsl:message terminate="yes">
+ <xsl:text>Using the XSLT namespace with a prefix other than "xsl" in </xsl:text>
+ <xsl:text>Schematron rules is not supported </xsl:text>
+ <xsl:text>in this processor: </xsl:text>
+ <xsl:value-of select="system-property('xsl:vendor')"/>
+ </xsl:message>
+ </xsl:when>
+
+ <xsl:otherwise>
+ <xsl:attribute name="{concat(@prefix,':dummy-for-xmlns')}" namespace="{@uri}" />
+
+ </xsl:otherwise>
+ </xsl:choose>
+
+
+</xsl:template>
+
+<!-- ============================================================== -->
+<!-- UNEXPECTED ELEMENTS -->
+<!-- ============================================================== -->
+
+ <xsl:template match="iso:*" priority="-2">
+ <xsl:message>
+ <xsl:text>Error: unrecognized element in ISO Schematron namespace: </xsl:text>
+ <xsl:value-of select="name(.)"/>
+ <xsl:text> (check spelling and capitalization)</xsl:text>
+ </xsl:message>
+ </xsl:template>
+
+
+ <!-- Swallow old namespace elements: there is an upfront test for them elsewhere -->
+ <xsl:template match="sch:*" priority="-2" />
+
+ <xsl:template match="*" priority="-3">
+ <xsl:choose>
+ <xsl:when test=" $allow-foreign = 'false' ">
+ <xsl:message>
+ <xsl:text>Warning: unrecognized element </xsl:text>
+ <xsl:value-of select="name(.)"/>
+ </xsl:message>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:copy-of select="." />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:template>
+
+ <xsl:template match="iso:*" mode="text" priority="-2" />
+ <xsl:template match="*" mode="text" priority="-3">
+ <xsl:choose>
+ <xsl:when test=" $allow-foreign = 'false' ">
+ <xsl:message>
+ <xsl:text>Warning: unrecognized element </xsl:text>
+ <xsl:value-of select="name(.)"/>
+ </xsl:message>
+ </xsl:when>
+ <xsl:otherwise>
+ <xsl:copy-of select="." />
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:template>
+
+<!-- ============================================================== -->
+<!-- DEFAULT NAMED TEMPLATES -->
+<!-- These are the actions that are performed unless overridden -->
+<!-- ============================================================== -->
+
+ <xsl:template name="process-prolog"/>
+ <!-- no params -->
+
+ <xsl:template name="process-root">
+ <xsl:param name="contents"/>
+ <xsl:param name="id" />
+ <xsl:param name="version" />
+ <xsl:param name="schemaVersion" />
+ <xsl:param name="queryBinding" />
+ <xsl:param name="title" />
+
+
+ <!-- "Rich" parameters -->
+ <xsl:param name="fpi" />
+ <xsl:param name="icon" />
+ <xsl:param name="lang" />
+ <xsl:param name="see" />
+ <xsl:param name="space" />
+
+ <xsl:copy-of select="$contents"/>
+ </xsl:template>
+
+ <xsl:template name="process-assert">
+
+ <xsl:param name="test"/>
+ <xsl:param name="diagnostics" />
+ <xsl:param name="id" />
+ <xsl:param name="flag" />
+
+ <!-- "Linkable" parameters -->
+ <xsl:param name="role"/>
+ <xsl:param name="subject"/>
+
+ <!-- "Rich" parameters -->
+ <xsl:param name="fpi" />
+ <xsl:param name="icon" />
+ <xsl:param name="lang" />
+ <xsl:param name="see" />
+ <xsl:param name="space" />
+
+
+ <xsl:call-template name="process-message">
+ <xsl:with-param name="pattern" select="$test"/>
+ <xsl:with-param name="role" select="$role"/>
+ </xsl:call-template>
+
+
+ </xsl:template>
+
+ <xsl:template name="process-report">
+ <xsl:param name="test"/>
+ <xsl:param name="diagnostics" />
+ <xsl:param name="id" />
+ <xsl:param name="flag" />
+
+ <!-- "Linkable" parameters -->
+ <xsl:param name="role"/>
+ <xsl:param name="subject"/>
+
+ <!-- "Rich" parameters -->
+ <xsl:param name="fpi" />
+ <xsl:param name="icon" />
+ <xsl:param name="lang" />
+ <xsl:param name="see" />
+ <xsl:param name="space" />
+
+ <xsl:call-template name="process-message">
+ <xsl:with-param name="pattern" select="$test"/>
+ <xsl:with-param name="role" select="$role"/>
+ </xsl:call-template>
+ </xsl:template>
+
+ <xsl:template name="process-diagnostic">
+ <xsl:param name="id" />
+
+ <!-- "Rich" parameters -->
+ <xsl:param name="fpi" />
+ <xsl:param name="icon" />
+ <xsl:param name="lang" />
+ <xsl:param name="see" />
+ <xsl:param name="space" />
+
+ <!-- We generate too much whitespace rather than risking concatenation -->
+ <axsl:text> </axsl:text>
+ <xsl:apply-templates mode="text"/>
+ <axsl:text> </axsl:text>
+ </xsl:template>
+
+ <xsl:template name="process-dir">
+ <xsl:param name="value" />
+
+ <!-- We generate too much whitespace rather than risking concatenation -->
+ <axsl:text> </axsl:text>
+ <xsl:apply-templates mode="inline-text"/>
+ <axsl:text> </axsl:text>
+ </xsl:template>
+
+ <xsl:template name="process-emph">
+ <!-- We generate too much whitespace rather than risking concatenation -->
+ <axsl:text> </axsl:text>
+ <xsl:apply-templates mode="inline-text"/>
+ <axsl:text> </axsl:text>
+ </xsl:template>
+
+ <xsl:template name="process-name">
+ <xsl:param name="name"/>
+
+ <!-- We generate too much whitespace rather than risking concatenation -->
+ <axsl:text> </axsl:text>
+ <axsl:value-of select="{$name}"/>
+ <axsl:text> </axsl:text>
+
+ </xsl:template>
+
+ <xsl:template name="process-ns" >
+ <!-- Note that process-ns is for reporting. The sch:ns elements are
+ independently used in the sch:schema template to provide namespace bindings -->
+ <xsl:param name="prefix"/>
+ <xsl:param name="uri" />
+ </xsl:template>
+
+ <xsl:template name="process-p">
+ <xsl:param name="id" />
+ <xsl:param name="class" />
+ <xsl:param name="icon" />
+ <xsl:param name="lang" />
+ </xsl:template>
+
+ <xsl:template name="process-pattern">
+ <xsl:param name="id" />
+ <xsl:param name="name" />
+ <xsl:param name="is-a" />
+
+ <!-- "Rich" parameters -->
+ <xsl:param name="fpi" />
+ <xsl:param name="icon" />
+ <xsl:param name="lang" />
+ <xsl:param name="see" />
+ <xsl:param name="space" />
+ </xsl:template>
+
+
+ <xsl:template name="process-rule">
+ <xsl:param name="context" />
+
+ <xsl:param name="id" />
+ <xsl:param name="flag" />
+
+ <!-- "Linkable" parameters -->
+ <xsl:param name="role"/>
+ <xsl:param name="subject"/>
+
+ <!-- "Rich" parameters -->
+ <xsl:param name="fpi" />
+ <xsl:param name="icon" />
+ <xsl:param name="lang" />
+ <xsl:param name="see" />
+ <xsl:param name="space" />
+ </xsl:template>
+
+ <xsl:template name="process-span" >
+ <xsl:param name="class" />
+
+ <!-- We generate too much whitespace rather than risking concatenation -->
+ <axsl:text> </axsl:text>
+ <xsl:apply-templates mode="inline-text"/>
+ <axsl:text> </axsl:text>
+ </xsl:template>
+
+ <xsl:template name="process-title" >
+ <xsl:param name="class" />
+ <xsl:call-template name="process-p">
+ <xsl:with-param name="class">title</xsl:with-param>
+ </xsl:call-template>
+ </xsl:template>
+
+ <xsl:template name="process-schema-title" >
+ <xsl:param name="class" />
+ <xsl:call-template name="process-title">
+ <xsl:with-param name="class">schema-title</xsl:with-param>
+ </xsl:call-template>
+ </xsl:template>
+
+ <xsl:template name="process-value-of">
+ <xsl:param name="select"/>
+
+ <!-- We generate too much whitespace rather than risking concatenation -->
+ <axsl:text> </axsl:text>
+ <axsl:value-of select="{$select}"/>
+ <axsl:text> </axsl:text>
+ </xsl:template>
+
+ <!-- default output action: the simplest customization is to just override this -->
+ <xsl:template name="process-message">
+ <xsl:param name="pattern" />
+ <xsl:param name="role" />
+
+ <xsl:apply-templates mode="text"/>
+ <xsl:if test=" $message-newline = 'true'" >
+ <axsl:value-of select="string('&#10;')"/>
+ </xsl:if>
+
+ </xsl:template>
+</xsl:stylesheet>
+
+
+
diff --git a/src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_svrl_for_xslt1.xsl b/src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_svrl_for_xslt1.xsl
new file mode 100644
index 0000000..dae74ff
--- /dev/null
+++ b/src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/iso_svrl_for_xslt1.xsl
@@ -0,0 +1,588 @@
+<?xml version="1.0" ?>
+<!--
+ ISO_SVRL.xsl
+
+ Implementation of Schematron Validation Report Language from ISO Schematron
+ ISO/IEC 19757 Document Schema Definition Languages (DSDL)
+ Part 3: Rule-based validation Schematron
+ Annex D: Schematron Validation Report Language
+
+ This ISO Standard is available free as a Publicly Available Specification in PDF from ISO.
+ Also see www.schematron.com for drafts and other information.
+
+ This implementation of SVRL is designed to run with the "Skeleton" implementation
+ of Schematron which Oliver Becker devised. The skeleton code provides a
+ Schematron implementation but with named templates for handling all output;
+ the skeleton provides basic templates for output using this API, but client
+ validators can be written to import the skeleton and override the default output
+ templates as required. (In order to understand this, you must understand that
+ a named template such as "process-assert" in this XSLT stylesheet overrides and
+ replaces any template with the same name in the imported skeleton XSLT file.)
+
+ The other important thing to understand in this code is that there are different
+ versions of the Schematron skeleton. These track the development of Schematron through
+ Schematron 1.5, Schematron 1.6 and now ISO Schematron. Only one skeleton must be
+ imported. The code has templates for the different skeletons commented out for
+ convenience. ISO Schematron has a different namespace than Schematron 1.5 and 1.6;
+ so the ISO Schematron skeleton has itself been written with an optional import
+ statement that in turn imports the Schematron 1.6 skeleton. This allows you to
+ validate with schemas from either namespace.
+
+
+ History:
+ 2009-03-18
+ * Fix attribute with space "see " which generates a wrong name in some processors
+ 2008-08-11
+ * RJ Fix attribute/@select which saxon allows in XSLT 1
+ 2008-08-07
+ * RJ Add output-encoding attribute to specify final encoding to use
+ * Alter allow-foreign functionality so that Schematron span, emph and dir elements make
+ it to the output, for better formatting and because span can be used to mark up
+ semantically interesting information embedded in diagnostics, which reduces the
+ need to extend SVRL itself
+ * Diagnostic-reference had an invalid attribute @id that duplicated @diagnostic: removed
+ 2008-08-06
+ * RJ Fix invalid output: svrl:diagnostic-reference is not contained in an svrl:text
+ * Output comment to SVRL file giving filename if available (from command-line parameter)
+ 2008-08-04
+ * RJ move sch: prefix to schold: prefix to prevent confusion (we want people to
+ be able to switch from old namespace to new namespace without changing the
+ sch: prefix, so it is better to keep that prefix completely out of the XSLT)
+ * Extra signature fixes (PH)
+ 2008-08-03
+ * Repair missing class parameter on process-p
+ 2008-07-31
+ * Update skeleton names
+ 2007-04-03
+ * Add option generate-fired-rule (RG)
+ 2007-02-07
+ * Prefer true|false for parameters. But allow yes|no on some older ones for compatibility
+ * DP Diagnostics output to svrl:text. Diagnostics are put out after the assertion text.
+ * Removed non-SVRL elements and attributes: better handled as an extra layer that invokes this one
+ * Add more formal parameters
+ * Correct confusion between $schemaVersion and $queryBinding
+ * Indent
+ * Validate against RNC schemas for XSLT 1 and 2 (with regex tests removed)
+ * Validate output with UniversalTest.sch against RNC schema for ISO SVRL
+
+ 2007-02-01
+ * DP. Update formal parameters of overriding named templates to handle more attributes.
+ * DP. Refactor handling of rich and linkable parameters to a named template.
+
+ 2007-01-22
+ * DP change svrl:ns to svrl:ns-in-attribute-value
+ * Change default when no queryBinding from "unknown" to "xslt"
+
+ 2007-01-18:
+ * Improve documentation
+ * KH Add command-line options to generate paths or not
+ * Use axsl:attribute rather than xsl:attribute to shut XSLT2 up
+ * Add extra command-line options to pass to the iso_schematron_skeleton
+
+ 2006-12-01: iso_svrl.xsl Rick Jelliffe,
+ * update namespace,
+ * update phase handling,
+ * add flag param to process-assert and process-report & @ flag on output
+
+ 2001: Conformance1-5.xsl Rick Jelliffe,
+ * Created, using the skeleton code contributed by Oliver Becker
+-->
+<!--
+ Derived from Conformance1-5.xsl.
+
+ Copyright (c) 2001, 2006 Rick Jelliffe and Academia Sinica Computing Center, Taiwan
+
+ This software is provided 'as-is', without any express or implied warranty.
+ In no event will the authors be held liable for any damages arising from
+ the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it freely,
+ subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not claim
+ that you wrote the original software. If you use this software in a product,
+ an acknowledgment in the product documentation would be appreciated but is
+ not required.
+
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+
+ 3. This notice may not be removed or altered from any source distribution.
+-->
+
+<!-- Ideas nabbed from schematrons by Francis N., Miloslav N. and David C. -->
+
+<!-- The command-line parameters are:
+ phase NMTOKEN | "#ALL" (default) Select the phase for validation
+ allow-foreign "true" | "false" (default) Pass non-Schematron elements and rich markup to the generated stylesheet
+ diagnose "true" (default) | "false" Add the diagnostics to the assertion test in reports (yes|no are obsolete)
+ generate-paths "true" (default) | "false" Generate the @location attribute with XPaths (yes|no are obsolete)
+ sch.exslt.imports semicolon-delimited string of filenames for some EXSLT implementations
+ optimize "visit-no-attributes" Use only when the schema has no attributes as the context nodes
+ generate-fired-rule "true" (default) | "false" Generate fired-rule elements
+
+-->
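+
+<!-- Illustrative invocations only (hypothetical file names, saxon-style param=value syntax):
+ generate a validator from a schema, with diagnostics and locations enabled:
+ saxon my-schema.sch iso_svrl_for_xslt1.xsl diagnose=true generate-paths=true > my-validator.xsl
+ then apply the generated validator to an instance document to obtain an SVRL report:
+ saxon my-instance.xml my-validator.xsl > my-report.svrl
+-->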
+
+<xsl:stylesheet
+ version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:xs="http://www.w3.org/2001/XMLSchema"
+ xmlns:axsl="http://www.w3.org/1999/XSL/TransformAlias"
+ xmlns:schold="http://www.ascc.net/xml/schematron"
+ xmlns:iso="http://purl.oclc.org/dsdl/schematron"
+ xmlns:svrl="http://purl.oclc.org/dsdl/svrl"
+>
+
+<!-- Select the import statement and adjust the path as
+ necessary for your system.
+ If not XSLT2 then also remove svrl:active-pattern/@document="{document-uri()}" from process-pattern()
+-->
+<!--
+<xsl:import href="iso_schematron_skeleton_for_saxon.xsl"/>
+-->
+
+<xsl:import href="iso_schematron_skeleton_for_xslt1.xsl"/>
+ <!--
+<xsl:import href="iso_schematron_skeleton.xsl"/>
+<xsl:import href="skeleton1-5.xsl"/>
+<xsl:import href="skeleton1-6.xsl"/>
+-->
+
+<xsl:param name="diagnose" >true</xsl:param>
+<xsl:param name="phase" >
+ <xsl:choose>
+ <!-- Handle Schematron 1.5 and 1.6 phases -->
+ <xsl:when test="//schold:schema/@defaultPhase">
+ <xsl:value-of select="//schold:schema/@defaultPhase"/>
+ </xsl:when>
+ <!-- Handle ISO Schematron phases -->
+ <xsl:when test="//iso:schema/@defaultPhase">
+ <xsl:value-of select="//iso:schema/@defaultPhase"/>
+ </xsl:when>
+ <xsl:otherwise>#ALL</xsl:otherwise>
+ </xsl:choose>
+</xsl:param>
+<xsl:param name="allow-foreign" >false</xsl:param>
+<xsl:param name="generate-paths" >true</xsl:param>
+<xsl:param name="generate-fired-rule" >true</xsl:param>
+<xsl:param name="optimize"/>
+
+<xsl:param name="output-encoding" ></xsl:param>
+
+<!-- e.g. saxon file.xml file.xsl "sch.exslt.imports=.../string.xsl;.../math.xsl" -->
+<xsl:param name="sch.exslt.imports" />
+
+
+
+<!-- Experimental: if this file is called, then it must be generating SVRL -->
+<xsl:variable name="svrlTest" select="true()" />
+
+
+
+<!-- ================================================================ -->
+
+<xsl:template name="process-prolog">
+ <axsl:output method="xml" omit-xml-declaration="no" standalone="yes"
+ indent="yes">
+ <xsl:if test=" string-length($output-encoding) &gt; 0">
+ <xsl:attribute name="encoding"><xsl:value-of select=" $output-encoding" /></xsl:attribute>
+ </xsl:if>
+ </axsl:output>
+
+</xsl:template>
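+<!-- Illustration (hypothetical value): invoking this stylesheet with output-encoding=iso-8859-1
+     makes the generated validator declare
+     <xsl:output method="xml" omit-xml-declaration="no" standalone="yes" indent="yes" encoding="iso-8859-1"/>. -->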
+
+<!-- Overrides skeleton.xsl -->
+<xsl:template name="process-root">
+ <xsl:param name="title"/>
+ <xsl:param name="contents" />
+ <xsl:param name="queryBinding" >xslt1</xsl:param>
+ <xsl:param name="schemaVersion" />
+ <xsl:param name="id" />
+ <xsl:param name="version"/>
+ <!-- "Rich" parameters -->
+ <xsl:param name="fpi" />
+ <xsl:param name="icon" />
+ <xsl:param name="lang" />
+ <xsl:param name="see" />
+ <xsl:param name="space" />
+
+ <svrl:schematron-output title="{$title}" schemaVersion="{$schemaVersion}" >
+ <xsl:if test=" string-length( normalize-space( $phase )) &gt; 0 and
+ not( normalize-space( $phase ) = '#ALL') ">
+ <axsl:attribute name="phase">
+ <xsl:value-of select=" $phase " />
+ </axsl:attribute>
+ </xsl:if>
+ <xsl:if test=" $allow-foreign = 'true'">
+
+ <xsl:call-template name='richParms'>
+ <xsl:with-param name="fpi" select="$fpi" />
+ <xsl:with-param name="icon" select="$icon"/>
+ <xsl:with-param name="lang" select="$lang"/>
+ <xsl:with-param name="see" select="$see" />
+ <xsl:with-param name="space" select="$space" />
+ </xsl:call-template>
+ </xsl:if>
+
+ <axsl:comment><axsl:value-of select="$archiveDirParameter"/> &#xA0;
+ <axsl:value-of select="$archiveNameParameter"/> &#xA0;
+ <axsl:value-of select="$fileNameParameter"/> &#xA0;
+ <axsl:value-of select="$fileDirParameter"/></axsl:comment>
+
+
+ <xsl:apply-templates mode="do-schema-p" />
+ <xsl:copy-of select="$contents" />
+ </svrl:schematron-output>
+</xsl:template>
+
+
+<xsl:template name="process-assert">
+ <xsl:param name="test"/>
+ <xsl:param name="diagnostics" />
+ <xsl:param name="id" />
+ <xsl:param name="flag" />
+ <!-- "Linkable" parameters -->
+ <xsl:param name="role"/>
+ <xsl:param name="subject"/>
+ <!-- "Rich" parameters -->
+ <xsl:param name="fpi" />
+ <xsl:param name="icon" />
+ <xsl:param name="lang" />
+ <xsl:param name="see" />
+ <xsl:param name="space" />
+ <svrl:failed-assert test="{$test}" >
+ <xsl:if test="string-length( $id ) &gt; 0">
+ <axsl:attribute name="id">
+ <xsl:value-of select=" $id " />
+ </axsl:attribute>
+ </xsl:if>
+ <xsl:if test=" string-length( $flag ) &gt; 0">
+ <axsl:attribute name="flag">
+ <xsl:value-of select=" $flag " />
+ </axsl:attribute>
+ </xsl:if>
+ <!-- Process rich attributes. -->
+ <xsl:call-template name="richParms">
+ <xsl:with-param name="fpi" select="$fpi"/>
+ <xsl:with-param name="icon" select="$icon"/>
+ <xsl:with-param name="lang" select="$lang"/>
+ <xsl:with-param name="see" select="$see" />
+ <xsl:with-param name="space" select="$space" />
+ </xsl:call-template>
+ <xsl:call-template name='linkableParms'>
+ <xsl:with-param name="role" select="$role" />
+ <xsl:with-param name="subject" select="$subject"/>
+ </xsl:call-template>
+ <xsl:if test=" $generate-paths = 'true' or $generate-paths= 'yes' ">
+ <!-- true/false is the new way -->
+ <axsl:attribute name="location">
+ <axsl:apply-templates select="." mode="schematron-get-full-path"/>
+ </axsl:attribute>
+ </xsl:if>
+
+ <svrl:text>
+ <xsl:apply-templates mode="text" />
+
+ </svrl:text>
+ <xsl:if test="$diagnose = 'yes' or $diagnose= 'true' ">
+ <!-- true/false is the new way -->
+ <xsl:call-template name="diagnosticsSplit">
+ <xsl:with-param name="str" select="$diagnostics"/>
+ </xsl:call-template>
+ </xsl:if>
+ </svrl:failed-assert>
+</xsl:template>
+
+<xsl:template name="process-report">
+ <xsl:param name="id"/>
+ <xsl:param name="test"/>
+ <xsl:param name="diagnostics"/>
+ <xsl:param name="flag" />
+ <!-- "Linkable" parameters -->
+ <xsl:param name="role"/>
+ <xsl:param name="subject"/>
+ <!-- "Rich" parameters -->
+ <xsl:param name="fpi" />
+ <xsl:param name="icon" />
+ <xsl:param name="lang" />
+ <xsl:param name="see" />
+ <xsl:param name="space" />
+ <svrl:successful-report test="{$test}" >
+ <xsl:if test=" string-length( $id ) &gt; 0">
+ <axsl:attribute name="id">
+ <xsl:value-of select=" $id " />
+ </axsl:attribute>
+ </xsl:if>
+ <xsl:if test=" string-length( $flag ) &gt; 0">
+ <axsl:attribute name="flag">
+ <xsl:value-of select=" $flag " />
+ </axsl:attribute>
+ </xsl:if>
+
+ <!-- Process rich attributes. -->
+ <xsl:call-template name="richParms">
+ <xsl:with-param name="fpi" select="$fpi"/>
+ <xsl:with-param name="icon" select="$icon"/>
+ <xsl:with-param name="lang" select="$lang"/>
+ <xsl:with-param name="see" select="$see" />
+ <xsl:with-param name="space" select="$space" />
+ </xsl:call-template>
+ <xsl:call-template name='linkableParms'>
+ <xsl:with-param name="role" select="$role" />
+ <xsl:with-param name="subject" select="$subject"/>
+ </xsl:call-template>
+ <xsl:if test=" $generate-paths = 'yes' or $generate-paths = 'true' ">
+ <!-- true/false is the new way -->
+ <axsl:attribute name="location">
+ <axsl:apply-templates select="." mode="schematron-get-full-path"/>
+ </axsl:attribute>
+ </xsl:if>
+
+ <svrl:text>
+ <xsl:apply-templates mode="text" />
+
+ </svrl:text>
+ <xsl:if test="$diagnose = 'yes' or $diagnose='true' ">
+ <!-- true/false is the new way -->
+ <xsl:call-template name="diagnosticsSplit">
+ <xsl:with-param name="str" select="$diagnostics"/>
+ </xsl:call-template>
+ </xsl:if>
+ </svrl:successful-report>
+</xsl:template>
+
+
+ <!-- Overrides skeleton -->
+ <xsl:template name="process-dir" >
+ <xsl:param name="value" />
+ <xsl:choose>
+ <xsl:when test=" $allow-foreign = 'true'">
+ <xsl:copy-of select="."/>
+ </xsl:when>
+
+ <xsl:otherwise>
+ <!-- We generate too much whitespace rather than risking concatenation -->
+ <axsl:text> </axsl:text>
+ <xsl:apply-templates mode="inline-text"/>
+ <axsl:text> </axsl:text>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:template>
+
+<xsl:template name="process-diagnostic">
+ <xsl:param name="id"/>
+ <!-- Rich parameters -->
+ <xsl:param name="fpi" />
+ <xsl:param name="icon" />
+ <xsl:param name="lang" />
+ <xsl:param name="see" />
+ <xsl:param name="space" />
+ <svrl:diagnostic-reference diagnostic="{$id}" >
+
+ <xsl:call-template name="richParms">
+ <xsl:with-param name="fpi" select="$fpi"/>
+ <xsl:with-param name="icon" select="$icon"/>
+ <xsl:with-param name="lang" select="$lang"/>
+ <xsl:with-param name="see" select="$see" />
+ <xsl:with-param name="space" select="$space" />
+ </xsl:call-template>
+<xsl:text>
+</xsl:text>
+
+ <xsl:apply-templates mode="text"/>
+
+ </svrl:diagnostic-reference>
+</xsl:template>
+
+
+ <!-- Overrides skeleton -->
+ <xsl:template name="process-emph" >
+ <xsl:param name="class" />
+ <xsl:choose>
+ <xsl:when test=" $allow-foreign = 'true'">
+ <xsl:copy-of select="."/>
+ </xsl:when>
+ <xsl:otherwise>
+ <!-- We generate too much whitespace rather than risking concatenation -->
+ <axsl:text> </axsl:text>
+ <xsl:apply-templates mode="inline-text"/>
+ <axsl:text> </axsl:text>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:template>
+
+<xsl:template name="process-rule">
+ <xsl:param name="id"/>
+ <xsl:param name="context"/>
+ <xsl:param name="flag"/>
+ <!-- "Linkable" parameters -->
+ <xsl:param name="role"/>
+ <xsl:param name="subject"/>
+ <!-- "Rich" parameters -->
+ <xsl:param name="fpi" />
+ <xsl:param name="icon" />
+ <xsl:param name="lang" />
+ <xsl:param name="see" />
+ <xsl:param name="space" />
+ <xsl:if test=" $generate-fired-rule = 'true'">
+ <svrl:fired-rule context="{$context}" >
+ <!-- Process rich attributes. -->
+ <xsl:call-template name="richParms">
+ <xsl:with-param name="fpi" select="$fpi"/>
+ <xsl:with-param name="icon" select="$icon"/>
+ <xsl:with-param name="lang" select="$lang"/>
+ <xsl:with-param name="see" select="$see" />
+ <xsl:with-param name="space" select="$space" />
+ </xsl:call-template>
+ <xsl:if test=" string( $id )">
+ <xsl:attribute name="id">
+ <xsl:value-of select=" $id " />
+ </xsl:attribute>
+ </xsl:if>
+ <xsl:if test=" string-length( $role ) &gt; 0">
+ <xsl:attribute name="role">
+ <xsl:value-of select=" $role " />
+ </xsl:attribute>
+ </xsl:if>
+ </svrl:fired-rule>
+</xsl:if>
+</xsl:template>
+
+<xsl:template name="process-ns">
+ <xsl:param name="prefix"/>
+ <xsl:param name="uri"/>
+ <svrl:ns-prefix-in-attribute-values uri="{$uri}" prefix="{$prefix}" />
+</xsl:template>
+
+<xsl:template name="process-p">
+ <xsl:param name="icon"/>
+ <xsl:param name="class"/>
+ <xsl:param name="id"/>
+ <xsl:param name="lang"/>
+
+ <svrl:text>
+ <xsl:apply-templates mode="text"/>
+ </svrl:text>
+</xsl:template>
+
+<xsl:template name="process-pattern">
+ <xsl:param name="name"/>
+ <xsl:param name="id"/>
+ <xsl:param name="is-a"/>
+
+ <!-- "Rich" parameters -->
+ <xsl:param name="fpi" />
+ <xsl:param name="icon" />
+ <xsl:param name="lang" />
+ <xsl:param name="see" />
+ <xsl:param name="space" />
+ <svrl:active-pattern >
+ <xsl:if test=" string( $id )">
+ <axsl:attribute name="id">
+ <xsl:value-of select=" $id " />
+ </axsl:attribute>
+ </xsl:if>
+ <xsl:if test=" string( $name )">
+ <axsl:attribute name="name">
+ <xsl:value-of select=" $name " />
+ </axsl:attribute>
+ </xsl:if>
+
+ <xsl:call-template name='richParms'>
+ <xsl:with-param name="fpi" select="$fpi"/>
+ <xsl:with-param name="icon" select="$icon"/>
+ <xsl:with-param name="lang" select="$lang"/>
+ <xsl:with-param name="see" select="$see" />
+ <xsl:with-param name="space" select="$space" />
+ </xsl:call-template>
+
+ <!-- ?? report that this screws up iso:title processing -->
+ <xsl:apply-templates mode="do-pattern-p"/>
+ <!-- ?? Seems that this apply-templates is never triggered DP -->
+ <axsl:apply-templates />
+ </svrl:active-pattern>
+</xsl:template>
+
+<!-- Overrides skeleton -->
+<xsl:template name="process-message" >
+ <xsl:param name="pattern"/>
+ <xsl:param name="role"/>
+</xsl:template>
+
+
+ <!-- Overrides skeleton -->
+ <xsl:template name="process-span" >
+ <xsl:param name="class" />
+ <xsl:choose>
+ <xsl:when test=" $allow-foreign = 'true'">
+ <xsl:copy-of select="."/>
+ </xsl:when>
+ <xsl:otherwise>
+ <!-- We generate too much whitespace rather than risking concatenation -->
+ <axsl:text> </axsl:text>
+ <xsl:apply-templates mode="inline-text"/>
+ <axsl:text> </axsl:text>
+ </xsl:otherwise>
+ </xsl:choose>
+ </xsl:template>
+
+<!-- =========================================================================== -->
+<!-- processing rich parameters. -->
+<xsl:template name='richParms'>
+ <!-- "Rich" parameters -->
+ <xsl:param name="fpi" />
+ <xsl:param name="icon" />
+ <xsl:param name="lang" />
+ <xsl:param name="see" />
+ <xsl:param name="space" />
+ <!-- Process rich attributes. -->
+ <xsl:if test=" $allow-foreign = 'true'">
+ <xsl:if test="string($fpi)">
+ <axsl:attribute name="fpi">
+ <xsl:value-of select="$fpi"/>
+ </axsl:attribute>
+ </xsl:if>
+ <xsl:if test="string($icon)">
+ <axsl:attribute name="icon">
+ <xsl:value-of select="$icon"/>
+ </axsl:attribute>
+ </xsl:if>
+ <xsl:if test="string($see)">
+ <axsl:attribute name="see">
+ <xsl:value-of select="$see"/>
+ </axsl:attribute>
+ </xsl:if>
+ </xsl:if>
+ <xsl:if test="string($space)">
+ <axsl:attribute name="xml:space">
+ <xsl:value-of select="$space"/>
+ </axsl:attribute>
+ </xsl:if>
+ <xsl:if test="string($lang)">
+ <axsl:attribute name="xml:lang">
+ <xsl:value-of select="$lang"/>
+ </axsl:attribute>
+ </xsl:if>
+</xsl:template>
+
+<!-- processing linkable parameters. -->
+<xsl:template name='linkableParms'>
+ <xsl:param name="role"/>
+ <xsl:param name="subject"/>
+
+ <!-- ISO SVRL has a role attribute to match the Schematron role attribute -->
+ <xsl:if test=" string($role )">
+ <axsl:attribute name="role">
+ <xsl:value-of select=" $role " />
+ </axsl:attribute>
+ </xsl:if>
+ <!-- ISO SVRL does not have a subject attribute to match the Schematron subject attribute.
+ Instead, the Schematron subject attribute is folded into the location attribute -->
+</xsl:template>
+
+
+</xsl:stylesheet>
+
diff --git a/src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/readme.txt b/src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/readme.txt
new file mode 100644
index 0000000..e5d6dfc
--- /dev/null
+++ b/src/lxml/isoschematron/resources/xsl/iso-schematron-xslt1/readme.txt
@@ -0,0 +1,84 @@
+ISO SCHEMATRON 2010
+
+XSLT implementation by Rick Jelliffe with assistance from members of the Schematron-love-in mailing list.
+
+2010-04-21
+
+Two distributions are available. One is for XSLT1 engines.
+The other is for XSLT2 engines, such as SAXON 9.
+
+
+This version of Schematron splits the process into a pipeline of several different XSLT stages.
+
+1) First, preprocess your Schematron schema with iso_dsdl_include.xsl.
+This is a macro processor to assemble the schema from various parts.
+If your schema is not in separate parts, you can skip this stage.
+This stage also generates error messages for some common XPath syntax problems.
+
+2) Second, preprocess the output from stage 1 with iso_abstract_expand.xsl.
+This is a macro processor to convert abstract patterns to real patterns.
+If your schema does not use abstract patterns, you can skip this
+stage.
+
+3) Third, compile the Schematron schema into an XSLT script.
+This will typically use iso_svrl_for_xslt1.xsl or iso_svrl_for_xslt2.xsl
+(which in turn invoke iso_schematron_skeleton_for_xslt1.xsl or iso_schematron_skeleton_for_saxon.xsl).
+However, other "meta-stylesheets" are also in common use; the principle of operation is the same.
+If your schema uses Schematron phases, supply these as command line/invocation parameters
+to this process.
+
+4) Fourth, run the script generated by stage 3 against the document being validated.
+If you are using the SVRL script, then the output of validation will be an XML document.
+If your schema uses Schematron parameters, supply these as command line/invocation parameters
+to this process.
+
+
+The XSLT2 distribution also offers several next-generation features,
+such as validating multiple documents. See the source code for details.
+
+Schematron assertions can be written in any language, of course; the file
+sch-messages-en.xhtml contains the diagnostic messages from the XSLT2 skeleton
+in English, and it can be used as a template to localize the skeleton's
+error messages. Note that programming errors in Schematron are typically XPath
+errors, which require localized messages from the XSLT engine.
+
+ANT
+---
+To give an example of how to process a document, here is a sample ANT task.
+
+<target name="schematron-compile-test" >
+
+ <!-- expand inclusions -->
+ <xslt basedir="test/schematron"
+ style="iso_dsdl_include.xsl" in="test.sch" out="test1.sch">
+ <classpath>
+ <pathelement location="${lib.dir}/saxon9.jar"/>
+ </classpath>
+ </xslt>
+
+ <!-- expand abstract patterns -->
+ <xslt basedir="test/schematron"
+ style="iso_abstract_expand.xsl" in="test1.sch" out="test2.sch">
+ <classpath>
+ <pathelement location="${lib.dir}/saxon9.jar"/>
+ </classpath>
+ </xslt>
+
+
+
+ <!-- compile it -->
+ <xslt basedir="test/schematron"
+ style="iso_svrl_for_xslt2.xsl" in="test2.sch" out="test.xsl">
+ <classpath>
+ <pathelement location="${lib.dir}/saxon9.jar"/>
+ </classpath>
+ </xslt>
+
+ <!-- validate -->
+ <xslt basedir="test/schematron"
+ style="test.xsl" in="instance.xml" out="instance.svrlt">
+ <classpath>
+ <pathelement location="${lib.dir}/saxon9.jar"/>
+ </classpath>
+ </xslt>
+ </target>
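+
+The same pipeline can also be driven from lxml itself. The following is a
+minimal sketch, not a supported recipe: it assumes the three stylesheets plus
+test.sch and instance.xml are readable from the current directory (the
+lxml.isoschematron module wraps these steps behind a single class).
+
+ from lxml import etree
+
+ # stages 1 and 2: expand inclusions, then abstract patterns
+ include = etree.XSLT(etree.parse("iso_dsdl_include.xsl"))
+ expand = etree.XSLT(etree.parse("iso_abstract_expand.xsl"))
+
+ # stage 3: compile the schema into a validating stylesheet
+ to_xslt = etree.XSLT(etree.parse("iso_svrl_for_xslt1.xsl"))
+ compiled = to_xslt(expand(include(etree.parse("test.sch"))))
+
+ # stage 4: apply the generated stylesheet to the instance document
+ validate = etree.XSLT(compiled)
+ svrl = validate(etree.parse("instance.xml"))
+ print(etree.tostring(svrl, pretty_print=True).decode())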
diff --git a/src/lxml/iterparse.pxi b/src/lxml/iterparse.pxi
new file mode 100644
index 0000000..4c20506
--- /dev/null
+++ b/src/lxml/iterparse.pxi
@@ -0,0 +1,438 @@
+# iterparse -- event-driven parsing
+
+DEF __ITERPARSE_CHUNK_SIZE = 32768
+
+cdef class iterparse:
+ u"""iterparse(self, source, events=("end",), tag=None, \
+ attribute_defaults=False, dtd_validation=False, \
+ load_dtd=False, no_network=True, remove_blank_text=False, \
+ remove_comments=False, remove_pis=False, encoding=None, \
+ html=False, recover=None, huge_tree=False, schema=None)
+
+ Incremental parser.
+
+ Parses XML into a tree and generates tuples (event, element) in a
+ SAX-like fashion. ``event`` is any of 'start', 'end', 'start-ns',
+ 'end-ns'.
+
+ For 'start' and 'end', ``element`` is the Element that the parser just
+ found opening or closing. For 'start-ns', it is a tuple (prefix, URI) of
+ a new namespace declaration. For 'end-ns', it is simply None. Note that
+ all start and end events are guaranteed to be properly nested.
+
+ The keyword argument ``events`` specifies a sequence of event type names
+ that should be generated. By default, only 'end' events will be
+ generated.
+
+ The additional ``tag`` argument restricts the 'start' and 'end' events to
+ those elements that match the given tag. The ``tag`` argument can also be
+ a sequence of tags to allow matching more than one tag. By default,
+ events are generated for all elements. Note that the 'start-ns' and
+ 'end-ns' events are not impacted by this restriction.
+
+ The other keyword arguments in the constructor are mainly based on the
+ libxml2 parser configuration. A DTD will also be loaded if validation or
+ attribute default values are requested.
+
+ Available boolean keyword arguments:
+ - attribute_defaults: read default attributes from DTD
+ - dtd_validation: validate (if DTD is available)
+ - load_dtd: use DTD for parsing
+ - no_network: prevent network access for related files
+ - remove_blank_text: discard blank text nodes
+ - remove_comments: discard comments
+ - remove_pis: discard processing instructions
+ - strip_cdata: replace CDATA sections by normal text content (default: True)
+ - compact: save memory for short text content (default: True)
+ - resolve_entities: replace entities by their text value (default: True)
+ - huge_tree: disable security restrictions and support very deep trees
+ and very long text content (only affects libxml2 2.7+)
+ - html: parse input as HTML (default: XML)
+ - recover: try hard to parse through broken input (default: True for HTML,
+ False otherwise)
+
+ Other keyword arguments:
+ - encoding: override the document encoding
+ - schema: an XMLSchema to validate against
+ """
+ cdef _FeedParser _parser
+ cdef object _tag
+ cdef object _events
+ cdef readonly object root
+ cdef object _source
+ cdef object _filename
+ cdef object _error
+ cdef bint _close_source_after_read
+
+ def __init__(self, source, events=(u"end",), *, tag=None,
+ attribute_defaults=False, dtd_validation=False,
+ load_dtd=False, no_network=True, remove_blank_text=False,
+ compact=True, resolve_entities=True, remove_comments=False,
+ remove_pis=False, strip_cdata=True, encoding=None,
+ html=False, recover=None, huge_tree=False, collect_ids=True,
+ XMLSchema schema=None):
+ if not hasattr(source, 'read'):
+ self._filename = source
+ if python.IS_PYTHON2:
+ source = _encodeFilename(source)
+ source = open(source, 'rb')
+ self._close_source_after_read = True
+ else:
+ self._filename = _getFilenameForFile(source)
+ self._close_source_after_read = False
+
+ if recover is None:
+ recover = html
+
+ if html:
+ # make sure we're not looking for namespaces
+ events = [event for event in events
+ if event not in ('start-ns', 'end-ns')]
+ parser = HTMLPullParser(
+ events,
+ tag=tag,
+ recover=recover,
+ base_url=self._filename,
+ encoding=encoding,
+ remove_blank_text=remove_blank_text,
+ remove_comments=remove_comments,
+ remove_pis=remove_pis,
+ strip_cdata=strip_cdata,
+ no_network=no_network,
+ target=None, # TODO
+ schema=schema,
+ compact=compact)
+ else:
+ parser = XMLPullParser(
+ events,
+ tag=tag,
+ recover=recover,
+ base_url=self._filename,
+ encoding=encoding,
+ attribute_defaults=attribute_defaults,
+ dtd_validation=dtd_validation,
+ load_dtd=load_dtd,
+ no_network=no_network,
+ schema=schema,
+ huge_tree=huge_tree,
+ remove_blank_text=remove_blank_text,
+ resolve_entities=resolve_entities,
+ remove_comments=remove_comments,
+ remove_pis=remove_pis,
+ strip_cdata=strip_cdata,
+ collect_ids=collect_ids,
+ target=None, # TODO
+ compact=compact)
+
+ self._events = parser.read_events()
+ self._parser = parser
+ self._source = source
+
+ @property
+ def error_log(self):
+ """The error log of the last (or current) parser run.
+ """
+ return self._parser.feed_error_log
+
+ @property
+ def resolvers(self):
+ """The custom resolver registry of the last (or current) parser run.
+ """
+ return self._parser.resolvers
+
+ @property
+ def version(self):
+ """The version of the underlying XML parser."""
+ return self._parser.version
+
+ def set_element_class_lookup(self, ElementClassLookup lookup = None):
+ u"""set_element_class_lookup(self, lookup = None)
+
+ Set a lookup scheme for element classes generated from this parser.
+
+ Reset it by passing None or nothing.
+ """
+ self._parser.set_element_class_lookup(lookup)
+
+ def makeelement(self, _tag, attrib=None, nsmap=None, **_extra):
+ u"""makeelement(self, _tag, attrib=None, nsmap=None, **_extra)
+
+ Creates a new element associated with this parser.
+ """
+ return self._parser.makeelement(
+ _tag, attrib=attrib, nsmap=nsmap, **_extra)
+
+ @cython.final
+ cdef _close_source(self):
+ if self._source is None:
+ return
+ if not self._close_source_after_read:
+ self._source = None
+ return
+ try:
+ close = self._source.close
+ except AttributeError:
+ close = None
+ finally:
+ self._source = None
+ if close is not None:
+ close()
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ try:
+ return next(self._events)
+ except StopIteration:
+ pass
+ context = <_SaxParserContext>self._parser._getPushParserContext()
+ if self._source is not None:
+ done = False
+ while not done:
+ try:
+ done = self._read_more_events(context)
+ return next(self._events)
+ except StopIteration:
+ pass # no events yet
+ except Exception as e:
+ self._error = e
+ self._close_source()
+ try:
+ return next(self._events)
+ except StopIteration:
+ break
+ # nothing left to read or return
+ if self._error is not None:
+ error = self._error
+ self._error = None
+ raise error
+ if (context._validator is not None
+ and not context._validator.isvalid()):
+ _raiseParseError(context._c_ctxt, self._filename,
+ context._error_log)
+ # no errors => all done
+ raise StopIteration
+
+ @cython.final
+ cdef bint _read_more_events(self, _SaxParserContext context) except -123:
+ data = self._source.read(__ITERPARSE_CHUNK_SIZE)
+ if not isinstance(data, bytes):
+ self._close_source()
+ raise TypeError("reading file objects must return bytes objects")
+ if not data:
+ try:
+ self.root = self._parser.close()
+ finally:
+ self._close_source()
+ return True
+ self._parser.feed(data)
+ return False
+
+
+cdef enum _IterwalkSkipStates:
+ IWSKIP_NEXT_IS_START
+ IWSKIP_SKIP_NEXT
+ IWSKIP_CAN_SKIP
+ IWSKIP_CANNOT_SKIP
+
+
+cdef class iterwalk:
+ u"""iterwalk(self, element_or_tree, events=("end",), tag=None)
+
+ A tree walker that generates events from an existing tree as if it
+ was parsing XML data with ``iterparse()``.
+
+ Just as for ``iterparse()``, the ``tag`` argument can be a single tag or a
+ sequence of tags.
+
+ After receiving a 'start' or 'start-ns' event, the children and
+ descendants of the current element can be excluded from iteration
+ by calling the ``skip_subtree()`` method.
+ """
+ cdef _MultiTagMatcher _matcher
+ cdef list _node_stack
+ cdef list _events
+ cdef object _pop_event
+ cdef object _include_siblings
+ cdef int _index
+ cdef int _event_filter
+ cdef _IterwalkSkipStates _skip_state
+
+ def __init__(self, element_or_tree, events=(u"end",), tag=None):
+ cdef _Element root
+ cdef int ns_count
+ root = _rootNodeOrRaise(element_or_tree)
+ self._event_filter = _buildParseEventFilter(events)
+ if tag is None or tag == '*':
+ self._matcher = None
+ else:
+ self._matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tag)
+ self._node_stack = []
+ self._events = []
+ self._pop_event = self._events.pop
+ self._skip_state = IWSKIP_CANNOT_SKIP # ignore all skip requests by default
+
+ if self._event_filter:
+ self._index = 0
+ if self._matcher is not None and self._event_filter & PARSE_EVENT_FILTER_START:
+ self._matcher.cacheTags(root._doc)
+
+ # When processing an ElementTree, add events for the preceding comments/PIs.
+ if self._event_filter & (PARSE_EVENT_FILTER_COMMENT | PARSE_EVENT_FILTER_PI):
+ if isinstance(element_or_tree, _ElementTree):
+ self._include_siblings = root
+ for elem in list(root.itersiblings(preceding=True))[::-1]:
+ if self._event_filter & PARSE_EVENT_FILTER_COMMENT and elem.tag is Comment:
+ self._events.append((u'comment', elem))
+ elif self._event_filter & PARSE_EVENT_FILTER_PI and elem.tag is PI:
+ self._events.append((u'pi', elem))
+
+ ns_count = self._start_node(root)
+ self._node_stack.append( (root, ns_count) )
+ else:
+ self._index = -1
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ cdef xmlNode* c_child
+ cdef _Element node
+ cdef _Element next_node
+ cdef int ns_count = 0
+ if self._events:
+ return self._next_event()
+ if self._matcher is not None and self._index >= 0:
+ node = self._node_stack[self._index][0]
+ self._matcher.cacheTags(node._doc)
+
+ # find next node
+ while self._index >= 0:
+ node = self._node_stack[self._index][0]
+
+ if self._skip_state == IWSKIP_SKIP_NEXT:
+ c_child = NULL
+ else:
+ c_child = self._process_non_elements(
+ node._doc, _findChildForwards(node._c_node, 0))
+ self._skip_state = IWSKIP_CANNOT_SKIP
+
+ while c_child is NULL:
+ # back off through parents
+ self._index -= 1
+ node = self._end_node()
+ if self._index < 0:
+ break
+ c_child = self._process_non_elements(
+ node._doc, _nextElement(node._c_node))
+
+ if c_child is not NULL:
+ next_node = _elementFactory(node._doc, c_child)
+ if self._event_filter & (PARSE_EVENT_FILTER_START |
+ PARSE_EVENT_FILTER_START_NS):
+ ns_count = self._start_node(next_node)
+ elif self._event_filter & PARSE_EVENT_FILTER_END_NS:
+ ns_count = _countNsDefs(next_node._c_node)
+ self._node_stack.append( (next_node, ns_count) )
+ self._index += 1
+ if self._events:
+ return self._next_event()
+
+ if self._include_siblings is not None:
+ node, self._include_siblings = self._include_siblings, None
+ self._process_non_elements(node._doc, _nextElement(node._c_node))
+ if self._events:
+ return self._next_event()
+
+ raise StopIteration
+
+ @cython.final
+ cdef xmlNode* _process_non_elements(self, _Document doc, xmlNode* c_node):
+ while c_node is not NULL and c_node.type != tree.XML_ELEMENT_NODE:
+ if c_node.type == tree.XML_COMMENT_NODE:
+ if self._event_filter & PARSE_EVENT_FILTER_COMMENT:
+ self._events.append(
+ (u"comment", _elementFactory(doc, c_node)))
+ c_node = _nextElement(c_node)
+ elif c_node.type == tree.XML_PI_NODE:
+ if self._event_filter & PARSE_EVENT_FILTER_PI:
+ self._events.append(
+ (u"pi", _elementFactory(doc, c_node)))
+ c_node = _nextElement(c_node)
+ else:
+ break
+ return c_node
+
+ @cython.final
+ cdef _next_event(self):
+ if self._skip_state == IWSKIP_NEXT_IS_START:
+ if self._events[0][0] in (u'start', u'start-ns'):
+ self._skip_state = IWSKIP_CAN_SKIP
+ return self._pop_event(0)
+
+ def skip_subtree(self):
+ """Prevent descending into the current subtree.
+ Instead, the next returned event will be the 'end' event of the current element
+ (if included), ignoring any children or descendants.
+
+ This has no effect right after an 'end' or 'end-ns' event.
+ """
+ if self._skip_state == IWSKIP_CAN_SKIP:
+ self._skip_state = IWSKIP_SKIP_NEXT
+
+ @cython.final
+ cdef int _start_node(self, _Element node) except -1:
+ cdef int ns_count
+ if self._event_filter & PARSE_EVENT_FILTER_START_NS:
+ ns_count = _appendStartNsEvents(node._c_node, self._events)
+ if self._events:
+ self._skip_state = IWSKIP_NEXT_IS_START
+ elif self._event_filter & PARSE_EVENT_FILTER_END_NS:
+ ns_count = _countNsDefs(node._c_node)
+ else:
+ ns_count = 0
+ if self._event_filter & PARSE_EVENT_FILTER_START:
+ if self._matcher is None or self._matcher.matches(node._c_node):
+ self._events.append( (u"start", node) )
+ self._skip_state = IWSKIP_NEXT_IS_START
+ return ns_count
+
+ @cython.final
+ cdef _Element _end_node(self):
+ cdef _Element node
+ cdef int i, ns_count
+ node, ns_count = self._node_stack.pop()
+ if self._event_filter & PARSE_EVENT_FILTER_END:
+ if self._matcher is None or self._matcher.matches(node._c_node):
+ self._events.append( (u"end", node) )
+ if self._event_filter & PARSE_EVENT_FILTER_END_NS and ns_count:
+ event = (u"end-ns", None)
+ for i in range(ns_count):
+ self._events.append(event)
+ return node
+
+
+cdef int _countNsDefs(xmlNode* c_node):
+ cdef xmlNs* c_ns
+ cdef int count
+ count = 0
+ c_ns = c_node.nsDef
+ while c_ns is not NULL:
+ count += 1
+ c_ns = c_ns.next
+ return count
+
+
+cdef int _appendStartNsEvents(xmlNode* c_node, list event_list) except -1:
+ cdef xmlNs* c_ns
+ cdef int count
+ count = 0
+ c_ns = c_node.nsDef
+ while c_ns is not NULL:
+ ns_tuple = (funicode(c_ns.prefix) if c_ns.prefix is not NULL else '',
+ funicode(c_ns.href))
+ event_list.append( (u"start-ns", ns_tuple) )
+ count += 1
+ c_ns = c_ns.next
+ return count
diff --git a/src/lxml/lxml_endian.h b/src/lxml/lxml_endian.h
new file mode 100644
index 0000000..f53cb7a
--- /dev/null
+++ b/src/lxml/lxml_endian.h
@@ -0,0 +1,14 @@
+#ifndef PY_BIG_ENDIAN
+
+#ifdef _MSC_VER
+typedef unsigned __int32 uint32_t;
+#else
+#include <stdint.h>
+#endif
+
+static CYTHON_INLINE int _lx__is_big_endian(void) {
+ union {uint32_t i; char c[4];} x = {0x01020304};
+ return x.c[0] == 1;
+}
+#define PY_BIG_ENDIAN _lx__is_big_endian()
+#endif
diff --git a/src/lxml/nsclasses.pxi b/src/lxml/nsclasses.pxi
new file mode 100644
index 0000000..274277d
--- /dev/null
+++ b/src/lxml/nsclasses.pxi
@@ -0,0 +1,281 @@
+# module-level API for namespace implementations
+
+cdef class LxmlRegistryError(LxmlError):
+ """Base class of lxml registry errors.
+ """
+
+cdef class NamespaceRegistryError(LxmlRegistryError):
+ """Error registering a namespace extension.
+ """
+
+
+@cython.internal
+cdef class _NamespaceRegistry:
+ u"Dictionary-like namespace registry"
+ cdef object _ns_uri
+ cdef bytes _ns_uri_utf
+ cdef dict _entries
+ cdef char* _c_ns_uri_utf
+ def __cinit__(self, ns_uri):
+ self._ns_uri = ns_uri
+ if ns_uri is None:
+ self._ns_uri_utf = None
+ self._c_ns_uri_utf = NULL
+ else:
+ self._ns_uri_utf = _utf8(ns_uri)
+ self._c_ns_uri_utf = _cstr(self._ns_uri_utf)
+ self._entries = {}
+
+ def update(self, class_dict_iterable):
+ u"""update(self, class_dict_iterable)
+
+ Forgivingly update the registry.
+
+ ``class_dict_iterable`` may be a dict or some other iterable
+ that yields (name, value) pairs.
+
+ If a value does not match the required type for this registry,
+ or if the name starts with '_', it will be silently discarded.
+ This allows registrations at the module or class level using
+ vars(), globals() etc."""
+ if hasattr(class_dict_iterable, u'items'):
+ class_dict_iterable = class_dict_iterable.items()
+ for name, item in class_dict_iterable:
+ if (name is None or name[:1] != '_') and callable(item):
+ self[name] = item
+
+ def __getitem__(self, name):
+ if name is not None:
+ name = _utf8(name)
+ return self._get(name)
+
+ def __delitem__(self, name):
+ if name is not None:
+ name = _utf8(name)
+ del self._entries[name]
+
+ cdef object _get(self, object name):
+ cdef python.PyObject* dict_result
+ dict_result = python.PyDict_GetItem(self._entries, name)
+ if dict_result is NULL:
+ raise KeyError, u"Name not registered."
+ return <object>dict_result
+
+ cdef object _getForString(self, char* name):
+ cdef python.PyObject* dict_result
+ dict_result = python.PyDict_GetItem(self._entries, name)
+ if dict_result is NULL:
+ raise KeyError, u"Name not registered."
+ return <object>dict_result
+
+ def __iter__(self):
+ return iter(self._entries)
+
+ def items(self):
+ return list(self._entries.items())
+
+ def iteritems(self):
+ return iter(self._entries.items())
+
+ def clear(self):
+ self._entries.clear()
+
+ def __call__(self, obj):
+ # Usage as decorator:
+ # ns = lookup.get_namespace("...")
+ # @ns('abc')
+ # class element(ElementBase): pass
+ #
+ # @ns
+ # class elementname(ElementBase): pass
+
+ if obj is None or python._isString(obj):
+ # @ns(None) or @ns('tag')
+ return partial(self.__deco, obj)
+ # plain @ns decorator
+ self[obj.__name__] = obj
+ return obj
+
+ def __deco(self, name, obj):
+ self[name] = obj
+ return obj
+
+
+@cython.final
+@cython.internal
+cdef class _ClassNamespaceRegistry(_NamespaceRegistry):
+ u"Dictionary-like registry for namespace implementation classes"
+ def __setitem__(self, name, item):
+ if not isinstance(item, type) or not issubclass(item, ElementBase):
+ raise NamespaceRegistryError, \
+ u"Registered element classes must be subtypes of ElementBase"
+ if name is not None:
+ name = _utf8(name)
+ self._entries[name] = item
+
+ def __repr__(self):
+ return u"Namespace(%r)" % self._ns_uri
+
+
+cdef class ElementNamespaceClassLookup(FallbackElementClassLookup):
+ u"""ElementNamespaceClassLookup(self, fallback=None)
+
+ Element class lookup scheme that searches the Element class in the
+ Namespace registry.
+
+ Usage:
+
+ >>> lookup = ElementNamespaceClassLookup()
+ >>> ns_elements = lookup.get_namespace("http://schema.org/Movie")
+
+ >>> @ns_elements
+ ... class movie(ElementBase):
+ ... "Element implementation for 'movie' tag (using class name) in schema namespace."
+
+ >>> @ns_elements("movie")
+ ... class MovieElement(ElementBase):
+ ... "Element implementation for 'movie' tag (explicit tag name) in schema namespace."
+ """
+ cdef dict _namespace_registries
+ def __cinit__(self):
+ self._namespace_registries = {}
+
+ def __init__(self, ElementClassLookup fallback=None):
+ FallbackElementClassLookup.__init__(self, fallback)
+ self._lookup_function = _find_nselement_class
+
+ def get_namespace(self, ns_uri):
+ u"""get_namespace(self, ns_uri)
+
+ Retrieve the namespace object associated with the given URI.
+ Pass None for the empty namespace.
+
+ Creates a new namespace object if it does not yet exist."""
+ if ns_uri:
+ ns_utf = _utf8(ns_uri)
+ else:
+ ns_utf = None
+ try:
+ return self._namespace_registries[ns_utf]
+ except KeyError:
+ registry = self._namespace_registries[ns_utf] = \
+ _ClassNamespaceRegistry(ns_uri)
+ return registry
+
+cdef object _find_nselement_class(state, _Document doc, xmlNode* c_node):
+ cdef python.PyObject* dict_result
+ cdef ElementNamespaceClassLookup lookup
+ cdef _NamespaceRegistry registry
+ if state is None:
+ return _lookupDefaultElementClass(None, doc, c_node)
+
+ lookup = <ElementNamespaceClassLookup>state
+ if c_node.type != tree.XML_ELEMENT_NODE:
+ return _callLookupFallback(lookup, doc, c_node)
+
+ c_namespace_utf = _getNs(c_node)
+ if c_namespace_utf is not NULL:
+ dict_result = python.PyDict_GetItem(
+ lookup._namespace_registries, <unsigned char*>c_namespace_utf)
+ else:
+ dict_result = python.PyDict_GetItem(
+ lookup._namespace_registries, None)
+ if dict_result is not NULL:
+ registry = <_NamespaceRegistry>dict_result
+ classes = registry._entries
+
+ if c_node.name is not NULL:
+ dict_result = python.PyDict_GetItem(
+ classes, <unsigned char*>c_node.name)
+ else:
+ dict_result = NULL
+
+ if dict_result is NULL:
+ dict_result = python.PyDict_GetItem(classes, None)
+
+ if dict_result is not NULL:
+ return <object>dict_result
+ return _callLookupFallback(lookup, doc, c_node)
+
+
+################################################################################
+# XPath extension functions
+
+cdef dict __FUNCTION_NAMESPACE_REGISTRIES
+__FUNCTION_NAMESPACE_REGISTRIES = {}
+
+def FunctionNamespace(ns_uri):
+ u"""FunctionNamespace(ns_uri)
+
+ Retrieve the function namespace object associated with the given
+ URI.
+
+ Creates a new one if it does not yet exist. A function namespace
+ can only be used to register extension functions.
+
+ Usage:
+
+ >>> ns_functions = FunctionNamespace("http://schema.org/Movie")
+
+ >>> @ns_functions # uses function name
+ ... def add2(x):
+ ... return x + 2
+
+ >>> @ns_functions("add3") # uses explicit name
+ ... def add_three(x):
+ ... return x + 3
+ """
+ ns_utf = _utf8(ns_uri) if ns_uri else None
+ try:
+ return __FUNCTION_NAMESPACE_REGISTRIES[ns_utf]
+ except KeyError:
+ registry = __FUNCTION_NAMESPACE_REGISTRIES[ns_utf] = \
+ _XPathFunctionNamespaceRegistry(ns_uri)
+ return registry
+
+@cython.internal
+cdef class _FunctionNamespaceRegistry(_NamespaceRegistry):
+ def __setitem__(self, name, item):
+ if not callable(item):
+ raise NamespaceRegistryError, \
+ u"Registered functions must be callable."
+ if not name:
+ raise ValueError, \
+ u"extensions must have non empty names"
+ self._entries[_utf8(name)] = item
+
+ def __repr__(self):
+ return u"FunctionNamespace(%r)" % self._ns_uri
+
+@cython.final
+@cython.internal
+cdef class _XPathFunctionNamespaceRegistry(_FunctionNamespaceRegistry):
+ cdef object _prefix
+ cdef bytes _prefix_utf
+
+ property prefix:
+ u"Namespace prefix for extension functions."
+ def __del__(self):
+ self._prefix = None # no prefix configured
+ self._prefix_utf = None
+ def __get__(self):
+ if self._prefix is None:
+ return ''
+ else:
+ return self._prefix
+ def __set__(self, prefix):
+ if prefix == '':
+ prefix = None # empty prefix
+ self._prefix_utf = _utf8(prefix) if prefix is not None else None
+ self._prefix = prefix
+
+cdef list _find_all_extension_prefixes():
+ u"Internal lookup function to find all function prefixes for XSLT/XPath."
+ cdef _XPathFunctionNamespaceRegistry registry
+ cdef list ns_prefixes = []
+ for registry in __FUNCTION_NAMESPACE_REGISTRIES.itervalues():
+ if registry._prefix_utf is not None:
+ if registry._ns_uri_utf is not None:
+ ns_prefixes.append(
+ (registry._prefix_utf, registry._ns_uri_utf))
+ return ns_prefixes
diff --git a/src/lxml/objectify.pyx b/src/lxml/objectify.pyx
new file mode 100644
index 0000000..d1880ff
--- /dev/null
+++ b/src/lxml/objectify.pyx
@@ -0,0 +1,1981 @@
+# cython: binding=True
+# cython: auto_pickle=False
+# cython: language_level=2
+
+"""
+The ``lxml.objectify`` module implements a Python object API for XML.
+It is based on `lxml.etree`.
+"""
+
+from __future__ import absolute_import
+
+cimport cython
+
+from lxml.includes.etreepublic cimport _Document, _Element, ElementBase, ElementClassLookup
+from lxml.includes.etreepublic cimport elementFactory, import_lxml__etree, textOf, pyunicode
+from lxml.includes.tree cimport const_xmlChar, _xcstr
+from lxml cimport python
+from lxml.includes cimport tree
+
+cimport lxml.includes.etreepublic as cetree
+cimport libc.string as cstring_h # not to be confused with stdlib 'string'
+from libc.string cimport const_char
+
+__all__ = [u'BoolElement', u'DataElement', u'E', u'Element', u'ElementMaker',
+ u'FloatElement', u'IntElement', u'LongElement', u'NoneElement',
+ u'NumberElement', u'ObjectPath', u'ObjectifiedDataElement',
+ u'ObjectifiedElement', u'ObjectifyElementClassLookup',
+ u'PYTYPE_ATTRIBUTE', u'PyType', u'StringElement', u'SubElement',
+ u'XML', u'annotate', u'deannotate', u'dump', u'enable_recursive_str',
+ u'fromstring', u'getRegisteredTypes', u'makeparser', u'parse',
+ u'pyannotate', u'pytypename', u'set_default_parser',
+ u'set_pytype_attribute_tag', u'xsiannotate']
+
+cdef object etree
+from lxml import etree
+# initialize C-API of lxml.etree
+import_lxml__etree()
+
+__version__ = etree.__version__
+
+cdef object re
+import re
+
+cdef tuple IGNORABLE_ERRORS = (ValueError, TypeError)
+cdef object is_special_method = re.compile(u'__.*__$').match
+
+
+# Duplicated from apihelpers.pxi, since dependencies obstruct
+# including apihelpers.pxi.
+cdef strrepr(s):
+ """Build a representation of strings which we can use in __repr__
+ methods, e.g. _Element.__repr__().
+ """
+ return s.encode('unicode-escape') if python.IS_PYTHON2 else s
+
+
+cdef object _typename(object t):
+ cdef const_char* c_name
+ c_name = python._fqtypename(t)
+ s = cstring_h.strrchr(c_name, c'.')
+ if s is not NULL:
+ c_name = s + 1
+ return pyunicode(<const_xmlChar*>c_name)
+
+
+# namespace/name for "pytype" hint attribute
+cdef object PYTYPE_NAMESPACE
+cdef bytes PYTYPE_NAMESPACE_UTF8
+cdef const_xmlChar* _PYTYPE_NAMESPACE
+
+cdef object PYTYPE_ATTRIBUTE_NAME
+cdef bytes PYTYPE_ATTRIBUTE_NAME_UTF8
+cdef const_xmlChar* _PYTYPE_ATTRIBUTE_NAME
+
+PYTYPE_ATTRIBUTE = None
+
+cdef unicode TREE_PYTYPE_NAME = u"TREE"
+
+cdef tuple _unicodeAndUtf8(s):
+ return s, python.PyUnicode_AsUTF8String(s)
+
+def set_pytype_attribute_tag(attribute_tag=None):
+ u"""set_pytype_attribute_tag(attribute_tag=None)
+ Change name and namespace of the XML attribute that holds Python type
+ information.
+
+ Do not use this unless you know what you are doing.
+
+ Reset by calling without argument.
+
+ Default: "{http://codespeak.net/lxml/objectify/pytype}pytype"
+ """
+ global PYTYPE_ATTRIBUTE, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME
+ global PYTYPE_NAMESPACE, PYTYPE_NAMESPACE_UTF8
+ global PYTYPE_ATTRIBUTE_NAME, PYTYPE_ATTRIBUTE_NAME_UTF8
+ if attribute_tag is None:
+ PYTYPE_NAMESPACE, PYTYPE_NAMESPACE_UTF8 = \
+ _unicodeAndUtf8(u"http://codespeak.net/lxml/objectify/pytype")
+ PYTYPE_ATTRIBUTE_NAME, PYTYPE_ATTRIBUTE_NAME_UTF8 = \
+ _unicodeAndUtf8(u"pytype")
+ else:
+ PYTYPE_NAMESPACE_UTF8, PYTYPE_ATTRIBUTE_NAME_UTF8 = \
+ cetree.getNsTag(attribute_tag)
+ PYTYPE_NAMESPACE = PYTYPE_NAMESPACE_UTF8.decode('utf8')
+ PYTYPE_ATTRIBUTE_NAME = PYTYPE_ATTRIBUTE_NAME_UTF8.decode('utf8')
+
+ _PYTYPE_NAMESPACE = PYTYPE_NAMESPACE_UTF8
+ _PYTYPE_ATTRIBUTE_NAME = PYTYPE_ATTRIBUTE_NAME_UTF8
+ PYTYPE_ATTRIBUTE = cetree.namespacedNameFromNsName(
+ _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME)
+
+set_pytype_attribute_tag()
+
+
+# namespaces for XML Schema
+cdef object XML_SCHEMA_NS, XML_SCHEMA_NS_UTF8
+XML_SCHEMA_NS, XML_SCHEMA_NS_UTF8 = \
+ _unicodeAndUtf8(u"http://www.w3.org/2001/XMLSchema")
+cdef const_xmlChar* _XML_SCHEMA_NS = _xcstr(XML_SCHEMA_NS_UTF8)
+
+cdef object XML_SCHEMA_INSTANCE_NS, XML_SCHEMA_INSTANCE_NS_UTF8
+XML_SCHEMA_INSTANCE_NS, XML_SCHEMA_INSTANCE_NS_UTF8 = \
+ _unicodeAndUtf8(u"http://www.w3.org/2001/XMLSchema-instance")
+cdef const_xmlChar* _XML_SCHEMA_INSTANCE_NS = _xcstr(XML_SCHEMA_INSTANCE_NS_UTF8)
+
+cdef object XML_SCHEMA_INSTANCE_NIL_ATTR = u"{%s}nil" % XML_SCHEMA_INSTANCE_NS
+cdef object XML_SCHEMA_INSTANCE_TYPE_ATTR = u"{%s}type" % XML_SCHEMA_INSTANCE_NS
+
+
+################################################################################
+# Element class for the main API
+
+cdef class ObjectifiedElement(ElementBase):
+ u"""Main XML Element class.
+
+ Element children are accessed as object attributes. Multiple children
+ with the same name are available through a list index. Example::
+
+ >>> root = XML("<root><c1><c2>0</c2><c2>1</c2></c1></root>")
+ >>> second_c2 = root.c1.c2[1]
+ >>> print(second_c2.text)
+ 1
+
+ Note that you cannot (and must not) instantiate this class or its
+ subclasses.
+ """
+ def __iter__(self):
+ u"""Iterate over self and all siblings with the same tag.
+ """
+ parent = self.getparent()
+ if parent is None:
+ return iter([self])
+ return etree.ElementChildIterator(parent, tag=self.tag)
+
+ def __str__(self):
+ if __RECURSIVE_STR:
+ return _dump(self, 0)
+ else:
+ return textOf(self._c_node) or u''
+
+ # pickle support for objectified Element
+ def __reduce__(self):
+ return fromstring, (etree.tostring(self),)
+
+ @property
+ def text(self):
+ return textOf(self._c_node)
+
+ @property
+ def __dict__(self):
+ """A fake implementation for __dict__ to support dir() etc.
+
+ Note that this only considers the first child with a given name.
+ """
+ cdef _Element child
+ cdef dict children
+ c_ns = tree._getNs(self._c_node)
+ tag = u"{%s}*" % pyunicode(c_ns) if c_ns is not NULL else None
+ children = {}
+ for child in etree.ElementChildIterator(self, tag=tag):
+ if c_ns is NULL and tree._getNs(child._c_node) is not NULL:
+ continue
+ name = pyunicode(child._c_node.name)
+ if name not in children:
+ children[name] = child
+ return children
+
+ def __len__(self):
+ u"""Count self and siblings with the same tag.
+ """
+ return _countSiblings(self._c_node)
+
+ def countchildren(self):
+ u"""countchildren(self)
+
+ Return the number of children of this element, regardless of their
+ name.
+ """
+ # copied from etree
+ cdef Py_ssize_t c
+ cdef tree.xmlNode* c_node
+ c = 0
+ c_node = self._c_node.children
+ while c_node is not NULL:
+ if tree._isElement(c_node):
+ c += 1
+ c_node = c_node.next
+ return c
+
+ def getchildren(self):
+ u"""getchildren(self)
+
+ Returns a sequence of all direct children. The elements are
+ returned in document order.
+ """
+ cdef tree.xmlNode* c_node
+ result = []
+ c_node = self._c_node.children
+ while c_node is not NULL:
+ if tree._isElement(c_node):
+ result.append(cetree.elementFactory(self._doc, c_node))
+ c_node = c_node.next
+ return result
+
+ def __getattr__(self, tag):
+ u"""Return the (first) child with the given tag name. If no namespace
+ is provided, the child will be looked up in the same one as self.
+ """
+ if is_special_method(tag):
+ return object.__getattr__(self, tag)
+ return _lookupChildOrRaise(self, tag)
+
+ def __setattr__(self, tag, value):
+ u"""Set the value of the (first) child with the given tag name. If no
+ namespace is provided, the child will be looked up in the same one as
+ self.
+ """
+ cdef _Element element
+ # properties are looked up /after/ __setattr__, so we must emulate them
+ if tag == u'text' or tag == u'pyval':
+ # read-only !
+ raise TypeError, f"attribute '{tag}' of '{_typename(self)}' objects is not writable"
+ elif tag == u'tail':
+ cetree.setTailText(self._c_node, value)
+ return
+ elif tag == u'tag':
+ ElementBase.tag.__set__(self, value)
+ return
+ elif tag == u'base':
+ ElementBase.base.__set__(self, value)
+ return
+ tag = _buildChildTag(self, tag)
+ element = _lookupChild(self, tag)
+ if element is None:
+ _appendValue(self, tag, value)
+ else:
+ _replaceElement(element, value)
+
+ def __delattr__(self, tag):
+ child = _lookupChildOrRaise(self, tag)
+ self.remove(child)
+
+ def addattr(self, tag, value):
+ u"""addattr(self, tag, value)
+
+ Add a child value to the element.
+
+ As opposed to append(), it sets a data value, not an element.
+ """
+ _appendValue(self, _buildChildTag(self, tag), value)
+
+ def __getitem__(self, key):
+ u"""Return a sibling, counting from the first child of the parent. The
+ method behaves like both a dict and a sequence.
+
+ * If argument is an integer, returns the sibling at that position.
+
+ * If argument is a string, does the same as getattr(). This can be
+ used to provide namespaces for element lookup, or to look up
+ children with special names (``text`` etc.).
+
+ * If argument is a slice object, returns the matching slice.
+ """
+ cdef tree.xmlNode* c_self_node
+ cdef tree.xmlNode* c_parent
+ cdef tree.xmlNode* c_node
+ cdef Py_ssize_t c_index
+ if python._isString(key):
+ return _lookupChildOrRaise(self, key)
+ elif isinstance(key, slice):
+ return list(self)[key]
+ # normal item access
+ c_index = key # raises TypeError if necessary
+ c_self_node = self._c_node
+ c_parent = c_self_node.parent
+ if c_parent is NULL:
+ if c_index == 0 or c_index == -1:
+ return self
+ raise IndexError, unicode(key)
+ if c_index < 0:
+ c_node = c_parent.last
+ else:
+ c_node = c_parent.children
+ c_node = _findFollowingSibling(
+ c_node, tree._getNs(c_self_node), c_self_node.name, c_index)
+ if c_node is NULL:
+ raise IndexError, unicode(key)
+ return elementFactory(self._doc, c_node)
+
+ def __setitem__(self, key, value):
+ u"""Set the value of a sibling, counting from the first child of the
+ parent. Implements key assignment, item assignment and slice
+ assignment.
+
+ * If argument is an integer, sets the sibling at that position.
+
+ * If argument is a string, does the same as setattr(). This is used
+ to provide namespaces for element lookup.
+
+ * If argument is a sequence (list, tuple, etc.), assign the contained
+ items to the siblings.
+ """
+ cdef _Element element
+ cdef tree.xmlNode* c_node
+ if python._isString(key):
+ key = _buildChildTag(self, key)
+ element = _lookupChild(self, key)
+ if element is None:
+ _appendValue(self, key, value)
+ else:
+ _replaceElement(element, value)
+ return
+
+ if self._c_node.parent is NULL:
+ # the 'root[i] = ...' case
+ raise TypeError, u"assignment to root element is invalid"
+
+ if isinstance(key, slice):
+ # slice assignment
+ _setSlice(key, self, value)
+ else:
+ # normal index assignment
+ if key < 0:
+ c_node = self._c_node.parent.last
+ else:
+ c_node = self._c_node.parent.children
+ c_node = _findFollowingSibling(
+ c_node, tree._getNs(self._c_node), self._c_node.name, key)
+ if c_node is NULL:
+ raise IndexError, unicode(key)
+ element = elementFactory(self._doc, c_node)
+ _replaceElement(element, value)
+
+ def __delitem__(self, key):
+ parent = self.getparent()
+ if parent is None:
+ raise TypeError, u"deleting items not supported by root element"
+ if isinstance(key, slice):
+ # slice deletion
+ del_items = list(self)[key]
+ remove = parent.remove
+ for el in del_items:
+ remove(el)
+ else:
+ # normal index deletion
+ sibling = self.__getitem__(key)
+ parent.remove(sibling)
+
+ def descendantpaths(self, prefix=None):
+ u"""descendantpaths(self, prefix=None)
+
+ Returns a list of object path expressions for all descendants.
+ """
+ if prefix is not None and not python._isString(prefix):
+ prefix = u'.'.join(prefix)
+ return _build_descendant_paths(self._c_node, prefix)
+
+
+cdef inline bint _tagMatches(tree.xmlNode* c_node, const_xmlChar* c_href, const_xmlChar* c_name):
+ if c_node.name != c_name:
+ return 0
+ if c_href == NULL:
+ return 1
+ c_node_href = tree._getNs(c_node)
+ if c_node_href == NULL:
+ return c_href[0] == c'\0'
+ return tree.xmlStrcmp(c_node_href, c_href) == 0
+
+
+cdef Py_ssize_t _countSiblings(tree.xmlNode* c_start_node):
+ cdef tree.xmlNode* c_node
+ cdef Py_ssize_t count
+ c_tag = c_start_node.name
+ c_href = tree._getNs(c_start_node)
+ count = 1
+ c_node = c_start_node.next
+ while c_node is not NULL:
+ if c_node.type == tree.XML_ELEMENT_NODE and \
+ _tagMatches(c_node, c_href, c_tag):
+ count += 1
+ c_node = c_node.next
+ c_node = c_start_node.prev
+ while c_node is not NULL:
+ if c_node.type == tree.XML_ELEMENT_NODE and \
+ _tagMatches(c_node, c_href, c_tag):
+ count += 1
+ c_node = c_node.prev
+ return count
+
+cdef tree.xmlNode* _findFollowingSibling(tree.xmlNode* c_node,
+ const_xmlChar* href, const_xmlChar* name,
+ Py_ssize_t index):
+ cdef tree.xmlNode* (*next)(tree.xmlNode*)
+ if index >= 0:
+ next = cetree.nextElement
+ else:
+ index = -1 - index
+ next = cetree.previousElement
+ while c_node is not NULL:
+ if c_node.type == tree.XML_ELEMENT_NODE and \
+ _tagMatches(c_node, href, name):
+ index = index - 1
+ if index < 0:
+ return c_node
+ c_node = next(c_node)
+ return NULL
+
+cdef object _lookupChild(_Element parent, tag):
+ cdef tree.xmlNode* c_result
+ cdef tree.xmlNode* c_node
+ c_node = parent._c_node
+ ns, tag = cetree.getNsTagWithEmptyNs(tag)
+ c_tag = tree.xmlDictExists(
+ c_node.doc.dict, _xcstr(tag), python.PyBytes_GET_SIZE(tag))
+ if c_tag is NULL:
+ return None # not in the hash map => not in the tree
+ if ns is None:
+ # either inherit ns from parent or use empty (i.e. no) namespace
+ c_href = tree._getNs(c_node) or <const_xmlChar*>''
+ else:
+ c_href = _xcstr(ns)
+ c_result = _findFollowingSibling(c_node.children, c_href, c_tag, 0)
+ if c_result is NULL:
+ return None
+ return elementFactory(parent._doc, c_result)
+
+cdef object _lookupChildOrRaise(_Element parent, tag):
+ element = _lookupChild(parent, tag)
+ if element is None:
+ raise AttributeError, u"no such child: " + _buildChildTag(parent, tag)
+ return element
+
+cdef object _buildChildTag(_Element parent, tag):
+ ns, tag = cetree.getNsTag(tag)
+ c_tag = _xcstr(tag)
+ c_href = tree._getNs(parent._c_node) if ns is None else _xcstr(ns)
+ return cetree.namespacedNameFromNsName(c_href, c_tag)
+
+cdef _replaceElement(_Element element, value):
+ cdef _Element new_element
+ if isinstance(value, _Element):
+ # deep copy the new element
+ new_element = cetree.deepcopyNodeToDocument(
+ element._doc, (<_Element>value)._c_node)
+ new_element.tag = element.tag
+ elif isinstance(value, (list, tuple)):
+ element[:] = value
+ return
+ else:
+ new_element = element.makeelement(element.tag)
+ _setElementValue(new_element, value)
+ element.getparent().replace(element, new_element)
+
+cdef _appendValue(_Element parent, tag, value):
+ cdef _Element new_element
+ if isinstance(value, _Element):
+ # deep copy the new element
+ new_element = cetree.deepcopyNodeToDocument(
+ parent._doc, (<_Element>value)._c_node)
+ new_element.tag = tag
+ cetree.appendChildToElement(parent, new_element)
+ elif isinstance(value, (list, tuple)):
+ for item in value:
+ _appendValue(parent, tag, item)
+ else:
+ new_element = cetree.makeElement(
+ tag, parent._doc, None, None, None, None, None)
+ _setElementValue(new_element, value)
+ cetree.appendChildToElement(parent, new_element)
+
+cdef _setElementValue(_Element element, value):
+ if value is None:
+ cetree.setAttributeValue(
+ element, XML_SCHEMA_INSTANCE_NIL_ATTR, u"true")
+ elif isinstance(value, _Element):
+ _replaceElement(element, value)
+ return
+ else:
+ cetree.delAttributeFromNsName(
+ element._c_node, _XML_SCHEMA_INSTANCE_NS, <unsigned char*>"nil")
+ if python._isString(value):
+ pytype_name = u"str"
+ py_type = <PyType>_PYTYPE_DICT.get(pytype_name)
+ else:
+ pytype_name = _typename(value)
+ py_type = <PyType>_PYTYPE_DICT.get(pytype_name)
+ if py_type is not None:
+ value = py_type.stringify(value)
+ else:
+ value = unicode(value)
+ if py_type is not None:
+ cetree.setAttributeValue(element, PYTYPE_ATTRIBUTE, pytype_name)
+ else:
+ cetree.delAttributeFromNsName(
+ element._c_node, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME)
+ cetree.setNodeText(element._c_node, value)
+
+cdef _setSlice(sliceobject, _Element target, items):
+ cdef _Element parent
+ cdef tree.xmlNode* c_node
+ cdef Py_ssize_t c_step, c_start, pos
+ # collect existing slice
+ if (<slice>sliceobject).step is None:
+ c_step = 1
+ else:
+ c_step = (<slice>sliceobject).step
+ if c_step == 0:
+ raise ValueError, u"Invalid slice"
+ cdef list del_items = target[sliceobject]
+
+ # collect new values
+ new_items = []
+ tag = target.tag
+ for item in items:
+ if isinstance(item, _Element):
+ # deep copy the new element
+ new_element = cetree.deepcopyNodeToDocument(
+ target._doc, (<_Element>item)._c_node)
+ new_element.tag = tag
+ else:
+ new_element = cetree.makeElement(
+ tag, target._doc, None, None, None, None, None)
+ _setElementValue(new_element, item)
+ new_items.append(new_element)
+
+ # sanity check - raise what a list would raise
+ if c_step != 1 and len(del_items) != len(new_items):
+ raise ValueError, \
+ f"attempt to assign sequence of size {len(new_items)} to extended slice of size {len(del_items)}"
+
+ # replace existing items
+ pos = 0
+ parent = target.getparent()
+ replace = parent.replace
+ while pos < len(new_items) and pos < len(del_items):
+ replace(del_items[pos], new_items[pos])
+ pos += 1
+ # remove leftover items
+ if pos < len(del_items):
+ remove = parent.remove
+ while pos < len(del_items):
+ remove(del_items[pos])
+ pos += 1
+ # append remaining new items
+ if pos < len(new_items):
+ # the sanity check above guarantees (step == 1)
+ if pos > 0:
+ item = new_items[pos-1]
+ else:
+ if (<slice>sliceobject).start > 0:
+ c_node = parent._c_node.children
+ else:
+ c_node = parent._c_node.last
+ c_node = _findFollowingSibling(
+ c_node, tree._getNs(target._c_node), target._c_node.name,
+ (<slice>sliceobject).start - 1)
+ if c_node is NULL:
+ while pos < len(new_items):
+ cetree.appendChildToElement(parent, new_items[pos])
+ pos += 1
+ return
+ item = cetree.elementFactory(parent._doc, c_node)
+ while pos < len(new_items):
+ add = item.addnext
+ item = new_items[pos]
+ add(item)
+ pos += 1
+
+################################################################################
+# Data type support in subclasses
+
+cdef class ObjectifiedDataElement(ObjectifiedElement):
+ u"""This is the base class for all data type Elements. Subclasses should
+ override the 'pyval' property and possibly the __str__ method.
+ """
+ @property
+ def pyval(self):
+ return textOf(self._c_node)
+
+ def __str__(self):
+ return textOf(self._c_node) or ''
+
+ def __repr__(self):
+ return strrepr(textOf(self._c_node) or '')
+
+ def _setText(self, s):
+ u"""For use in subclasses only. Don't use unless you know what you are
+ doing.
+ """
+ cetree.setNodeText(self._c_node, s)
+
+cdef class NumberElement(ObjectifiedDataElement):
+ cdef object _parse_value
+ def _setValueParser(self, function):
+ u"""Set the function that parses the Python value from a string.
+
+ Do not use this unless you know what you are doing.
+ """
+ self._parse_value = function
+
+ @property
+ def pyval(self):
+ return _parseNumber(self)
+
+ def __int__(self):
+ return int(_parseNumber(self))
+
+ def __long__(self):
+ return long(_parseNumber(self))
+
+ def __float__(self):
+ return float(_parseNumber(self))
+
+ def __complex__(self):
+ return complex(_parseNumber(self))
+
+ def __str__(self):
+ return unicode(_parseNumber(self))
+
+ def __repr__(self):
+ return repr(_parseNumber(self))
+
+ def __oct__(self):
+ return oct(_parseNumber(self))
+
+ def __hex__(self):
+ return hex(_parseNumber(self))
+
+ def __richcmp__(self, other, int op):
+ return _richcmpPyvals(self, other, op)
+
+ def __hash__(self):
+ return hash(_parseNumber(self))
+
+ def __add__(self, other):
+ return _numericValueOf(self) + _numericValueOf(other)
+
+ def __sub__(self, other):
+ return _numericValueOf(self) - _numericValueOf(other)
+
+ def __mul__(self, other):
+ return _numericValueOf(self) * _numericValueOf(other)
+
+ def __div__(self, other):
+ return _numericValueOf(self) / _numericValueOf(other)
+
+ def __truediv__(self, other):
+ return _numericValueOf(self) / _numericValueOf(other)
+
+ def __mod__(self, other):
+ return _numericValueOf(self) % _numericValueOf(other)
+
+ def __pow__(self, other, modulo):
+ if modulo is None:
+ return _numericValueOf(self) ** _numericValueOf(other)
+ else:
+ return pow(_numericValueOf(self), _numericValueOf(other), modulo)
+
+ def __neg__(self):
+ return - _numericValueOf(self)
+
+ def __pos__(self):
+ return + _numericValueOf(self)
+
+ def __abs__(self):
+ return abs( _numericValueOf(self) )
+
+ def __nonzero__(self):
+ return bool(_numericValueOf(self))
+
+ def __invert__(self):
+ return ~ _numericValueOf(self)
+
+ def __lshift__(self, other):
+ return _numericValueOf(self) << _numericValueOf(other)
+
+ def __rshift__(self, other):
+ return _numericValueOf(self) >> _numericValueOf(other)
+
+ def __and__(self, other):
+ return _numericValueOf(self) & _numericValueOf(other)
+
+ def __or__(self, other):
+ return _numericValueOf(self) | _numericValueOf(other)
+
+ def __xor__(self, other):
+ return _numericValueOf(self) ^ _numericValueOf(other)
+
+cdef class IntElement(NumberElement):
+ def _init(self):
+ self._parse_value = int
+
+ def __index__(self):
+ return int(_parseNumber(self))
+
+cdef class LongElement(NumberElement):
+ def _init(self):
+ self._parse_value = long
+
+ def __index__(self):
+ return int(_parseNumber(self))
+
+cdef class FloatElement(NumberElement):
+ def _init(self):
+ self._parse_value = float
+
+cdef class StringElement(ObjectifiedDataElement):
+ u"""String data class.
+
+ Note that this class does *not* support the sequence protocol of strings:
+ len(), iter(), str_attr[0], str_attr[0:1], etc. are *not* supported.
+ Instead, use the .text attribute to get a 'real' string.
+ """
+ @property
+ def pyval(self):
+ return textOf(self._c_node) or u''
+
+ def __repr__(self):
+ return repr(textOf(self._c_node) or u'')
+
+ def strlen(self):
+ text = textOf(self._c_node)
+ if text is None:
+ return 0
+ else:
+ return len(text)
+
+ def __nonzero__(self):
+ return bool(textOf(self._c_node))
+
+ def __richcmp__(self, other, int op):
+ return _richcmpPyvals(self, other, op)
+
+ def __hash__(self):
+ return hash(textOf(self._c_node) or u'')
+
+ def __add__(self, other):
+ text = _strValueOf(self)
+ other = _strValueOf(other)
+ if text is None:
+ return other
+ if other is None:
+ return text
+ return text + other
+
+ def __mul__(self, other):
+ if isinstance(self, StringElement):
+ return textOf((<StringElement>self)._c_node) * _numericValueOf(other)
+ elif isinstance(other, StringElement):
+ return _numericValueOf(self) * textOf((<StringElement>other)._c_node)
+ else:
+ raise TypeError, u"invalid types for * operator"
+
+ def __mod__(self, other):
+ return _strValueOf(self) % other
+
+ def __int__(self):
+ return int(textOf(self._c_node))
+
+ def __long__(self):
+ return long(textOf(self._c_node))
+
+ def __float__(self):
+ return float(textOf(self._c_node))
+
+ def __complex__(self):
+ return complex(textOf(self._c_node))
+
+cdef class NoneElement(ObjectifiedDataElement):
+ def __str__(self):
+ return u"None"
+
+ def __repr__(self):
+ return "None"
+
+ def __nonzero__(self):
+ return False
+
+ def __richcmp__(self, other, int op):
+ if other is None or self is None:
+ return python.PyObject_RichCompare(None, None, op)
+ if isinstance(self, NoneElement):
+ return python.PyObject_RichCompare(None, other, op)
+ else:
+ return python.PyObject_RichCompare(self, None, op)
+
+ def __hash__(self):
+ return hash(None)
+
+ @property
+ def pyval(self):
+ return None
+
+
+cdef class BoolElement(IntElement):
+ u"""Boolean type base on string values: 'true' or 'false'.
+
+ Note that this inherits from IntElement to mimic the behaviour of
+ Python's bool type.
+ """
+ def _init(self):
+ self._parse_value = __parseBool
+
+ def __nonzero__(self):
+ return __parseBool(textOf(self._c_node))
+
+ def __richcmp__(self, other, int op):
+ return _richcmpPyvals(self, other, op)
+
+ def __hash__(self):
+ return hash(__parseBool(textOf(self._c_node)))
+
+ def __str__(self):
+ return unicode(__parseBool(textOf(self._c_node)))
+
+ def __repr__(self):
+ return repr(__parseBool(textOf(self._c_node)))
+
+ @property
+ def pyval(self):
+ return __parseBool(textOf(self._c_node))
+
+def __checkBool(s):
+ cdef int value = -1
+ if s is not None:
+ value = __parseBoolAsInt(s)
+ if value == -1:
+ raise ValueError
+
+cpdef bint __parseBool(s) except -1:
+ cdef int value
+ if s is None:
+ return False
+ value = __parseBoolAsInt(s)
+ if value == -1:
+ raise ValueError, f"Invalid boolean value: '{s}'"
+ return value
+
+cdef inline int __parseBoolAsInt(text) except -2:
+ if text == 'false':
+ return 0
+ elif text == 'true':
+ return 1
+ elif text == '0':
+ return 0
+ elif text == '1':
+ return 1
+ return -1
+
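+# Usage sketch (comment only): the boolean parser above accepts 'true',
+# 'false', '1' and '0'.  Note that bare '1'/'0' text is normally guessed as an
+# int first during type guessing, so only 'true'/'false' end up as BoolElement
+# there.  The snippet is illustrative; fromstring() is defined further below.
+#
+#   >>> root = fromstring('<r><flag>true</flag></r>')
+#   >>> root.flag.pyval, bool(root.flag)
+#   (True, True)
+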
+cdef object _parseNumber(NumberElement element):
+ return element._parse_value(textOf(element._c_node))
+
+cdef object _strValueOf(obj):
+ if python._isString(obj):
+ return obj
+ if isinstance(obj, _Element):
+ return textOf((<_Element>obj)._c_node) or u''
+ if obj is None:
+ return u''
+ return unicode(obj)
+
+cdef object _numericValueOf(obj):
+ if isinstance(obj, NumberElement):
+ return _parseNumber(<NumberElement>obj)
+ try:
+ # not always numeric, but Python will raise the right exception
+ return obj.pyval
+ except AttributeError:
+ pass
+ return obj
+
+cdef _richcmpPyvals(left, right, int op):
+ left = getattr(left, 'pyval', left)
+ right = getattr(right, 'pyval', right)
+ return python.PyObject_RichCompare(left, right, op)
+
+
+################################################################################
+# Python type registry
+
+cdef class PyType:
+ u"""PyType(self, name, type_check, type_class, stringify=None)
+ User defined type.
+
+ Named type that contains a type check function, a type class that
+ inherits from ObjectifiedDataElement and an optional "stringification"
+ function. The type check must take a string as argument and raise
+ ValueError or TypeError if it cannot handle the string value. It may be
+ None in which case it is not considered for type guessing. For registered
+ named types, the 'stringify' function (or unicode() if None) is used to
+ convert a Python object with type name 'name' to the string representation
+ stored in the XML tree.
+
+ Example::
+
+ PyType('int', int, MyIntClass).register()
+
+ Note that the order in which types are registered matters. The first
+ matching type will be used.
+ """
+ cdef readonly object name
+ cdef readonly object type_check
+ cdef readonly object stringify
+ cdef object _type
+ cdef list _schema_types
+ def __init__(self, name, type_check, type_class, stringify=None):
+ if isinstance(name, bytes):
+ name = (<bytes>name).decode('ascii')
+ elif not isinstance(name, unicode):
+ raise TypeError, u"Type name must be a string"
+ if type_check is not None and not callable(type_check):
+ raise TypeError, u"Type check function must be callable (or None)"
+ if name != TREE_PYTYPE_NAME and \
+ not issubclass(type_class, ObjectifiedDataElement):
+ raise TypeError, \
+ u"Data classes must inherit from ObjectifiedDataElement"
+ self.name = name
+ self._type = type_class
+ self.type_check = type_check
+ if stringify is None:
+ stringify = unicode
+ self.stringify = stringify
+ self._schema_types = []
+
+ def __repr__(self):
+ return "PyType(%s, %s)" % (self.name, self._type.__name__)
+
+ def register(self, before=None, after=None):
+ u"""register(self, before=None, after=None)
+
+ Register the type.
+
+ The additional keyword arguments 'before' and 'after' accept a
+ sequence of type names that must appear before/after the new type in
+ the type list. If any of them is not currently known, it is simply
+ ignored. Raises ValueError if the dependencies cannot be fulfilled.
+ """
+ if self.name == TREE_PYTYPE_NAME:
+ raise ValueError, u"Cannot register tree type"
+ if self.type_check is not None:
+ for item in _TYPE_CHECKS:
+ if item[0] is self.type_check:
+ _TYPE_CHECKS.remove(item)
+ break
+ entry = (self.type_check, self)
+ first_pos = 0
+ last_pos = -1
+ if before or after:
+ if before is None:
+ before = ()
+ elif after is None:
+ after = ()
+ for i, (check, pytype) in enumerate(_TYPE_CHECKS):
+ if last_pos == -1 and pytype.name in before:
+ last_pos = i
+ if pytype.name in after:
+ first_pos = i+1
+ if last_pos == -1:
+ _TYPE_CHECKS.append(entry)
+ elif first_pos > last_pos:
+ raise ValueError, u"inconsistent before/after dependencies"
+ else:
+ _TYPE_CHECKS.insert(last_pos, entry)
+
+ _PYTYPE_DICT[self.name] = self
+ for xs_type in self._schema_types:
+ _SCHEMA_TYPE_DICT[xs_type] = self
+
+ def unregister(self):
+ u"unregister(self)"
+ if _PYTYPE_DICT.get(self.name) is self:
+ del _PYTYPE_DICT[self.name]
+ for xs_type, pytype in list(_SCHEMA_TYPE_DICT.items()):
+ if pytype is self:
+ del _SCHEMA_TYPE_DICT[xs_type]
+ if self.type_check is None:
+ return
+ try:
+ _TYPE_CHECKS.remove( (self.type_check, self) )
+ except ValueError:
+ pass
+
+ property xmlSchemaTypes:
+ u"""The list of XML Schema datatypes this Python type maps to.
+
+ Note that this must be set before registering the type!
+ """
+ def __get__(self):
+ return self._schema_types
+ def __set__(self, types):
+ self._schema_types = list(map(unicode, types))
+
+
+cdef dict _PYTYPE_DICT = {}
+cdef dict _SCHEMA_TYPE_DICT = {}
+cdef list _TYPE_CHECKS = []
+
+def __lower_bool(b):
+ return u"true" if b else u"false"
+
+cdef _pytypename(obj):
+ return u"str" if python._isString(obj) else _typename(obj)
+
+def pytypename(obj):
+ u"""pytypename(obj)
+
+ Find the name of the corresponding PyType for a Python object.
+ """
+ return _pytypename(obj)
+
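+# Usage sketch (comment only): pytypename() maps plain Python values to the
+# PyType names used for annotation.
+#
+#   >>> pytypename(42)
+#   'int'
+#   >>> pytypename('abc')
+#   'str'
+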
+cdef _registerPyTypes():
+ pytype = PyType(u'int', int, IntElement)
+ pytype.xmlSchemaTypes = (u"integer", u"int", u"short", u"byte", u"unsignedShort",
+ u"unsignedByte", u"nonPositiveInteger",
+ u"negativeInteger", u"long", u"nonNegativeInteger",
+ u"unsignedLong", u"unsignedInt", u"positiveInteger",)
+ pytype.register()
+
+ # 'long' type just for backwards compatibility
+ pytype = PyType(u'long', None, IntElement)
+ pytype.register()
+
+ pytype = PyType(u'float', float, FloatElement, repr)
+ pytype.xmlSchemaTypes = (u"double", u"float")
+ pytype.register()
+
+ pytype = PyType(u'bool', __checkBool, BoolElement, __lower_bool)
+ pytype.xmlSchemaTypes = (u"boolean",)
+ pytype.register()
+
+ pytype = PyType(u'str', None, StringElement)
+ pytype.xmlSchemaTypes = (u"string", u"normalizedString", u"token", u"language",
+ u"Name", u"NCName", u"ID", u"IDREF", u"ENTITY",
+ u"NMTOKEN", )
+ pytype.register()
+
+ # since lxml 2.0
+ pytype = PyType(u'NoneType', None, NoneElement)
+ pytype.register()
+
+ # backwards compatibility
+ pytype = PyType(u'none', None, NoneElement)
+ pytype.register()
+
+# non-registered PyType for inner tree elements
+cdef PyType TREE_PYTYPE = PyType(TREE_PYTYPE_NAME, None, ObjectifiedElement)
+
+_registerPyTypes()
+
+def getRegisteredTypes():
+ u"""getRegisteredTypes()
+
+ Returns a list of the currently registered PyType objects.
+
+ To add a new type, retrieve this list and call unregister() for all
+ entries. Then add the new type at a suitable position (possibly replacing
+ an existing one) and call register() for all entries.
+
+ This is necessary if the new type interferes with the type check functions
+ of existing ones (normally only int/float/bool) and must be tried before
+ other types. To add a type that is not yet parsable by the current type
+ check functions, you can simply register() it, which will append it to the
+ end of the type list.
+ """
+ cdef list types = []
+ cdef set known = set()
+ for check, pytype in _TYPE_CHECKS:
+ name = pytype.name
+ if name not in known:
+ known.add(name)
+ types.append(pytype)
+ for pytype in _PYTYPE_DICT.values():
+ name = pytype.name
+ if name not in known:
+ known.add(name)
+ types.append(pytype)
+ return types
+
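+# Usage sketch (comment only): registering a user-defined PyType, loosely
+# following the tutorial example from the lxml documentation.  The class and
+# check function names are made up for illustration; any
+# ObjectifiedDataElement subclass and any callable that raises ValueError or
+# TypeError on mismatch would do.
+#
+#   >>> class ChristmasDate(ObjectifiedDataElement):
+#   ...     def call_santa(self):
+#   ...         print("Ho ho ho!")
+#   >>> def check_christmas_date(s):
+#   ...     if not s.startswith('24.12.'):
+#   ...         raise ValueError(s)
+#   >>> PyType('date', check_christmas_date, ChristmasDate).register(before=['int'])
+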
+cdef PyType _guessPyType(value, PyType defaulttype):
+ if value is None:
+ return None
+ for type_check, tested_pytype in _TYPE_CHECKS:
+ try:
+ type_check(value)
+ return <PyType>tested_pytype
+ except IGNORABLE_ERRORS:
+ # could not be parsed as the specified type => ignore
+ pass
+ return defaulttype
+
+cdef object _guessElementClass(tree.xmlNode* c_node):
+ value = textOf(c_node)
+ if value is None:
+ return None
+ if value == '':
+ return StringElement
+
+ for type_check, pytype in _TYPE_CHECKS:
+ try:
+ type_check(value)
+ return (<PyType>pytype)._type
+ except IGNORABLE_ERRORS:
+ pass
+ return None
+
+################################################################################
+# adapted ElementMaker supports registered PyTypes
+
+@cython.final
+@cython.internal
+cdef class _ObjectifyElementMakerCaller:
+ cdef object _tag
+ cdef object _nsmap
+ cdef object _element_factory
+ cdef bint _annotate
+
+ def __call__(self, *children, **attrib):
+ u"__call__(self, *children, **attrib)"
+ cdef _ObjectifyElementMakerCaller elementMaker
+ cdef _Element element
+ cdef _Element childElement
+ cdef bint has_children
+ cdef bint has_string_value
+ if self._element_factory is None:
+ element = _makeElement(self._tag, None, attrib, self._nsmap)
+ else:
+ element = self._element_factory(self._tag, attrib, self._nsmap)
+
+ pytype_name = None
+ has_children = False
+ has_string_value = False
+ for child in children:
+ if child is None:
+ if len(children) == 1:
+ cetree.setAttributeValue(
+ element, XML_SCHEMA_INSTANCE_NIL_ATTR, u"true")
+ elif python._isString(child):
+ _add_text(element, child)
+ has_string_value = True
+ elif isinstance(child, _Element):
+ cetree.appendChildToElement(element, <_Element>child)
+ has_children = True
+ elif isinstance(child, _ObjectifyElementMakerCaller):
+ elementMaker = <_ObjectifyElementMakerCaller>child
+ if elementMaker._element_factory is None:
+ cetree.makeSubElement(element, elementMaker._tag,
+ None, None, None, None)
+ else:
+ childElement = elementMaker._element_factory(
+ elementMaker._tag)
+ cetree.appendChildToElement(element, childElement)
+ has_children = True
+ elif isinstance(child, dict):
+ for name, value in child.items():
+ # keyword arguments in attrib take precedence
+ if name in attrib:
+ continue
+ pytype = _PYTYPE_DICT.get(_typename(value))
+ if pytype is not None:
+ value = (<PyType>pytype).stringify(value)
+ elif not python._isString(value):
+ value = unicode(value)
+ cetree.setAttributeValue(element, name, value)
+ else:
+ if pytype_name is not None:
+ # concatenation always makes the result a string
+ has_string_value = True
+ pytype_name = _typename(child)
+ pytype = _PYTYPE_DICT.get(_typename(child))
+ if pytype is not None:
+ _add_text(element, (<PyType>pytype).stringify(child))
+ else:
+ has_string_value = True
+ child = unicode(child)
+ _add_text(element, child)
+
+ if self._annotate and not has_children:
+ if has_string_value:
+ cetree.setAttributeValue(element, PYTYPE_ATTRIBUTE, u"str")
+ elif pytype_name is not None:
+ cetree.setAttributeValue(element, PYTYPE_ATTRIBUTE, pytype_name)
+
+ return element
+
+cdef _add_text(_Element elem, text):
+ # add text to the tree in construction, either as element text or
+ # tail text, depending on the current tree state
+ cdef tree.xmlNode* c_child
+ c_child = cetree.findChildBackwards(elem._c_node, 0)
+ if c_child is not NULL:
+ old = cetree.tailOf(c_child)
+ if old is not None:
+ text = old + text
+ cetree.setTailText(c_child, text)
+ else:
+ old = cetree.textOf(elem._c_node)
+ if old is not None:
+ text = old + text
+ cetree.setNodeText(elem._c_node, text)
+
+cdef class ElementMaker:
+ u"""ElementMaker(self, namespace=None, nsmap=None, annotate=True, makeelement=None)
+
+ An ElementMaker that can be used for constructing trees.
+
+ Example::
+
+ >>> M = ElementMaker(annotate=False)
+ >>> attributes = {'class': 'par'}
+ >>> html = M.html( M.body( M.p('hello', attributes, M.br, 'objectify', style="font-weight: bold") ) )
+
+ >>> from lxml.etree import tostring
+ >>> print(tostring(html, method='html').decode('ascii'))
+ <html><body><p style="font-weight: bold" class="par">hello<br>objectify</p></body></html>
+
+ To create tags that are not valid Python identifiers, call the factory
+ directly and pass the tag name as first argument::
+
+ >>> root = M('tricky-tag', 'some text')
+ >>> print(root.tag)
+ tricky-tag
+ >>> print(root.text)
+ some text
+
+ Note that this module has a predefined ElementMaker instance called ``E``.
+ """
+ cdef object _makeelement
+ cdef object _namespace
+ cdef object _nsmap
+ cdef bint _annotate
+ cdef dict _cache
+ def __init__(self, *, namespace=None, nsmap=None, annotate=True,
+ makeelement=None):
+ if nsmap is None:
+ nsmap = _DEFAULT_NSMAP if annotate else {}
+ self._nsmap = nsmap
+ self._namespace = None if namespace is None else u"{%s}" % namespace
+ self._annotate = annotate
+ if makeelement is not None:
+ if not callable(makeelement):
+ raise TypeError(
+ f"argument of 'makeelement' parameter must be callable, got {type(makeelement)}")
+ self._makeelement = makeelement
+ else:
+ self._makeelement = None
+ self._cache = {}
+
+ @cython.final
+ cdef _build_element_maker(self, tag, bint caching):
+ cdef _ObjectifyElementMakerCaller element_maker
+ element_maker = _ObjectifyElementMakerCaller.__new__(_ObjectifyElementMakerCaller)
+ if self._namespace is not None and tag[0] != u"{":
+ element_maker._tag = self._namespace + tag
+ else:
+ element_maker._tag = tag
+ element_maker._nsmap = self._nsmap
+ element_maker._annotate = self._annotate
+ element_maker._element_factory = self._makeelement
+ if caching:
+ if len(self._cache) > 200:
+ self._cache.clear()
+ self._cache[tag] = element_maker
+ return element_maker
+
+ def __getattr__(self, tag):
+ element_maker = self._cache.get(tag)
+ if element_maker is None:
+ if is_special_method(tag):
+ return object.__getattr__(self, tag)
+ return self._build_element_maker(tag, caching=True)
+ return element_maker
+
+ def __call__(self, tag, *args, **kwargs):
+ element_maker = self._cache.get(tag)
+ if element_maker is None:
+ element_maker = self._build_element_maker(
+ tag, caching=not is_special_method(tag))
+ return element_maker(*args, **kwargs)
+
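+# Usage sketch (comment only): the module-level ``E`` instance created further
+# below annotates values while building a tree, so data children come back as
+# typed elements.  The tag names are arbitrary examples.
+#
+#   >>> root = E.root(E.n(42), E.s('abc'))
+#   >>> root.n.pyval, root.s.pyval
+#   (42, 'abc')
+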
+################################################################################
+# Recursive element dumping
+
+cdef bint __RECURSIVE_STR = 0 # default: off
+
+def enable_recursive_str(on=True):
+ u"""enable_recursive_str(on=True)
+
+ Enable a recursively generated tree representation for str(element),
+ based on objectify.dump(element).
+ """
+ global __RECURSIVE_STR
+ __RECURSIVE_STR = on
+
+def dump(_Element element not None):
+ u"""dump(_Element element not None)
+
+ Return a recursively generated string representation of an element.
+ """
+ return _dump(element, 0)
+
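+# Usage sketch (comment only): for a small document, dump() produces output
+# roughly along these lines (exact spacing aside).
+#
+#   >>> root = fromstring('<root><a>5</a></root>')
+#   >>> print(dump(root))
+#   root = None [ObjectifiedElement]
+#       a = 5 [IntElement]
+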
+cdef object _dump(_Element element, int indent):
+ indentstr = u" " * indent
+ if isinstance(element, ObjectifiedDataElement):
+ value = repr(element)
+ else:
+ value = textOf(element._c_node)
+ if value is not None:
+ if not value.strip():
+ value = None
+ else:
+ value = repr(value)
+ result = f"{indentstr}{element.tag} = {value} [{_typename(element)}]\n"
+ xsi_ns = u"{%s}" % XML_SCHEMA_INSTANCE_NS
+ pytype_ns = u"{%s}" % PYTYPE_NAMESPACE
+ for name, value in sorted(cetree.iterattributes(element, 3)):
+ if u'{' in name:
+ if name == PYTYPE_ATTRIBUTE:
+ if value == TREE_PYTYPE_NAME:
+ continue
+ else:
+ name = name.replace(pytype_ns, u'py:')
+ name = name.replace(xsi_ns, u'xsi:')
+ result += f"{indentstr} * {name} = {value!r}\n"
+
+ indent += 1
+ for child in element.iterchildren():
+ result += _dump(child, indent)
+ if indent == 1:
+ return result[:-1] # strip last '\n'
+ else:
+ return result
+
+
+################################################################################
+# Pickle support for objectified ElementTree
+
+def __unpickleElementTree(data):
+ return etree.ElementTree(fromstring(data))
+
+cdef _setupPickle(elementTreeReduceFunction):
+ if python.IS_PYTHON2:
+ import copy_reg as copyreg
+ else:
+ import copyreg
+ copyreg.pickle(etree._ElementTree,
+ elementTreeReduceFunction, __unpickleElementTree)
+
+def pickleReduceElementTree(obj):
+ return __unpickleElementTree, (etree.tostring(obj),)
+
+_setupPickle(pickleReduceElementTree)
+del pickleReduceElementTree
+
+################################################################################
+# Element class lookup
+
+cdef class ObjectifyElementClassLookup(ElementClassLookup):
+ u"""ObjectifyElementClassLookup(self, tree_class=None, empty_data_class=None)
+ Element class lookup method that uses the objectify classes.
+ """
+ cdef object empty_data_class
+ cdef object tree_class
+ def __init__(self, tree_class=None, empty_data_class=None):
+ u"""Lookup mechanism for objectify.
+
+ The default Element classes can be replaced by passing subclasses of
+ ObjectifiedElement and ObjectifiedDataElement as keyword arguments.
+ 'tree_class' defines inner tree classes (defaults to
+ ObjectifiedElement), 'empty_data_class' defines the default class for
+ empty data elements (defaults to StringElement).
+ """
+ self._lookup_function = _lookupElementClass
+ if tree_class is None:
+ tree_class = ObjectifiedElement
+ self.tree_class = tree_class
+ if empty_data_class is None:
+ empty_data_class = StringElement
+ self.empty_data_class = empty_data_class
+
+cdef object _lookupElementClass(state, _Document doc, tree.xmlNode* c_node):
+ cdef ObjectifyElementClassLookup lookup
+ lookup = <ObjectifyElementClassLookup>state
+ # if element has children => no data class
+ if cetree.hasChild(c_node):
+ return lookup.tree_class
+
+ # if element is defined as xsi:nil, return NoneElement class
+ if u"true" == cetree.attributeValueFromNsName(
+ c_node, _XML_SCHEMA_INSTANCE_NS, <unsigned char*>"nil"):
+ return NoneElement
+
+ # check for Python type hint
+ value = cetree.attributeValueFromNsName(
+ c_node, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME)
+ if value is not None:
+ if value == TREE_PYTYPE_NAME:
+ return lookup.tree_class
+ py_type = <PyType>_PYTYPE_DICT.get(value)
+ if py_type is not None:
+ return py_type._type
+ # unknown 'pyval' => try to figure it out ourselves, just go on
+
+ # check for XML Schema type hint
+ value = cetree.attributeValueFromNsName(
+ c_node, _XML_SCHEMA_INSTANCE_NS, <unsigned char*>"type")
+
+ if value is not None:
+ schema_type = <PyType>_SCHEMA_TYPE_DICT.get(value)
+ if schema_type is None and u':' in value:
+ prefix, value = value.split(u':', 1)
+ schema_type = <PyType>_SCHEMA_TYPE_DICT.get(value)
+ if schema_type is not None:
+ return schema_type._type
+
+ # otherwise determine class based on text content type
+ el_class = _guessElementClass(c_node)
+ if el_class is not None:
+ return el_class
+
+ # if element is a root node => default to tree node
+ if c_node.parent is NULL or not tree._isElement(c_node.parent):
+ return lookup.tree_class
+
+ return lookup.empty_data_class
+
+
+################################################################################
+# Type annotations
+
+cdef PyType _check_type(tree.xmlNode* c_node, PyType pytype):
+ if pytype is None:
+ return None
+ value = textOf(c_node)
+ try:
+ pytype.type_check(value)
+ return pytype
+ except IGNORABLE_ERRORS:
+ # could not be parsed as the specified type => ignore
+ pass
+ return None
+
+def pyannotate(element_or_tree, *, ignore_old=False, ignore_xsi=False,
+ empty_pytype=None):
+ u"""pyannotate(element_or_tree, ignore_old=False, ignore_xsi=False, empty_pytype=None)
+
+ Recursively annotates the elements of an XML tree with 'pytype'
+ attributes.
+
+ If the 'ignore_old' keyword argument is True, existing 'pytype' attributes
+ will be ignored and replaced. Otherwise (the default), they will be checked
+ and only replaced if they no longer fit the current text value.
+
+ Setting the keyword argument ``ignore_xsi`` to True makes the function
+ additionally ignore existing ``xsi:type`` annotations. The default is to
+ use them as a type hint.
+
+ The default annotation of empty elements can be set with the
+ ``empty_pytype`` keyword argument. The default is not to annotate empty
+ elements. Pass 'str', for example, to make string values the default.
+ """
+ cdef _Element element
+ element = cetree.rootNodeOrRaise(element_or_tree)
+ _annotate(element, 0, 1, ignore_xsi, ignore_old, None, empty_pytype)
+
+def xsiannotate(element_or_tree, *, ignore_old=False, ignore_pytype=False,
+ empty_type=None):
+ u"""xsiannotate(element_or_tree, ignore_old=False, ignore_pytype=False, empty_type=None)
+
+ Recursively annotates the elements of an XML tree with 'xsi:type'
+ attributes.
+
+ If the 'ignore_old' keyword argument is True, existing 'xsi:type' attributes
+ will be ignored and replaced. Otherwise (the default), they will be checked
+ and only replaced if they no longer fit the current text value.
+
+ Note that the mapping from Python types to XSI types is usually ambiguous.
+ Currently, only the first XSI type name in the corresponding PyType
+ definition will be used for annotation. Thus, you should consider naming
+ the widest type first if you define additional types.
+
+ Setting the keyword argument ``ignore_pytype`` to True makes the function
+ additionally ignore existing ``pytype`` annotations. The default is to
+ use them as a type hint.
+
+ The default annotation of empty elements can be set with the
+ ``empty_type`` keyword argument. The default is not to annotate empty
+ elements. Pass 'string', for example, to make string values the default.
+ """
+ cdef _Element element
+ element = cetree.rootNodeOrRaise(element_or_tree)
+ _annotate(element, 1, 0, ignore_old, ignore_pytype, empty_type, None)
+
+def annotate(element_or_tree, *, ignore_old=True, ignore_xsi=False,
+ empty_pytype=None, empty_type=None, annotate_xsi=0,
+ annotate_pytype=1):
+ u"""annotate(element_or_tree, ignore_old=True, ignore_xsi=False, empty_pytype=None, empty_type=None, annotate_xsi=0, annotate_pytype=1)
+
+ Recursively annotates the elements of an XML tree with 'xsi:type'
+ and/or 'py:pytype' attributes.
+
+ If the 'ignore_old' keyword argument is True (the default), current
+ 'py:pytype' attributes will be ignored for the type annotation. Set it to
+ False if you want to reuse existing 'py:pytype' information (as long as it
+ still fits the element text value).
+
+ If the 'ignore_xsi' keyword argument is False (the default), existing
+ 'xsi:type' attributes will be used for the type annotation, if they fit the
+ element text values.
+
+ Note that the mapping from Python types to XSI types is usually ambiguous.
+ Currently, only the first XSI type name in the corresponding PyType
+ definition will be used for annotation. Thus, you should consider naming
+ the widest type first if you define additional types.
+
+ The default 'py:pytype' annotation of empty elements can be set with the
+ ``empty_pytype`` keyword argument. Pass 'str', for example, to make
+ string values the default.
+
+ The default 'xsi:type' annotation of empty elements can be set with the
+ ``empty_type`` keyword argument. The default is not to annotate empty
+ elements. Pass 'string', for example, to make string values the default.
+
+ The keyword arguments 'annotate_xsi' (default: 0) and 'annotate_pytype'
+ (default: 1) control which kind(s) of annotation to use.
+ """
+ cdef _Element element
+ element = cetree.rootNodeOrRaise(element_or_tree)
+ _annotate(element, annotate_xsi, annotate_pytype, ignore_xsi,
+ ignore_old, empty_type, empty_pytype)
+
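+# Usage sketch (comment only): annotating a freshly parsed tree adds py:pytype
+# (and, if requested, xsi:type) attributes that a later re-parse can rely on.
+# PYTYPE_ATTRIBUTE is the fully qualified attribute name defined earlier in
+# this module; the document snippet is an arbitrary example.
+#
+#   >>> root = fromstring('<root><n>5</n><s>abc</s></root>')
+#   >>> annotate(root, annotate_xsi=1, annotate_pytype=1)
+#   >>> root.n.get(PYTYPE_ATTRIBUTE), root.s.get(PYTYPE_ATTRIBUTE)
+#   ('int', 'str')
+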
+
+cdef _annotate(_Element element, bint annotate_xsi, bint annotate_pytype,
+ bint ignore_xsi, bint ignore_pytype,
+ empty_type_name, empty_pytype_name):
+ cdef _Document doc
+ cdef tree.xmlNode* c_node
+ cdef PyType empty_pytype, StrType, NoneType
+
+ if not annotate_xsi and not annotate_pytype:
+ return
+
+ if empty_type_name is not None:
+ if isinstance(empty_type_name, bytes):
+ empty_type_name = (<bytes>empty_type_name).decode("ascii")
+ empty_pytype = <PyType>_SCHEMA_TYPE_DICT.get(empty_type_name)
+ elif empty_pytype_name is not None:
+ if isinstance(empty_pytype_name, bytes):
+ empty_pytype_name = (<bytes>empty_pytype_name).decode("ascii")
+ empty_pytype = <PyType>_PYTYPE_DICT.get(empty_pytype_name)
+ else:
+ empty_pytype = None
+
+ StrType = <PyType>_PYTYPE_DICT.get(u'str')
+ NoneType = <PyType>_PYTYPE_DICT.get(u'NoneType')
+
+ doc = element._doc
+ c_node = element._c_node
+ tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_node, c_node, 1)
+ if c_node.type == tree.XML_ELEMENT_NODE:
+ _annotate_element(c_node, doc, annotate_xsi, annotate_pytype,
+ ignore_xsi, ignore_pytype,
+ empty_type_name, empty_pytype, StrType, NoneType)
+ tree.END_FOR_EACH_ELEMENT_FROM(c_node)
+
+cdef int _annotate_element(tree.xmlNode* c_node, _Document doc,
+ bint annotate_xsi, bint annotate_pytype,
+ bint ignore_xsi, bint ignore_pytype,
+ empty_type_name, PyType empty_pytype,
+ PyType StrType, PyType NoneType) except -1:
+ cdef tree.xmlNs* c_ns
+ cdef PyType pytype = None
+ typename = None
+ istree = 0
+
+ # if element is defined as xsi:nil, represent it as None
+ if cetree.attributeValueFromNsName(
+ c_node, _XML_SCHEMA_INSTANCE_NS, <unsigned char*>"nil") == "true":
+ pytype = NoneType
+
+ if pytype is None and not ignore_xsi:
+ # check that old xsi type value is valid
+ typename = cetree.attributeValueFromNsName(
+ c_node, _XML_SCHEMA_INSTANCE_NS, <unsigned char*>"type")
+ if typename is not None:
+ pytype = <PyType>_SCHEMA_TYPE_DICT.get(typename)
+ if pytype is None and u':' in typename:
+ prefix, typename = typename.split(u':', 1)
+ pytype = <PyType>_SCHEMA_TYPE_DICT.get(typename)
+ if pytype is not None and pytype is not StrType:
+ # StrType does not have a typecheck but is the default
+ # anyway, so just accept it if given as type
+ # information
+ pytype = _check_type(c_node, pytype)
+ if pytype is None:
+ typename = None
+
+ if pytype is None and not ignore_pytype:
+ # check that old pytype value is valid
+ old_pytypename = cetree.attributeValueFromNsName(
+ c_node, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME)
+ if old_pytypename is not None:
+ if old_pytypename == TREE_PYTYPE_NAME:
+ if not cetree.hasChild(c_node):
+ # only case where we should keep it,
+ # everything else is clear enough
+ pytype = TREE_PYTYPE
+ else:
+ if old_pytypename == 'none':
+ # transition from lxml 1.x
+ old_pytypename = "NoneType"
+ pytype = <PyType>_PYTYPE_DICT.get(old_pytypename)
+ if pytype is not None and pytype is not StrType:
+ # StrType does not have a typecheck but is the
+ # default anyway, so just accept it if given as
+ # type information
+ pytype = _check_type(c_node, pytype)
+
+ if pytype is None:
+ # try to guess type
+ if not cetree.hasChild(c_node):
+ # element has no children => data class
+ pytype = _guessPyType(textOf(c_node), StrType)
+ else:
+ istree = 1
+
+ if pytype is None:
+ # use default type for empty elements
+ if cetree.hasText(c_node):
+ pytype = StrType
+ else:
+ pytype = empty_pytype
+ if typename is None:
+ typename = empty_type_name
+
+ if pytype is not None:
+ if typename is None:
+ if not istree:
+ if pytype._schema_types:
+ # pytype->xsi:type is a 1:n mapping
+ # simply take the first
+ typename = pytype._schema_types[0]
+ elif typename not in pytype._schema_types:
+ typename = pytype._schema_types[0]
+
+ if annotate_xsi:
+ if typename is None or istree:
+ cetree.delAttributeFromNsName(
+ c_node, _XML_SCHEMA_INSTANCE_NS, <unsigned char*>"type")
+ else:
+ # update or create attribute
+ typename_utf8 = cetree.utf8(typename)
+ c_ns = cetree.findOrBuildNodeNsPrefix(
+ doc, c_node, _XML_SCHEMA_NS, <unsigned char*>'xsd')
+ if c_ns is not NULL:
+ if b':' in typename_utf8:
+ prefix, name = typename_utf8.split(b':', 1)
+ if c_ns.prefix is NULL or c_ns.prefix[0] == c'\0':
+ typename_utf8 = name
+ elif tree.xmlStrcmp(_xcstr(prefix), c_ns.prefix) != 0:
+ typename_utf8 = (<unsigned char*>c_ns.prefix) + b':' + name
+ elif c_ns.prefix is not NULL and c_ns.prefix[0] != c'\0':
+ typename_utf8 = (<unsigned char*>c_ns.prefix) + b':' + typename_utf8
+ c_ns = cetree.findOrBuildNodeNsPrefix(
+ doc, c_node, _XML_SCHEMA_INSTANCE_NS, <unsigned char*>'xsi')
+ tree.xmlSetNsProp(c_node, c_ns, <unsigned char*>"type", _xcstr(typename_utf8))
+
+ if annotate_pytype:
+ if pytype is None:
+ # delete attribute if it exists
+ cetree.delAttributeFromNsName(
+ c_node, _PYTYPE_NAMESPACE, _PYTYPE_ATTRIBUTE_NAME)
+ else:
+ # update or create attribute
+ c_ns = cetree.findOrBuildNodeNsPrefix(
+ doc, c_node, _PYTYPE_NAMESPACE, <unsigned char*>'py')
+ pytype_name = cetree.utf8(pytype.name)
+ tree.xmlSetNsProp(c_node, c_ns, _PYTYPE_ATTRIBUTE_NAME,
+ _xcstr(pytype_name))
+ if pytype is NoneType:
+ c_ns = cetree.findOrBuildNodeNsPrefix(
+ doc, c_node, _XML_SCHEMA_INSTANCE_NS, <unsigned char*>'xsi')
+ tree.xmlSetNsProp(c_node, c_ns, <unsigned char*>"nil", <unsigned char*>"true")
+
+ return 0
+
+cdef object _strip_attributes = etree.strip_attributes
+cdef object _cleanup_namespaces = etree.cleanup_namespaces
+
+def deannotate(element_or_tree, *, bint pytype=True, bint xsi=True,
+ bint xsi_nil=False, bint cleanup_namespaces=False):
+ u"""deannotate(element_or_tree, pytype=True, xsi=True, xsi_nil=False, cleanup_namespaces=False)
+
+ Recursively de-annotate the elements of an XML tree by removing 'py:pytype'
+ and/or 'xsi:type' attributes and/or 'xsi:nil' attributes.
+
+ If the 'pytype' keyword argument is True (the default), 'py:pytype'
+ attributes will be removed. If the 'xsi' keyword argument is True (the
+ default), 'xsi:type' attributes will be removed.
+ If the 'xsi_nil' keyword argument is True (default: False), 'xsi:nil'
+ attributes will be removed.
+
+ Note that this does not touch the namespace declarations by
+ default. If you want to remove unused namespace declarations from
+ the tree, pass the option ``cleanup_namespaces=True``.
+ """
+ cdef list attribute_names = []
+
+ if pytype:
+ attribute_names.append(PYTYPE_ATTRIBUTE)
+ if xsi:
+ attribute_names.append(XML_SCHEMA_INSTANCE_TYPE_ATTR)
+ if xsi_nil:
+ attribute_names.append(XML_SCHEMA_INSTANCE_NIL_ATTR)
+
+ _strip_attributes(element_or_tree, *attribute_names)
+ if cleanup_namespaces:
+ _cleanup_namespaces(element_or_tree)
+
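+# Usage sketch (comment only), continuing the annotate() sketch above:
+# stripping the annotations again, including the then unused namespace
+# declarations.
+#
+#   >>> deannotate(root, pytype=True, xsi=True, cleanup_namespaces=True)
+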
+################################################################################
+# Module level parser setup
+
+cdef object __DEFAULT_PARSER
+__DEFAULT_PARSER = etree.XMLParser(remove_blank_text=True)
+__DEFAULT_PARSER.set_element_class_lookup( ObjectifyElementClassLookup() )
+
+cdef object objectify_parser
+objectify_parser = __DEFAULT_PARSER
+
+def set_default_parser(new_parser = None):
+ u"""set_default_parser(new_parser = None)
+
+ Replace the default parser used by objectify's Element() and
+ fromstring() functions.
+
+ The new parser must be an etree.XMLParser.
+
+ Call without arguments to reset to the original parser.
+ """
+ global objectify_parser
+ if new_parser is None:
+ objectify_parser = __DEFAULT_PARSER
+ elif isinstance(new_parser, etree.XMLParser):
+ objectify_parser = new_parser
+ else:
+ raise TypeError, u"parser must inherit from lxml.etree.XMLParser"
+
+def makeparser(**kw):
+ u"""makeparser(remove_blank_text=True, **kw)
+
+ Create a new XML parser for objectify trees.
+
+ You can pass all keyword arguments that are supported by
+ ``etree.XMLParser()``. Note that this parser defaults to removing
+ blank text. You can disable this by passing the
+ ``remove_blank_text`` boolean keyword option yourself.
+ """
+ if 'remove_blank_text' not in kw:
+ kw['remove_blank_text'] = True
+ parser = etree.XMLParser(**kw)
+ parser.set_element_class_lookup( ObjectifyElementClassLookup() )
+ return parser
+
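+# Usage sketch (comment only): building a custom objectify parser and
+# installing it as the module default, then resetting to the original one.
+#
+#   >>> parser = makeparser(remove_blank_text=False)
+#   >>> set_default_parser(parser)
+#   >>> set_default_parser()      # back to the original default parser
+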
+cdef _Element _makeElement(tag, text, attrib, nsmap):
+ return cetree.makeElement(tag, None, objectify_parser, text, None, attrib, nsmap)
+
+################################################################################
+# Module level factory functions
+
+cdef object _fromstring
+_fromstring = etree.fromstring
+
+SubElement = etree.SubElement
+
+def fromstring(xml, parser=None, *, base_url=None):
+ u"""fromstring(xml, parser=None, base_url=None)
+
+ Objectify specific version of the lxml.etree fromstring() function
+ that uses the objectify parser.
+
+ You can pass a different parser as second argument.
+
+ The ``base_url`` keyword argument allows setting the original base URL of
+ the document to support relative paths when looking up external entities
+ (DTD, XInclude, ...).
+ """
+ if parser is None:
+ parser = objectify_parser
+ return _fromstring(xml, parser, base_url=base_url)
+
+def XML(xml, parser=None, *, base_url=None):
+ u"""XML(xml, parser=None, base_url=None)
+
+ Objectify specific version of the lxml.etree XML() literal factory
+ that uses the objectify parser.
+
+ You can pass a different parser as second argument.
+
+ The ``base_url`` keyword argument allows setting the original base URL of
+ the document to support relative paths when looking up external entities
+ (DTD, XInclude, ...).
+ """
+ if parser is None:
+ parser = objectify_parser
+ return _fromstring(xml, parser, base_url=base_url)
+
+cdef object _parse
+_parse = etree.parse
+
+def parse(f, parser=None, *, base_url=None):
+ u"""parse(f, parser=None, base_url=None)
+
+ Parse a file or file-like object with the objectify parser.
+
+ You can pass a different parser as second argument.
+
+ The ``base_url`` keyword allows setting a URL for the document
+ when parsing from a file-like object. This is needed when looking
+ up external entities (DTD, XInclude, ...) with relative paths.
+ """
+ if parser is None:
+ parser = objectify_parser
+ return _parse(f, parser, base_url=base_url)
+
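+# Usage sketch (comment only): both entry points return trees whose data
+# elements already carry their guessed Python types.  The documents are
+# arbitrary examples.
+#
+#   >>> root = fromstring('<root><x>5</x><y>abc</y></root>')
+#   >>> root.x.pyval, root.y.pyval
+#   (5, 'abc')
+#   >>> from io import BytesIO
+#   >>> parse(BytesIO(b'<root><x>5</x></root>')).getroot().x.pyval
+#   5
+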
+cdef dict _DEFAULT_NSMAP = {
+ "py" : PYTYPE_NAMESPACE,
+ "xsi" : XML_SCHEMA_INSTANCE_NS,
+ "xsd" : XML_SCHEMA_NS
+}
+
+E = ElementMaker()
+
+def Element(_tag, attrib=None, nsmap=None, *, _pytype=None, **_attributes):
+ u"""Element(_tag, attrib=None, nsmap=None, _pytype=None, **_attributes)
+
+ Objectify specific version of the lxml.etree Element() factory that
+ always creates a structural (tree) element.
+
+ NOTE: requires a parser-based element class lookup to be activated in lxml.etree!
+ """
+ if attrib is not None:
+ if _attributes:
+ attrib = dict(attrib)
+ attrib.update(_attributes)
+ _attributes = attrib
+ if _pytype is None:
+ _pytype = TREE_PYTYPE_NAME
+ if nsmap is None:
+ nsmap = _DEFAULT_NSMAP
+ _attributes[PYTYPE_ATTRIBUTE] = _pytype
+ return _makeElement(_tag, None, _attributes, nsmap)
+
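+# Usage sketch (comment only): Element() builds structural nodes; data
+# children are typically attached through attribute assignment, which is
+# handled by ObjectifiedElement earlier in this module.  Tag names are
+# arbitrary examples.
+#
+#   >>> root = Element('root')
+#   >>> root.n = 42
+#   >>> root.n.pyval
+#   42
+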
+def DataElement(_value, attrib=None, nsmap=None, *, _pytype=None, _xsi=None,
+ **_attributes):
+ u"""DataElement(_value, attrib=None, nsmap=None, _pytype=None, _xsi=None, **_attributes)
+
+ Create a new element from a Python value and XML attributes taken from
+ keyword arguments or a dictionary passed as second argument.
+
+ Automatically adds a 'pytype' attribute for the Python type of the value,
+ if the type can be identified. If '_pytype' or '_xsi' are among the
+ keyword arguments, they will be used instead.
+
+ If the _value argument is an ObjectifiedDataElement instance, its py:pytype,
+ xsi:type and other attributes and nsmap are reused unless they are redefined
+ in attrib and/or keyword arguments.
+ """
+ if nsmap is None:
+ nsmap = _DEFAULT_NSMAP
+ if attrib is not None and attrib:
+ if _attributes:
+ attrib = dict(attrib)
+ attrib.update(_attributes)
+ _attributes = attrib
+ if isinstance(_value, ObjectifiedElement):
+ if _pytype is None:
+ if _xsi is None and not _attributes and nsmap is _DEFAULT_NSMAP:
+ # special case: no change!
+ return _value.__copy__()
+ if isinstance(_value, ObjectifiedDataElement):
+ # reuse existing nsmap unless redefined in nsmap parameter
+ temp = _value.nsmap
+ if temp is not None and temp:
+ temp = dict(temp)
+ temp.update(nsmap)
+ nsmap = temp
+ # reuse existing attributes unless redefined in attrib/_attributes
+ temp = _value.attrib
+ if temp is not None and temp:
+ temp = dict(temp)
+ temp.update(_attributes)
+ _attributes = temp
+ # reuse existing xsi:type or py:pytype attributes, unless provided as
+ # arguments
+ if _xsi is None and _pytype is None:
+ _xsi = _attributes.get(XML_SCHEMA_INSTANCE_TYPE_ATTR)
+ _pytype = _attributes.get(PYTYPE_ATTRIBUTE)
+
+ if _xsi is not None:
+ if u':' in _xsi:
+ prefix, name = _xsi.split(u':', 1)
+ ns = nsmap.get(prefix)
+ if ns != XML_SCHEMA_NS:
+ raise ValueError, u"XSD types require the XSD namespace"
+ elif nsmap is _DEFAULT_NSMAP:
+ name = _xsi
+ _xsi = u'xsd:' + _xsi
+ else:
+ name = _xsi
+ for prefix, ns in nsmap.items():
+ if ns == XML_SCHEMA_NS:
+ if prefix is not None and prefix:
+ _xsi = prefix + u':' + _xsi
+ break
+ else:
+ raise ValueError, u"XSD types require the XSD namespace"
+ _attributes[XML_SCHEMA_INSTANCE_TYPE_ATTR] = _xsi
+ if _pytype is None:
+ # allow using unregistered or even wrong xsi:type names
+ py_type = <PyType>_SCHEMA_TYPE_DICT.get(_xsi)
+ if py_type is None:
+ py_type = <PyType>_SCHEMA_TYPE_DICT.get(name)
+ if py_type is not None:
+ _pytype = py_type.name
+
+ if _pytype is None:
+ _pytype = _pytypename(_value)
+
+ if _value is None and _pytype != u"str":
+ _pytype = _pytype or u"NoneType"
+ strval = None
+ elif python._isString(_value):
+ strval = _value
+ elif isinstance(_value, bool):
+ if _value:
+ strval = u"true"
+ else:
+ strval = u"false"
+ else:
+ py_type = <PyType>_PYTYPE_DICT.get(_pytype)
+ stringify = unicode if py_type is None else py_type.stringify
+ strval = stringify(_value)
+
+ if _pytype is not None:
+ if _pytype == u"NoneType" or _pytype == u"none":
+ strval = None
+ _attributes[XML_SCHEMA_INSTANCE_NIL_ATTR] = u"true"
+ else:
+ # check if type information from arguments is valid
+ py_type = <PyType>_PYTYPE_DICT.get(_pytype)
+ if py_type is not None:
+ if py_type.type_check is not None:
+ py_type.type_check(strval)
+ _attributes[PYTYPE_ATTRIBUTE] = _pytype
+
+ return _makeElement(u"value", strval, _attributes, nsmap)
+
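+# Usage sketch (comment only): DataElement() guesses the pytype from the
+# Python value unless '_pytype' or '_xsi' override it.
+#
+#   >>> el = DataElement(3.5)
+#   >>> el.pyval, el.get(PYTYPE_ATTRIBUTE)
+#   (3.5, 'float')
+#   >>> DataElement('5', _pytype='int').pyval
+#   5
+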
+
+################################################################################
+# ObjectPath
+
+include "objectpath.pxi"
diff --git a/src/lxml/objectpath.pxi b/src/lxml/objectpath.pxi
new file mode 100644
index 0000000..2e8d192
--- /dev/null
+++ b/src/lxml/objectpath.pxi
@@ -0,0 +1,332 @@
+################################################################################
+# ObjectPath
+
+ctypedef struct _ObjectPath:
+ const_xmlChar* href
+ const_xmlChar* name
+ Py_ssize_t index
+
+
+cdef object _NO_DEFAULT = object()
+
+
+cdef class ObjectPath:
+ u"""ObjectPath(path)
+ Immutable object that represents a compiled object path.
+
+ Example for a path: 'root.child[1].{other}child[25]'
+ """
+ cdef readonly object find
+ cdef list _path
+ cdef object _path_str
+ cdef _ObjectPath* _c_path
+ cdef Py_ssize_t _path_len
+ def __init__(self, path):
+ if python._isString(path):
+ self._path = _parse_object_path_string(path)
+ self._path_str = path
+ else:
+ self._path = _parse_object_path_list(path)
+ self._path_str = u'.'.join(path)
+ self._path_len = len(self._path)
+ self._c_path = _build_object_path_segments(self._path)
+ self.find = self.__call__
+
+ def __dealloc__(self):
+ if self._c_path is not NULL:
+ python.lxml_free(self._c_path)
+
+ def __str__(self):
+ return self._path_str
+
+ def __call__(self, _Element root not None, *_default):
+ u"""Follow the attribute path in the object structure and return the
+ target attribute value.
+
+ If it is not found, this either returns a default value (if one was passed
+ as the second argument) or raises AttributeError.
+ """
+ if _default:
+ if len(_default) > 1:
+ raise TypeError, u"invalid number of arguments: needs one or two"
+ default = _default[0]
+ else:
+ default = _NO_DEFAULT
+ return _find_object_path(root, self._c_path, self._path_len, default)
+
+ def hasattr(self, _Element root not None):
+ u"hasattr(self, root)"
+ try:
+ _find_object_path(root, self._c_path, self._path_len, _NO_DEFAULT)
+ except AttributeError:
+ return False
+ return True
+
+ def setattr(self, _Element root not None, value):
+ u"""setattr(self, root, value)
+
+ Set the value of the target element in a subtree.
+
+ If any of the children on the path does not exist, it is created.
+ """
+ _create_object_path(root, self._c_path, self._path_len, 1, value)
+
+ def addattr(self, _Element root not None, value):
+ u"""addattr(self, root, value)
+
+ Append a value to the target element in a subtree.
+
+ If any of the children on the path does not exist, it is created.
+ """
+ _create_object_path(root, self._c_path, self._path_len, 0, value)
+
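+# Usage sketch (comment only): compiled object paths can look up, test and
+# create nested children.  The tags are arbitrary examples; fromstring() is
+# the lxml.objectify entry point.
+#
+#   >>> root = fromstring('<root><a><b>5</b></a></root>')
+#   >>> path = ObjectPath('root.a.b')
+#   >>> path(root).pyval, path.hasattr(root)
+#   (5, True)
+#   >>> ObjectPath('root.a.c').setattr(root, 99)   # creates the missing child
+#   >>> root.a.c.pyval
+#   99
+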
+
+cdef object __MATCH_PATH_SEGMENT = re.compile(
+ ur"(\.?)\s*(?:\{([^}]*)\})?\s*([^.{}\[\]\s]+)\s*(?:\[\s*([-0-9]+)\s*\])?",
+ re.U).match
+
+cdef tuple _RELATIVE_PATH_SEGMENT = (None, None, 0)
+
+
+cdef list _parse_object_path_string(_path):
+ u"""Parse object path string into a (ns, name, index) list.
+ """
+ cdef bint has_dot
+ cdef unicode path
+ new_path = []
+ if isinstance(_path, bytes):
+ path = (<bytes>_path).decode('ascii')
+ elif type(_path) is not unicode:
+ path = unicode(_path)
+ else:
+ path = _path
+ path = path.strip()
+ if path == u'.':
+ return [_RELATIVE_PATH_SEGMENT]
+ path_pos = 0
+ while path:
+ match = __MATCH_PATH_SEGMENT(path, path_pos)
+ if match is None:
+ break
+
+ dot, ns, name, index = match.groups()
+ index = int(index) if index else 0
+ has_dot = dot == u'.'
+ if not new_path:
+ if has_dot:
+ # path '.child' => ignore root
+ new_path.append(_RELATIVE_PATH_SEGMENT)
+ elif index:
+ raise ValueError, u"index not allowed on root node"
+ elif not has_dot:
+ raise ValueError, u"invalid path"
+ if ns is not None:
+ ns = python.PyUnicode_AsUTF8String(ns)
+ name = python.PyUnicode_AsUTF8String(name)
+ new_path.append( (ns, name, index) )
+
+ path_pos = match.end()
+ if not new_path or len(path) > path_pos:
+ raise ValueError, u"invalid path"
+ return new_path
+
+
+cdef list _parse_object_path_list(path):
+ u"""Parse object path sequence into a (ns, name, index) list.
+ """
+ new_path = []
+ for item in path:
+ item = item.strip()
+ if not new_path and item == u'':
+ # path '.child' => ignore root
+ ns = name = None
+ index = 0
+ else:
+ ns, name = cetree.getNsTag(item)
+ c_name = _xcstr(name)
+ index_pos = tree.xmlStrchr(c_name, c'[')
+ if index_pos is NULL:
+ index = 0
+ else:
+ index_end = tree.xmlStrchr(index_pos + 1, c']')
+ if index_end is NULL:
+ raise ValueError, u"index must be enclosed in []"
+ index = int(index_pos[1:index_end - index_pos])
+ if not new_path and index != 0:
+ raise ValueError, u"index not allowed on root node"
+ name = <bytes>c_name[:index_pos - c_name]
+ new_path.append( (ns, name, index) )
+ if not new_path:
+ raise ValueError, u"invalid path"
+ return new_path
+
+
+cdef _ObjectPath* _build_object_path_segments(list path_list) except NULL:
+ cdef _ObjectPath* c_path
+ cdef _ObjectPath* c_path_segments
+ c_path_segments = <_ObjectPath*>python.lxml_malloc(len(path_list), sizeof(_ObjectPath))
+ if c_path_segments is NULL:
+ raise MemoryError()
+ c_path = c_path_segments
+ for href, name, index in path_list:
+ c_path[0].href = _xcstr(href) if href is not None else NULL
+ c_path[0].name = _xcstr(name) if name is not None else NULL
+ c_path[0].index = index
+ c_path += 1
+ return c_path_segments
+
+
+cdef _find_object_path(_Element root, _ObjectPath* c_path, Py_ssize_t c_path_len, default_value):
+ u"""Follow the path to find the target element.
+ """
+ cdef tree.xmlNode* c_node
+ cdef Py_ssize_t c_index
+ c_node = root._c_node
+ c_name = c_path[0].name
+ c_href = c_path[0].href
+ if c_href is NULL or c_href[0] == c'\0':
+ c_href = tree._getNs(c_node)
+ if not cetree.tagMatches(c_node, c_href, c_name):
+ if default_value is not _NO_DEFAULT:
+ return default_value
+ else:
+ raise ValueError(
+ f"root element does not match: need {cetree.namespacedNameFromNsName(c_href, c_name)}, got {root.tag}")
+
+ while c_node is not NULL:
+ c_path_len -= 1
+ if c_path_len <= 0:
+ break
+
+ c_path += 1
+ if c_path[0].href is not NULL:
+ c_href = c_path[0].href # otherwise: keep parent namespace
+ c_name = tree.xmlDictExists(c_node.doc.dict, c_path[0].name, -1)
+ if c_name is NULL:
+ c_name = c_path[0].name
+ c_node = NULL
+ break
+ c_index = c_path[0].index
+ c_node = c_node.last if c_index < 0 else c_node.children
+ c_node = _findFollowingSibling(c_node, c_href, c_name, c_index)
+
+ if c_node is not NULL:
+ return cetree.elementFactory(root._doc, c_node)
+ elif default_value is not _NO_DEFAULT:
+ return default_value
+ else:
+ tag = cetree.namespacedNameFromNsName(c_href, c_name)
+ raise AttributeError, f"no such child: {tag}"
+
+
+cdef _create_object_path(_Element root, _ObjectPath* c_path,
+ Py_ssize_t c_path_len, int replace, value):
+ u"""Follow the path to find the target element, build the missing children
+ as needed and set the target element to 'value'. If replace is true, an
+ existing value is replaced, otherwise the new value is added.
+ """
+ cdef _Element child
+ cdef tree.xmlNode* c_node
+ cdef tree.xmlNode* c_child
+ cdef Py_ssize_t c_index
+ if c_path_len == 1:
+ raise TypeError, u"cannot update root node"
+
+ c_node = root._c_node
+ c_name = c_path[0].name
+ c_href = c_path[0].href
+ if c_href is NULL or c_href[0] == c'\0':
+ c_href = tree._getNs(c_node)
+ if not cetree.tagMatches(c_node, c_href, c_name):
+ raise ValueError(
+ f"root element does not match: need {cetree.namespacedNameFromNsName(c_href, c_name)}, got {root.tag}")
+
+ while c_path_len > 1:
+ c_path_len -= 1
+ c_path += 1
+ if c_path[0].href is not NULL:
+ c_href = c_path[0].href # otherwise: keep parent namespace
+ c_index = c_path[0].index
+ c_name = tree.xmlDictExists(c_node.doc.dict, c_path[0].name, -1)
+ if c_name is NULL:
+ c_name = c_path[0].name
+ c_child = NULL
+ else:
+ c_child = c_node.last if c_index < 0 else c_node.children
+ c_child = _findFollowingSibling(c_child, c_href, c_name, c_index)
+
+ if c_child is not NULL:
+ c_node = c_child
+ elif c_index != 0:
+ raise TypeError, u"creating indexed path attributes is not supported"
+ elif c_path_len == 1:
+ _appendValue(cetree.elementFactory(root._doc, c_node),
+ cetree.namespacedNameFromNsName(c_href, c_name),
+ value)
+ return
+ else:
+ child = cetree.makeSubElement(
+ cetree.elementFactory(root._doc, c_node),
+ cetree.namespacedNameFromNsName(c_href, c_name),
+ None, None, None, None)
+ c_node = child._c_node
+
+ # if we get here, the entire path was already there
+ if replace:
+ element = cetree.elementFactory(root._doc, c_node)
+ _replaceElement(element, value)
+ else:
+ _appendValue(cetree.elementFactory(root._doc, c_node.parent),
+ cetree.namespacedName(c_node), value)
+
+
+cdef list _build_descendant_paths(tree.xmlNode* c_node, prefix_string):
+ u"""Returns a list of all descendant paths.
+ """
+ cdef list path, path_list
+ tag = cetree.namespacedName(c_node)
+ if prefix_string:
+ if prefix_string[-1] != u'.':
+ prefix_string += u'.'
+ prefix_string = prefix_string + tag
+ else:
+ prefix_string = tag
+ path = [prefix_string]
+ path_list = []
+ _recursive_build_descendant_paths(c_node, path, path_list)
+ return path_list
+
+
+cdef int _recursive_build_descendant_paths(tree.xmlNode* c_node,
+ list path, list path_list) except -1:
+ u"""Fills the list 'path_list' with all descendant paths, initial prefix
+ being in the list 'path'.
+ """
+ cdef tree.xmlNode* c_child
+ tags = {}
+ path_list.append(u'.'.join(path))
+ c_href = tree._getNs(c_node)
+ c_child = c_node.children
+ while c_child is not NULL:
+ while c_child.type != tree.XML_ELEMENT_NODE:
+ c_child = c_child.next
+ if c_child is NULL:
+ return 0
+ if c_href is tree._getNs(c_child):
+ tag = pyunicode(c_child.name)
+ elif c_href is not NULL and tree._getNs(c_child) is NULL:
+ # special case: parent has namespace, child does not
+ tag = u'{}' + pyunicode(c_child.name)
+ else:
+ tag = cetree.namespacedName(c_child)
+ count = tags.get(tag)
+ if count is None:
+ tags[tag] = 1
+ else:
+ tags[tag] = count + 1
+ tag += f'[{count}]'
+ path.append(tag)
+ _recursive_build_descendant_paths(c_child, path, path_list)
+ del path[-1]
+ c_child = c_child.next
+ return 0
diff --git a/src/lxml/parser.pxi b/src/lxml/parser.pxi
new file mode 100644
index 0000000..3ed223b
--- /dev/null
+++ b/src/lxml/parser.pxi
@@ -0,0 +1,1901 @@
+# Parsers for XML and HTML
+
+from lxml.includes cimport xmlparser
+from lxml.includes cimport htmlparser
+
+
+class ParseError(LxmlSyntaxError):
+ """Syntax error while parsing an XML document.
+
+ For compatibility with ElementTree 1.3 and later.
+ """
+ def __init__(self, message, code, line, column, filename=None):
+ super(_ParseError, self).__init__(message)
+ self.lineno, self.offset = (line, column - 1)
+ self.code = code
+ self.filename = filename
+
+ @property
+ def position(self):
+ return self.lineno, self.offset + 1
+
+ @position.setter
+ def position(self, new_pos):
+ self.lineno, column = new_pos
+ self.offset = column - 1
+
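+# Usage sketch (comment only): how the error position is typically consumed
+# from user code; the broken document is an arbitrary example.
+#
+#   >>> from lxml import etree
+#   >>> try:
+#   ...     etree.fromstring('<root><a></root>')
+#   ... except etree.XMLSyntaxError as e:
+#   ...     line, column = e.position
+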
+cdef object _ParseError = ParseError
+
+
+class XMLSyntaxError(ParseError):
+ """Syntax error while parsing an XML document.
+ """
+
+cdef class ParserError(LxmlError):
+ """Internal lxml parser error.
+ """
+
+
+@cython.final
+@cython.internal
+cdef class _ParserDictionaryContext:
+ # Global parser context to share the string dictionary.
+ #
+ # This class is a delegate singleton!
+ #
+ # It creates _ParserDictionaryContext objects for each thread to keep thread state,
+ # but those must never be used directly. Always stick to using the static
+ # __GLOBAL_PARSER_CONTEXT as defined below the class.
+ #
+
+ cdef tree.xmlDict* _c_dict
+ cdef _BaseParser _default_parser
+ cdef list _implied_parser_contexts
+
+ def __cinit__(self):
+ self._c_dict = NULL
+ self._implied_parser_contexts = []
+
+ def __dealloc__(self):
+ if self._c_dict is not NULL:
+ xmlparser.xmlDictFree(self._c_dict)
+
+ cdef void initMainParserContext(self):
+ u"""Put the global context into the thread dictionary of the main
+ thread. To be called once and only in the main thread."""
+ thread_dict = python.PyThreadState_GetDict()
+ if thread_dict is not NULL:
+ (<dict>thread_dict)[u"_ParserDictionaryContext"] = self
+
+ cdef _ParserDictionaryContext _findThreadParserContext(self):
+ u"Find (or create) the _ParserDictionaryContext object for the current thread"
+ cdef _ParserDictionaryContext context
+ thread_dict = python.PyThreadState_GetDict()
+ if thread_dict is NULL:
+ return self
+ d = <dict>thread_dict
+ result = python.PyDict_GetItem(d, u"_ParserDictionaryContext")
+ if result is not NULL:
+ return <object>result
+ context = <_ParserDictionaryContext>_ParserDictionaryContext.__new__(_ParserDictionaryContext)
+ d[u"_ParserDictionaryContext"] = context
+ return context
+
+ cdef void setDefaultParser(self, _BaseParser parser):
+ u"Set the default parser for the current thread"
+ cdef _ParserDictionaryContext context
+ context = self._findThreadParserContext()
+ context._default_parser = parser
+
+ cdef _BaseParser getDefaultParser(self):
+ u"Return (or create) the default parser of the current thread"
+ cdef _ParserDictionaryContext context
+ context = self._findThreadParserContext()
+ if context._default_parser is None:
+ if self._default_parser is None:
+ self._default_parser = __DEFAULT_XML_PARSER._copy()
+ if context is not self:
+ context._default_parser = self._default_parser._copy()
+ return context._default_parser
+
+ cdef tree.xmlDict* _getThreadDict(self, tree.xmlDict* default):
+ u"Return the thread-local dict or create a new one if necessary."
+ cdef _ParserDictionaryContext context
+ context = self._findThreadParserContext()
+ if context._c_dict is NULL:
+ # thread dict not yet set up => use default or create a new one
+ if default is not NULL:
+ context._c_dict = default
+ xmlparser.xmlDictReference(default)
+ return default
+ if self._c_dict is NULL:
+ self._c_dict = xmlparser.xmlDictCreate()
+ if context is not self:
+ context._c_dict = xmlparser.xmlDictCreateSub(self._c_dict)
+ return context._c_dict
+
+ cdef void initThreadDictRef(self, tree.xmlDict** c_dict_ref):
+ c_dict = c_dict_ref[0]
+ c_thread_dict = self._getThreadDict(c_dict)
+ if c_dict is c_thread_dict:
+ return
+ if c_dict is not NULL:
+ xmlparser.xmlDictFree(c_dict)
+ c_dict_ref[0] = c_thread_dict
+ xmlparser.xmlDictReference(c_thread_dict)
+
+ cdef void initParserDict(self, xmlparser.xmlParserCtxt* pctxt):
+ u"Assure we always use the same string dictionary."
+ self.initThreadDictRef(&pctxt.dict)
+ pctxt.dictNames = 1
+
+ cdef void initXPathParserDict(self, xpath.xmlXPathContext* pctxt):
+ u"Assure we always use the same string dictionary."
+ self.initThreadDictRef(&pctxt.dict)
+
+ cdef void initDocDict(self, xmlDoc* result):
+ u"Store dict of last object parsed if no shared dict yet"
+ # XXX We also free the result dict here if there already was one.
+ # This case should only occur for new documents with empty dicts,
+ # otherwise we'd free data that's in use => segfault
+ self.initThreadDictRef(&result.dict)
+
+ cdef _ParserContext findImpliedContext(self):
+ u"""Return any current implied xml parser context for the current
+ thread. This is used when the resolver functions are called
+ with an xmlParserCtxt that was generated from within libxml2
+ (i.e. without a _ParserContext) - which happens when parsing
+ schema and xinclude external references."""
+ cdef _ParserDictionaryContext context
+ cdef _ParserContext implied_context
+
+ # see if we have a current implied parser
+ context = self._findThreadParserContext()
+ if context._implied_parser_contexts:
+ implied_context = context._implied_parser_contexts[-1]
+ return implied_context
+ return None
+
+ cdef void pushImpliedContextFromParser(self, _BaseParser parser):
+ u"Push a new implied context object taken from the parser."
+ if parser is not None:
+ self.pushImpliedContext(parser._getParserContext())
+ else:
+ self.pushImpliedContext(None)
+
+ cdef void pushImpliedContext(self, _ParserContext parser_context):
+ u"Push a new implied context object."
+ cdef _ParserDictionaryContext context
+ context = self._findThreadParserContext()
+ context._implied_parser_contexts.append(parser_context)
+
+ cdef void popImpliedContext(self):
+ u"Pop the current implied context object."
+ cdef _ParserDictionaryContext context
+ context = self._findThreadParserContext()
+ context._implied_parser_contexts.pop()
+
+cdef _ParserDictionaryContext __GLOBAL_PARSER_CONTEXT = _ParserDictionaryContext()
+__GLOBAL_PARSER_CONTEXT.initMainParserContext()
+
+############################################################
+## support for Python unicode I/O
+############################################################
+
+# name of Python unicode encoding as known to libxml2
+cdef const_char* _UNICODE_ENCODING = NULL
+
+cdef int _setupPythonUnicode() except -1:
+ u"""Sets _UNICODE_ENCODING to the internal encoding name of Python unicode
+ strings if libxml2 supports reading native Python unicode. This depends
+ on iconv and the local Python installation, so we simply check if we find
+ a matching encoding handler.
+ """
+ cdef tree.xmlCharEncodingHandler* enchandler
+ cdef Py_ssize_t l
+ cdef const_char* enc
+ cdef Py_UNICODE *uchars = [c'<', c't', c'e', c's', c't', c'/', c'>']
+ cdef const_xmlChar* buffer = <const_xmlChar*>uchars
+ # apparently, libxml2 can't detect UTF-16 on some systems
+ if (buffer[0] == c'<' and buffer[1] == c'\0' and
+ buffer[2] == c't' and buffer[3] == c'\0'):
+ enc = "UTF-16LE"
+ elif (buffer[0] == c'\0' and buffer[1] == c'<' and
+ buffer[2] == c'\0' and buffer[3] == c't'):
+ enc = "UTF-16BE"
+ else:
+ # let libxml2 give it a try
+ enc = _findEncodingName(buffer, sizeof(Py_UNICODE) * 7)
+ if enc is NULL:
+ # not my fault, it's YOUR broken system :)
+ return 0
+ enchandler = tree.xmlFindCharEncodingHandler(enc)
+ if enchandler is not NULL:
+ global _UNICODE_ENCODING
+ tree.xmlCharEncCloseFunc(enchandler)
+ _UNICODE_ENCODING = enc
+ return 0
+
+cdef const_char* _findEncodingName(const_xmlChar* buffer, int size):
+ u"Work around bug in libxml2: find iconv name of encoding on our own."
+ cdef tree.xmlCharEncoding enc
+ enc = tree.xmlDetectCharEncoding(buffer, size)
+ if enc == tree.XML_CHAR_ENCODING_UTF16LE:
+ if size >= 4 and (buffer[0] == <const_xmlChar>'\xFF' and
+ buffer[1] == <const_xmlChar>'\xFE' and
+ buffer[2] == 0 and buffer[3] == 0):
+ return "UTF-32LE" # according to BOM
+ else:
+ return "UTF-16LE"
+ elif enc == tree.XML_CHAR_ENCODING_UTF16BE:
+ return "UTF-16BE"
+ elif enc == tree.XML_CHAR_ENCODING_UCS4LE:
+ return "UCS-4LE"
+ elif enc == tree.XML_CHAR_ENCODING_UCS4BE:
+ return "UCS-4BE"
+ elif enc == tree.XML_CHAR_ENCODING_NONE:
+ return NULL
+ else:
+ # returns a constant char*, no need to free it
+ return tree.xmlGetCharEncodingName(enc)
+
+_setupPythonUnicode()
+
+############################################################
+## support for file-like objects
+############################################################
+
+@cython.final
+@cython.internal
+cdef class _FileReaderContext:
+ cdef object _filelike
+ cdef object _encoding
+ cdef object _url
+ cdef object _bytes
+ cdef _ExceptionContext _exc_context
+ cdef Py_ssize_t _bytes_read
+ cdef char* _c_url
+ cdef bint _close_file_after_read
+
+ def __cinit__(self, filelike, exc_context not None, url, encoding=None, bint close_file=False):
+ self._exc_context = exc_context
+ self._filelike = filelike
+ self._close_file_after_read = close_file
+ self._encoding = encoding
+ if url is None:
+ self._c_url = NULL
+ else:
+ url = _encodeFilename(url)
+ self._c_url = _cstr(url)
+ self._url = url
+ self._bytes = b''
+ self._bytes_read = 0
+
+ cdef _close_file(self):
+ if self._filelike is None or not self._close_file_after_read:
+ return
+ try:
+ close = self._filelike.close
+ except AttributeError:
+ close = None
+ finally:
+ self._filelike = None
+ if close is not None:
+ close()
+
+ cdef xmlparser.xmlParserInputBuffer* _createParserInputBuffer(self):
+ cdef stdio.FILE* c_stream
+ cdef xmlparser.xmlParserInputBuffer* c_buffer
+ c_buffer = xmlparser.xmlAllocParserInputBuffer(0)
+ c_stream = python.PyFile_AsFile(self._filelike)
+ if c_stream is NULL:
+ c_buffer.readcallback = _readFilelikeParser
+ c_buffer.context = <python.PyObject*>self
+ else:
+ c_buffer.readcallback = _readFileParser
+ c_buffer.context = c_stream
+ return c_buffer
+
+ cdef xmlparser.xmlParserInput* _createParserInput(
+ self, xmlparser.xmlParserCtxt* ctxt):
+ cdef xmlparser.xmlParserInputBuffer* c_buffer
+ c_buffer = self._createParserInputBuffer()
+ return xmlparser.xmlNewIOInputStream(ctxt, c_buffer, 0)
+
+ cdef tree.xmlDtd* _readDtd(self):
+ cdef xmlparser.xmlParserInputBuffer* c_buffer
+ c_buffer = self._createParserInputBuffer()
+ with nogil:
+ return xmlparser.xmlIOParseDTD(NULL, c_buffer, 0)
+
+ cdef xmlDoc* _readDoc(self, xmlparser.xmlParserCtxt* ctxt, int options):
+ cdef xmlDoc* result
+ cdef char* c_encoding
+ cdef stdio.FILE* c_stream
+ cdef xmlparser.xmlInputReadCallback c_read_callback
+ cdef xmlparser.xmlInputCloseCallback c_close_callback
+ cdef void* c_callback_context
+
+ if self._encoding is None:
+ c_encoding = NULL
+ else:
+ c_encoding = _cstr(self._encoding)
+
+ c_stream = python.PyFile_AsFile(self._filelike)
+ if c_stream is NULL:
+ c_read_callback = _readFilelikeParser
+ c_callback_context = <python.PyObject*>self
+ else:
+ c_read_callback = _readFileParser
+ c_callback_context = c_stream
+
+ orig_options = ctxt.options
+ with nogil:
+ if ctxt.html:
+ result = htmlparser.htmlCtxtReadIO(
+ ctxt, c_read_callback, NULL, c_callback_context,
+ self._c_url, c_encoding, options)
+ if result is not NULL:
+ if _fixHtmlDictNames(ctxt.dict, result) < 0:
+ tree.xmlFreeDoc(result)
+ result = NULL
+ else:
+ result = xmlparser.xmlCtxtReadIO(
+ ctxt, c_read_callback, NULL, c_callback_context,
+ self._c_url, c_encoding, options)
+ ctxt.options = orig_options # work around libxml2 problem
+ try:
+ self._close_file()
+ except:
+ self._exc_context._store_raised()
+ finally:
+ return result # swallow any exceptions
+
+ cdef int copyToBuffer(self, char* c_buffer, int c_requested):
+ cdef int c_byte_count = 0
+ cdef char* c_start
+ cdef Py_ssize_t byte_count, remaining
+ if self._bytes_read < 0:
+ return 0
+ try:
+ byte_count = python.PyBytes_GET_SIZE(self._bytes)
+ remaining = byte_count - self._bytes_read
+ while c_requested > remaining:
+ c_start = _cstr(self._bytes) + self._bytes_read
+ cstring_h.memcpy(c_buffer, c_start, remaining)
+ c_byte_count += remaining
+ c_buffer += remaining
+ c_requested -= remaining
+
+ self._bytes = self._filelike.read(c_requested)
+ if not isinstance(self._bytes, bytes):
+ if isinstance(self._bytes, unicode):
+ if self._encoding is None:
+ self._bytes = (<unicode>self._bytes).encode('utf8')
+ else:
+ self._bytes = python.PyUnicode_AsEncodedString(
+ self._bytes, _cstr(self._encoding), NULL)
+ else:
+ self._close_file()
+ raise TypeError, \
+ u"reading from file-like objects must return byte strings or unicode strings"
+
+ remaining = python.PyBytes_GET_SIZE(self._bytes)
+ if remaining == 0:
+ self._bytes_read = -1
+ self._close_file()
+ return c_byte_count
+ self._bytes_read = 0
+
+ if c_requested > 0:
+ c_start = _cstr(self._bytes) + self._bytes_read
+ cstring_h.memcpy(c_buffer, c_start, c_requested)
+ c_byte_count += c_requested
+ self._bytes_read += c_requested
+ except:
+ c_byte_count = -1
+ self._exc_context._store_raised()
+ try:
+ self._close_file()
+ except:
+ self._exc_context._store_raised()
+ finally:
+ return c_byte_count # swallow any exceptions
+
+cdef int _readFilelikeParser(void* ctxt, char* c_buffer, int c_size) with gil:
+ return (<_FileReaderContext>ctxt).copyToBuffer(c_buffer, c_size)
+
+cdef int _readFileParser(void* ctxt, char* c_buffer, int c_size) nogil:
+ return stdio.fread(c_buffer, 1, c_size, <stdio.FILE*>ctxt)
+
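+# A hedged editorial sketch (not part of the module): the file-like objects
+# handled above only need a ``read(size)`` method that returns bytes (text is
+# also accepted and encoded, see copyToBuffer()); ``close()`` is only called
+# when the reader was created with ``close_file=True``. For illustration only:
+#
+#     class ChunkedSource:
+#         def __init__(self, data, chunk=1024):
+#             self._data, self._pos, self._chunk = data, 0, chunk
+#         def read(self, size):
+#             size = min(size, self._chunk)
+#             piece = self._data[self._pos:self._pos + size]
+#             self._pos += len(piece)
+#             return piece
+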
+############################################################
+## support for custom document loaders
+############################################################
+
+cdef xmlparser.xmlParserInput* _local_resolver(const_char* c_url, const_char* c_pubid,
+ xmlparser.xmlParserCtxt* c_context) with gil:
+ cdef _ResolverContext context
+ cdef xmlparser.xmlParserInput* c_input
+ cdef _InputDocument doc_ref
+ cdef _FileReaderContext file_context
+ # if there is no _ParserContext associated with the xmlParserCtxt
+ # passed, check to see if the thread state object has an implied
+ # context.
+ if c_context._private is not NULL:
+ context = <_ResolverContext>c_context._private
+ else:
+ context = __GLOBAL_PARSER_CONTEXT.findImpliedContext()
+
+ if context is None:
+ if __DEFAULT_ENTITY_LOADER is NULL:
+ return NULL
+ with nogil:
+ # free the GIL as we might do serious I/O here (e.g. HTTP)
+ c_input = __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
+ return c_input
+
+ try:
+ if c_url is NULL:
+ url = None
+ else:
+ # parsing a related document (DTD etc.) => UTF-8 encoded URL?
+ url = _decodeFilename(<const_xmlChar*>c_url)
+ if c_pubid is NULL:
+ pubid = None
+ else:
+ pubid = funicode(<const_xmlChar*>c_pubid) # always UTF-8
+
+ doc_ref = context._resolvers.resolve(url, pubid, context)
+ except:
+ context._store_raised()
+ return NULL
+
+ if doc_ref is not None:
+ if doc_ref._type == PARSER_DATA_STRING:
+ data = doc_ref._data_bytes
+ filename = doc_ref._filename
+ if not filename:
+ filename = None
+ elif not isinstance(filename, bytes):
+ # most likely a text URL
+ filename = filename.encode('utf8')
+ if not isinstance(filename, bytes):
+ filename = None
+
+ c_input = xmlparser.xmlNewInputStream(c_context)
+ if c_input is not NULL:
+ if filename is not None:
+ c_input.filename = <char *>tree.xmlStrdup(_xcstr(filename))
+ c_input.base = _xcstr(data)
+ c_input.length = python.PyBytes_GET_SIZE(data)
+ c_input.cur = c_input.base
+ c_input.end = c_input.base + c_input.length
+ elif doc_ref._type == PARSER_DATA_FILENAME:
+ data = None
+ c_filename = _cstr(doc_ref._filename)
+ with nogil:
+ # free the GIL as we might do serious I/O here
+ c_input = xmlparser.xmlNewInputFromFile(
+ c_context, c_filename)
+ elif doc_ref._type == PARSER_DATA_FILE:
+ file_context = _FileReaderContext(doc_ref._file, context, url,
+ None, doc_ref._close_file)
+ c_input = file_context._createParserInput(c_context)
+ data = file_context
+ else:
+ data = None
+ c_input = NULL
+
+ if data is not None:
+ context._storage.add(data)
+ if c_input is not NULL:
+ return c_input
+
+ if __DEFAULT_ENTITY_LOADER is NULL:
+ return NULL
+
+ with nogil:
+ # free the GIL as we might do serious I/O here (e.g. HTTP)
+ c_input = __DEFAULT_ENTITY_LOADER(c_url, c_pubid, c_context)
+ return c_input
+
+cdef xmlparser.xmlExternalEntityLoader __DEFAULT_ENTITY_LOADER
+__DEFAULT_ENTITY_LOADER = xmlparser.xmlGetExternalEntityLoader()
+
+
+cdef xmlparser.xmlExternalEntityLoader _register_document_loader() nogil:
+ cdef xmlparser.xmlExternalEntityLoader old = xmlparser.xmlGetExternalEntityLoader()
+ xmlparser.xmlSetExternalEntityLoader(<xmlparser.xmlExternalEntityLoader>_local_resolver)
+ return old
+
+cdef void _reset_document_loader(xmlparser.xmlExternalEntityLoader old) nogil:
+ xmlparser.xmlSetExternalEntityLoader(old)
+
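+# The entity loader installed above hands resolution over to the Python-level
+# resolver registry of the active parser context. A hedged usage sketch of
+# that public API (the DTD path below is made up, purely for illustration):
+#
+#     from lxml import etree
+#
+#     class LocalDTDResolver(etree.Resolver):
+#         def resolve(self, url, pubid, context):
+#             if url and url.endswith('.dtd'):
+#                 return self.resolve_filename('/usr/share/dtds/local.dtd', context)
+#             return None  # fall back to the default loader
+#
+#     parser = etree.XMLParser(load_dtd=True)
+#     parser.resolvers.add(LocalDTDResolver())
+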
+
+############################################################
+## Parsers
+############################################################
+
+@cython.no_gc_clear # May have to call "self._validator.disconnect()" on dealloc.
+@cython.internal
+cdef class _ParserContext(_ResolverContext):
+ cdef _ErrorLog _error_log
+ cdef _ParserSchemaValidationContext _validator
+ cdef xmlparser.xmlParserCtxt* _c_ctxt
+ cdef xmlparser.xmlExternalEntityLoader _orig_loader
+ cdef python.PyThread_type_lock _lock
+ cdef _Document _doc
+ cdef bint _collect_ids
+
+ def __cinit__(self):
+ self._c_ctxt = NULL
+ self._collect_ids = True
+ if not config.ENABLE_THREADING:
+ self._lock = NULL
+ else:
+ self._lock = python.PyThread_allocate_lock()
+ self._error_log = _ErrorLog()
+
+ def __dealloc__(self):
+ if config.ENABLE_THREADING and self._lock is not NULL:
+ python.PyThread_free_lock(self._lock)
+ self._lock = NULL
+ if self._c_ctxt is not NULL:
+ if <void*>self._validator is not NULL and self._validator is not None:
+ # If the parser was not closed correctly (e.g. interrupted iterparse()),
+ # and the schema validator wasn't freed and cleaned up yet, the libxml2 SAX
+ # validator plug might still be in place, which will make xmlFreeParserCtxt()
+ # crash when trying to xmlFree() a static SAX handler.
+ # Thus, make sure we disconnect the handler interceptor here at the latest.
+ self._validator.disconnect()
+ xmlparser.xmlFreeParserCtxt(self._c_ctxt)
+
+ cdef _ParserContext _copy(self):
+ cdef _ParserContext context
+ context = self.__class__()
+ context._collect_ids = self._collect_ids
+ context._validator = self._validator.copy()
+ _initParserContext(context, self._resolvers._copy(), NULL)
+ return context
+
+ cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt):
+ self._c_ctxt = c_ctxt
+ c_ctxt._private = <void*>self
+
+ cdef void _resetParserContext(self):
+ if self._c_ctxt is not NULL:
+ if self._c_ctxt.html:
+ htmlparser.htmlCtxtReset(self._c_ctxt)
+ self._c_ctxt.disableSAX = 0 # work around bug in libxml2
+ else:
+ xmlparser.xmlClearParserCtxt(self._c_ctxt)
+
+ cdef int prepare(self, bint set_document_loader=True) except -1:
+ cdef int result
+ if config.ENABLE_THREADING and self._lock is not NULL:
+ with nogil:
+ result = python.PyThread_acquire_lock(
+ self._lock, python.WAIT_LOCK)
+ if result == 0:
+ raise ParserError, u"parser locking failed"
+ self._error_log.clear()
+ self._doc = None
+ self._c_ctxt.sax.serror = _receiveParserError
+ self._orig_loader = _register_document_loader() if set_document_loader else NULL
+ if self._validator is not None:
+ self._validator.connect(self._c_ctxt, self._error_log)
+ return 0
+
+ cdef int cleanup(self) except -1:
+ if self._orig_loader is not NULL:
+ _reset_document_loader(self._orig_loader)
+ try:
+ if self._validator is not None:
+ self._validator.disconnect()
+ self._resetParserContext()
+ self.clear()
+ self._doc = None
+ self._c_ctxt.sax.serror = NULL
+ finally:
+ if config.ENABLE_THREADING and self._lock is not NULL:
+ python.PyThread_release_lock(self._lock)
+ return 0
+
+ cdef object _handleParseResult(self, _BaseParser parser,
+ xmlDoc* result, filename):
+ c_doc = self._handleParseResultDoc(parser, result, filename)
+ if self._doc is not None and self._doc._c_doc is c_doc:
+ return self._doc
+ else:
+ return _documentFactory(c_doc, parser)
+
+ cdef xmlDoc* _handleParseResultDoc(self, _BaseParser parser,
+ xmlDoc* result, filename) except NULL:
+ recover = parser._parse_options & xmlparser.XML_PARSE_RECOVER
+ return _handleParseResult(self, self._c_ctxt, result,
+ filename, recover,
+ free_doc=self._doc is None)
+
+cdef _initParserContext(_ParserContext context,
+ _ResolverRegistry resolvers,
+ xmlparser.xmlParserCtxt* c_ctxt):
+ _initResolverContext(context, resolvers)
+ if c_ctxt is not NULL:
+ context._initParserContext(c_ctxt)
+
+cdef void _forwardParserError(xmlparser.xmlParserCtxt* _parser_context, xmlerror.xmlError* error) with gil:
+ (<_ParserContext>_parser_context._private)._error_log._receive(error)
+
+cdef void _receiveParserError(void* c_context, xmlerror.xmlError* error) nogil:
+ if __DEBUG:
+ if c_context is NULL or (<xmlparser.xmlParserCtxt*>c_context)._private is NULL:
+ _forwardError(NULL, error)
+ else:
+ _forwardParserError(<xmlparser.xmlParserCtxt*>c_context, error)
+
+cdef int _raiseParseError(xmlparser.xmlParserCtxt* ctxt, filename,
+ _ErrorLog error_log) except -1:
+ if filename is not None and \
+ ctxt.lastError.domain == xmlerror.XML_FROM_IO:
+ if isinstance(filename, bytes):
+ filename = _decodeFilenameWithLength(
+ <bytes>filename, len(<bytes>filename))
+ if ctxt.lastError.message is not NULL:
+ try:
+ message = ctxt.lastError.message.decode('utf-8')
+ except UnicodeDecodeError:
+ # the filename may be in there => play it safe
+ message = ctxt.lastError.message.decode('iso8859-1')
+ message = f"Error reading file '{filename}': {message.strip()}"
+ else:
+ message = f"Error reading '{filename}'"
+ raise IOError, message
+ elif error_log:
+ raise error_log._buildParseException(
+ XMLSyntaxError, u"Document is not well formed")
+ elif ctxt.lastError.message is not NULL:
+ message = ctxt.lastError.message.strip()
+ code = ctxt.lastError.code
+ line = ctxt.lastError.line
+ column = ctxt.lastError.int2
+ if ctxt.lastError.line > 0:
+ message = f"line {line}: {message}"
+ raise XMLSyntaxError(message, code, line, column, filename)
+ else:
+ raise XMLSyntaxError(None, xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0,
+ filename)
+
+cdef xmlDoc* _handleParseResult(_ParserContext context,
+ xmlparser.xmlParserCtxt* c_ctxt,
+ xmlDoc* result, filename,
+ bint recover, bint free_doc) except NULL:
+ cdef bint well_formed
+ if result is not NULL:
+ __GLOBAL_PARSER_CONTEXT.initDocDict(result)
+
+ if c_ctxt.myDoc is not NULL:
+ if c_ctxt.myDoc is not result:
+ __GLOBAL_PARSER_CONTEXT.initDocDict(c_ctxt.myDoc)
+ tree.xmlFreeDoc(c_ctxt.myDoc)
+ c_ctxt.myDoc = NULL
+
+ if result is not NULL:
+ if (context._validator is not None and
+ not context._validator.isvalid()):
+ well_formed = 0 # actually not 'valid', but anyway ...
+ elif (not c_ctxt.wellFormed and not c_ctxt.html and
+ c_ctxt.charset == tree.XML_CHAR_ENCODING_8859_1 and
+ [1 for error in context._error_log
+ if error.type == ErrorTypes.ERR_INVALID_CHAR]):
+ # An encoding error occurred and libxml2 switched from UTF-8
+ # input to (undecoded) Latin-1, at some arbitrary point in the
+ # document. Better raise an error than allowing for a broken
+ # tree with mixed encodings.
+ well_formed = 0
+ elif recover or (c_ctxt.wellFormed and
+ c_ctxt.lastError.level < xmlerror.XML_ERR_ERROR):
+ well_formed = 1
+ elif not c_ctxt.replaceEntities and not c_ctxt.validate \
+ and context is not None:
+ # in this mode, we ignore errors about undefined entities
+ for error in context._error_log.filter_from_errors():
+ if error.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \
+ error.type != ErrorTypes.ERR_UNDECLARED_ENTITY:
+ well_formed = 0
+ break
+ else:
+ well_formed = 1
+ else:
+ well_formed = 0
+
+ if not well_formed:
+ if free_doc:
+ tree.xmlFreeDoc(result)
+ result = NULL
+
+ if context is not None and context._has_raised():
+ if result is not NULL:
+ if free_doc:
+ tree.xmlFreeDoc(result)
+ result = NULL
+ context._raise_if_stored()
+
+ if result is NULL:
+ if context is not None:
+ _raiseParseError(c_ctxt, filename, context._error_log)
+ else:
+ _raiseParseError(c_ctxt, filename, None)
+ else:
+ if result.URL is NULL and filename is not None:
+ result.URL = tree.xmlStrdup(_xcstr(filename))
+ if result.encoding is NULL:
+ result.encoding = tree.xmlStrdup(<unsigned char*>"UTF-8")
+
+ if context._validator is not None and \
+ context._validator._add_default_attributes:
+ # we currently need to do this here as libxml2 does not
+ # support inserting default attributes during parse-time
+ # validation
+ context._validator.inject_default_attributes(result)
+
+ return result
+
+cdef int _fixHtmlDictNames(tree.xmlDict* c_dict, xmlDoc* c_doc) nogil:
+ cdef xmlNode* c_node
+ if c_doc is NULL:
+ return 0
+ c_node = c_doc.children
+ tree.BEGIN_FOR_EACH_ELEMENT_FROM(<xmlNode*>c_doc, c_node, 1)
+ if c_node.type == tree.XML_ELEMENT_NODE:
+ if _fixHtmlDictNodeNames(c_dict, c_node) < 0:
+ return -1
+ tree.END_FOR_EACH_ELEMENT_FROM(c_node)
+ return 0
+
+cdef int _fixHtmlDictSubtreeNames(tree.xmlDict* c_dict, xmlDoc* c_doc,
+ xmlNode* c_start_node) nogil:
+ """
+ Move names to the dict, iterating in document order, starting at
+ c_start_node. This is used in incremental parsing after each chunk.
+ """
+ cdef xmlNode* c_node
+ if not c_doc:
+ return 0
+ if not c_start_node:
+ return _fixHtmlDictNames(c_dict, c_doc)
+ c_node = c_start_node
+ tree.BEGIN_FOR_EACH_ELEMENT_FROM(<xmlNode*>c_doc, c_node, 1)
+ if c_node.type == tree.XML_ELEMENT_NODE:
+ if _fixHtmlDictNodeNames(c_dict, c_node) < 0:
+ return -1
+ tree.END_FOR_EACH_ELEMENT_FROM(c_node)
+ return 0
+
+cdef inline int _fixHtmlDictNodeNames(tree.xmlDict* c_dict,
+ xmlNode* c_node) nogil:
+ cdef xmlNode* c_attr
+ c_name = tree.xmlDictLookup(c_dict, c_node.name, -1)
+ if c_name is NULL:
+ return -1
+ if c_name is not c_node.name:
+ tree.xmlFree(<char*>c_node.name)
+ c_node.name = c_name
+ c_attr = <xmlNode*>c_node.properties
+ while c_attr is not NULL:
+ c_name = tree.xmlDictLookup(c_dict, c_attr.name, -1)
+ if c_name is NULL:
+ return -1
+ if c_name is not c_attr.name:
+ tree.xmlFree(<char*>c_attr.name)
+ c_attr.name = c_name
+ c_attr = c_attr.next
+ return 0
+
+@cython.internal
+cdef class _BaseParser:
+ cdef ElementClassLookup _class_lookup
+ cdef _ResolverRegistry _resolvers
+ cdef _ParserContext _parser_context
+ cdef _ParserContext _push_parser_context
+ cdef int _parse_options
+ cdef bint _for_html
+ cdef bint _remove_comments
+ cdef bint _remove_pis
+ cdef bint _strip_cdata
+ cdef bint _collect_ids
+ cdef XMLSchema _schema
+ cdef bytes _filename
+ cdef readonly object target
+ cdef object _default_encoding
+ cdef tuple _events_to_collect # (event_types, tag)
+
+ def __init__(self, int parse_options, bint for_html, XMLSchema schema,
+ remove_comments, remove_pis, strip_cdata, collect_ids,
+ target, encoding):
+ cdef tree.xmlCharEncodingHandler* enchandler
+ cdef int c_encoding
+ if not isinstance(self, (XMLParser, HTMLParser)):
+ raise TypeError, u"This class cannot be instantiated"
+
+ self._parse_options = parse_options
+ self.target = target
+ self._for_html = for_html
+ self._remove_comments = remove_comments
+ self._remove_pis = remove_pis
+ self._strip_cdata = strip_cdata
+ self._collect_ids = collect_ids
+ self._schema = schema
+
+ self._resolvers = _ResolverRegistry()
+
+ if encoding is None:
+ self._default_encoding = None
+ else:
+ encoding = _utf8(encoding)
+ enchandler = tree.xmlFindCharEncodingHandler(_cstr(encoding))
+ if enchandler is NULL:
+ raise LookupError, f"unknown encoding: '{encoding}'"
+ tree.xmlCharEncCloseFunc(enchandler)
+ self._default_encoding = encoding
+
+ cdef _setBaseURL(self, base_url):
+ self._filename = _encodeFilename(base_url)
+
+ cdef _collectEvents(self, event_types, tag):
+ if event_types is None:
+ event_types = ()
+ else:
+ event_types = tuple(set(event_types))
+ _buildParseEventFilter(event_types) # purely for validation
+ self._events_to_collect = (event_types, tag)
+
+ cdef _ParserContext _getParserContext(self):
+ cdef xmlparser.xmlParserCtxt* pctxt
+ if self._parser_context is None:
+ self._parser_context = self._createContext(self.target, None)
+ self._parser_context._collect_ids = self._collect_ids
+ if self._schema is not None:
+ self._parser_context._validator = \
+ self._schema._newSaxValidator(
+ self._parse_options & xmlparser.XML_PARSE_DTDATTR)
+ pctxt = self._newParserCtxt()
+ _initParserContext(self._parser_context, self._resolvers, pctxt)
+ self._configureSaxContext(pctxt)
+ return self._parser_context
+
+ cdef _ParserContext _getPushParserContext(self):
+ cdef xmlparser.xmlParserCtxt* pctxt
+ if self._push_parser_context is None:
+ self._push_parser_context = self._createContext(
+ self.target, self._events_to_collect)
+ self._push_parser_context._collect_ids = self._collect_ids
+ if self._schema is not None:
+ self._push_parser_context._validator = \
+ self._schema._newSaxValidator(
+ self._parse_options & xmlparser.XML_PARSE_DTDATTR)
+ pctxt = self._newPushParserCtxt()
+ _initParserContext(
+ self._push_parser_context, self._resolvers, pctxt)
+ self._configureSaxContext(pctxt)
+ return self._push_parser_context
+
+ cdef _ParserContext _createContext(self, target, events_to_collect):
+ cdef _SaxParserContext sax_context
+ if target is not None:
+ sax_context = _TargetParserContext(self)
+ (<_TargetParserContext>sax_context)._setTarget(target)
+ elif events_to_collect:
+ sax_context = _SaxParserContext(self)
+ else:
+ # nothing special to configure
+ return _ParserContext()
+ if events_to_collect:
+ events, tag = events_to_collect
+ sax_context._setEventFilter(events, tag)
+ return sax_context
+
+ @cython.final
+ cdef int _configureSaxContext(self, xmlparser.xmlParserCtxt* pctxt) except -1:
+ if self._remove_comments:
+ pctxt.sax.comment = NULL
+ if self._remove_pis:
+ pctxt.sax.processingInstruction = NULL
+ if self._strip_cdata:
+ # hard switch-off for CDATA nodes => makes them plain text
+ pctxt.sax.cdataBlock = NULL
+
+ cdef int _registerHtmlErrorHandler(self, xmlparser.xmlParserCtxt* c_ctxt) except -1:
+ cdef xmlparser.xmlSAXHandler* sax = c_ctxt.sax
+ if sax is not NULL and sax.initialized and sax.initialized != xmlparser.XML_SAX2_MAGIC:
+ # need to extend SAX1 context to SAX2 to get proper error reports
+ if <xmlparser.xmlSAXHandlerV1*>sax is &htmlparser.htmlDefaultSAXHandler:
+ sax = <xmlparser.xmlSAXHandler*> tree.xmlMalloc(sizeof(xmlparser.xmlSAXHandler))
+ if sax is NULL:
+ raise MemoryError()
+ cstring_h.memcpy(sax, &htmlparser.htmlDefaultSAXHandler,
+ sizeof(htmlparser.htmlDefaultSAXHandler))
+ c_ctxt.sax = sax
+ sax.initialized = xmlparser.XML_SAX2_MAGIC
+ sax.serror = _receiveParserError
+ sax.startElementNs = NULL
+ sax.endElementNs = NULL
+ sax._private = NULL
+ return 0
+
+ cdef xmlparser.xmlParserCtxt* _newParserCtxt(self) except NULL:
+ cdef xmlparser.xmlParserCtxt* c_ctxt
+ if self._for_html:
+ c_ctxt = htmlparser.htmlCreateMemoryParserCtxt('dummy', 5)
+ if c_ctxt is not NULL:
+ self._registerHtmlErrorHandler(c_ctxt)
+ else:
+ c_ctxt = xmlparser.xmlNewParserCtxt()
+ if c_ctxt is NULL:
+ raise MemoryError
+ c_ctxt.sax.startDocument = _initSaxDocument
+ return c_ctxt
+
+ cdef xmlparser.xmlParserCtxt* _newPushParserCtxt(self) except NULL:
+ cdef xmlparser.xmlParserCtxt* c_ctxt
+ cdef char* c_filename = _cstr(self._filename) if self._filename is not None else NULL
+ if self._for_html:
+ c_ctxt = htmlparser.htmlCreatePushParserCtxt(
+ NULL, NULL, NULL, 0, c_filename, tree.XML_CHAR_ENCODING_NONE)
+ if c_ctxt is not NULL:
+ self._registerHtmlErrorHandler(c_ctxt)
+ htmlparser.htmlCtxtUseOptions(c_ctxt, self._parse_options)
+ else:
+ c_ctxt = xmlparser.xmlCreatePushParserCtxt(
+ NULL, NULL, NULL, 0, c_filename)
+ if c_ctxt is not NULL:
+ xmlparser.xmlCtxtUseOptions(c_ctxt, self._parse_options)
+ if c_ctxt is NULL:
+ raise MemoryError()
+ c_ctxt.sax.startDocument = _initSaxDocument
+ return c_ctxt
+
+ @property
+ def error_log(self):
+ """The error log of the last parser run.
+ """
+ cdef _ParserContext context
+ context = self._getParserContext()
+ return context._error_log.copy()
+
+ @property
+ def resolvers(self):
+ """The custom resolver registry of this parser."""
+ return self._resolvers
+
+ @property
+ def version(self):
+ """The version of the underlying XML parser."""
+ return u"libxml2 %d.%d.%d" % LIBXML_VERSION
+
+ def setElementClassLookup(self, ElementClassLookup lookup = None):
+ u":deprecated: use ``parser.set_element_class_lookup(lookup)`` instead."
+ self.set_element_class_lookup(lookup)
+
+ def set_element_class_lookup(self, ElementClassLookup lookup = None):
+ u"""set_element_class_lookup(self, lookup = None)
+
+ Set a lookup scheme for element classes generated from this parser.
+
+ Reset it by passing None or nothing.
+ """
+ self._class_lookup = lookup
+
+ cdef _BaseParser _copy(self):
+ u"Create a new parser with the same configuration."
+ cdef _BaseParser parser
+ parser = self.__class__()
+ parser._parse_options = self._parse_options
+ parser._for_html = self._for_html
+ parser._remove_comments = self._remove_comments
+ parser._remove_pis = self._remove_pis
+ parser._strip_cdata = self._strip_cdata
+ parser._filename = self._filename
+ parser._resolvers = self._resolvers
+ parser.target = self.target
+ parser._class_lookup = self._class_lookup
+ parser._default_encoding = self._default_encoding
+ parser._schema = self._schema
+ parser._events_to_collect = self._events_to_collect
+ return parser
+
+ def copy(self):
+ u"""copy(self)
+
+ Create a new parser with the same configuration.
+ """
+ return self._copy()
+
+ def makeelement(self, _tag, attrib=None, nsmap=None, **_extra):
+ u"""makeelement(self, _tag, attrib=None, nsmap=None, **_extra)
+
+ Creates a new element associated with this parser.
+ """
+ return _makeElement(_tag, NULL, None, self, None, None,
+ attrib, nsmap, _extra)
+
+ # internal parser methods
+
+ cdef xmlDoc* _parseUnicodeDoc(self, utext, char* c_filename) except NULL:
+ u"""Parse unicode document, share dictionary if possible.
+ """
+ cdef _ParserContext context
+ cdef xmlDoc* result
+ cdef xmlparser.xmlParserCtxt* pctxt
+ cdef Py_ssize_t py_buffer_len
+ cdef int buffer_len, c_kind
+ cdef const_char* c_text
+ cdef const_char* c_encoding = _UNICODE_ENCODING
+ cdef bint is_pep393_string = (
+ python.PEP393_ENABLED and python.PyUnicode_IS_READY(utext))
+ if is_pep393_string:
+ c_text = <const_char*>python.PyUnicode_DATA(utext)
+ py_buffer_len = python.PyUnicode_GET_LENGTH(utext)
+ c_kind = python.PyUnicode_KIND(utext)
+ if c_kind == 1:
+ c_encoding = 'ISO-8859-1'
+ elif c_kind == 2:
+ py_buffer_len *= 2
+ if python.PY_BIG_ENDIAN:
+ c_encoding = 'UTF-16BE' # actually UCS-2
+ else:
+ c_encoding = 'UTF-16LE' # actually UCS-2
+ elif c_kind == 4:
+ py_buffer_len *= 4
+ if python.PY_BIG_ENDIAN:
+ c_encoding = 'UCS-4BE'
+ else:
+ c_encoding = 'UCS-4LE'
+ else:
+ assert False, f"Illegal Unicode kind {c_kind}"
+ else:
+ py_buffer_len = python.PyUnicode_GET_DATA_SIZE(utext)
+ c_text = python.PyUnicode_AS_DATA(utext)
+ assert 0 <= py_buffer_len <= limits.INT_MAX
+ buffer_len = py_buffer_len
+
+ context = self._getParserContext()
+ context.prepare()
+ try:
+ pctxt = context._c_ctxt
+ __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
+ orig_options = pctxt.options
+ with nogil:
+ if self._for_html:
+ result = htmlparser.htmlCtxtReadMemory(
+ pctxt, c_text, buffer_len, c_filename, c_encoding,
+ self._parse_options)
+ if result is not NULL:
+ if _fixHtmlDictNames(pctxt.dict, result) < 0:
+ tree.xmlFreeDoc(result)
+ result = NULL
+ else:
+ result = xmlparser.xmlCtxtReadMemory(
+ pctxt, c_text, buffer_len, c_filename, c_encoding,
+ self._parse_options)
+ pctxt.options = orig_options # work around libxml2 problem
+
+ return context._handleParseResultDoc(self, result, None)
+ finally:
+ context.cleanup()
+
+ cdef xmlDoc* _parseDoc(self, char* c_text, int c_len,
+ char* c_filename) except NULL:
+ u"""Parse document, share dictionary if possible.
+ """
+ cdef _ParserContext context
+ cdef xmlDoc* result
+ cdef xmlparser.xmlParserCtxt* pctxt
+ cdef char* c_encoding
+ cdef tree.xmlCharEncoding enc
+ context = self._getParserContext()
+ context.prepare()
+ try:
+ pctxt = context._c_ctxt
+ __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
+
+ if self._default_encoding is None:
+ c_encoding = NULL
+ # libxml2 (at least 2.9.3) does not recognise UTF-32 BOMs
+ # NOTE: limit to problematic cases because it changes character offsets
+ if c_len >= 4 and (c_text[0] == '\xFF' and c_text[1] == '\xFE' and
+ c_text[2] == 0 and c_text[3] == 0):
+ c_encoding = "UTF-32LE"
+ c_text += 4
+ c_len -= 4
+ elif c_len >= 4 and (c_text[0] == 0 and c_text[1] == 0 and
+ c_text[2] == '\xFE' and c_text[3] == '\xFF'):
+ c_encoding = "UTF-32BE"
+ c_text += 4
+ c_len -= 4
+ else:
+ # no BOM => try to determine encoding
+ enc = tree.xmlDetectCharEncoding(<const_xmlChar*>c_text, c_len)
+ if enc == tree.XML_CHAR_ENCODING_UCS4LE:
+ c_encoding = 'UTF-32LE'
+ elif enc == tree.XML_CHAR_ENCODING_UCS4BE:
+ c_encoding = 'UTF-32BE'
+ else:
+ c_encoding = _cstr(self._default_encoding)
+
+ orig_options = pctxt.options
+ with nogil:
+ if self._for_html:
+ result = htmlparser.htmlCtxtReadMemory(
+ pctxt, c_text, c_len, c_filename,
+ c_encoding, self._parse_options)
+ if result is not NULL:
+ if _fixHtmlDictNames(pctxt.dict, result) < 0:
+ tree.xmlFreeDoc(result)
+ result = NULL
+ else:
+ result = xmlparser.xmlCtxtReadMemory(
+ pctxt, c_text, c_len, c_filename,
+ c_encoding, self._parse_options)
+ pctxt.options = orig_options # work around libxml2 problem
+
+ return context._handleParseResultDoc(self, result, None)
+ finally:
+ context.cleanup()
+
+ cdef xmlDoc* _parseDocFromFile(self, char* c_filename) except NULL:
+ cdef _ParserContext context
+ cdef xmlDoc* result
+ cdef xmlparser.xmlParserCtxt* pctxt
+ cdef char* c_encoding
+ result = NULL
+
+ context = self._getParserContext()
+ context.prepare()
+ try:
+ pctxt = context._c_ctxt
+ __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
+
+ if self._default_encoding is None:
+ c_encoding = NULL
+ else:
+ c_encoding = _cstr(self._default_encoding)
+
+ orig_options = pctxt.options
+ with nogil:
+ if self._for_html:
+ result = htmlparser.htmlCtxtReadFile(
+ pctxt, c_filename, c_encoding, self._parse_options)
+ if result is not NULL:
+ if _fixHtmlDictNames(pctxt.dict, result) < 0:
+ tree.xmlFreeDoc(result)
+ result = NULL
+ else:
+ result = xmlparser.xmlCtxtReadFile(
+ pctxt, c_filename, c_encoding, self._parse_options)
+ pctxt.options = orig_options # work around libxml2 problem
+
+ return context._handleParseResultDoc(self, result, c_filename)
+ finally:
+ context.cleanup()
+
+ cdef xmlDoc* _parseDocFromFilelike(self, filelike, filename,
+ encoding) except NULL:
+ cdef _ParserContext context
+ cdef _FileReaderContext file_context
+ cdef xmlDoc* result
+ cdef xmlparser.xmlParserCtxt* pctxt
+ cdef char* c_filename
+ if not filename:
+ filename = None
+
+ context = self._getParserContext()
+ context.prepare()
+ try:
+ pctxt = context._c_ctxt
+ __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
+ file_context = _FileReaderContext(
+ filelike, context, filename,
+ encoding or self._default_encoding)
+ result = file_context._readDoc(pctxt, self._parse_options)
+
+ return context._handleParseResultDoc(
+ self, result, filename)
+ finally:
+ context.cleanup()
+
+
+cdef void _initSaxDocument(void* ctxt) with gil:
+ xmlparser.xmlSAX2StartDocument(ctxt)
+ c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
+ c_doc = c_ctxt.myDoc
+
+ # set up document dict
+ if c_doc and c_ctxt.dict and not c_doc.dict:
+ # I have no idea why libxml2 disables this - we need it
+ c_ctxt.dictNames = 1
+ c_doc.dict = c_ctxt.dict
+ xmlparser.xmlDictReference(c_ctxt.dict)
+
+ # set up XML ID hash table
+ if c_ctxt._private:
+ context = <_ParserContext>c_ctxt._private
+ if context._collect_ids:
+ # keep the global parser dict from filling up with XML IDs
+ if c_doc and not c_doc.ids:
+ # memory errors are not fatal here
+ c_dict = xmlparser.xmlDictCreate()
+ if c_dict:
+ c_doc.ids = tree.xmlHashCreateDict(0, c_dict)
+ xmlparser.xmlDictFree(c_dict)
+ else:
+ c_doc.ids = tree.xmlHashCreate(0)
+ else:
+ c_ctxt.loadsubset |= xmlparser.XML_SKIP_IDS
+ if c_doc and c_doc.ids and not tree.xmlHashSize(c_doc.ids):
+ # already initialised but empty => clear
+ tree.xmlHashFree(c_doc.ids, NULL)
+ c_doc.ids = NULL
+
+
+############################################################
+## ET feed parser
+############################################################
+
+cdef class _FeedParser(_BaseParser):
+ cdef bint _feed_parser_running
+
+ @property
+ def feed_error_log(self):
+ """The error log of the last (or current) run of the feed parser.
+
+ Note that this is local to the feed parser and thus is
+ different from what the ``error_log`` property returns.
+ """
+ return self._getPushParserContext()._error_log.copy()
+
+ cpdef feed(self, data):
+ u"""feed(self, data)
+
+ Feeds data to the parser. The argument should be an 8-bit string
+ buffer containing encoded data, although Unicode is supported as long
+ as both string types are not mixed.
+
+ This is the main entry point to the consumer interface of a
+ parser. The parser will parse as much of the XML stream as it
+ can on each call. To finish parsing or to reset the parser,
+ call the ``close()`` method. Both methods may raise
+ ParseError if errors occur in the input data. If an error is
+ raised, there is no longer a need to call ``close()``.
+
+ The feed parser interface is independent of the normal parser
+ usage. You can use the same parser as a feed parser and in
+ the ``parse()`` function concurrently.
+ """
+ cdef _ParserContext context
+ cdef xmlparser.xmlParserCtxt* pctxt
+ cdef Py_ssize_t py_buffer_len
+ cdef const_char* c_data
+ cdef const_char* c_encoding
+ cdef int buffer_len
+ cdef int error
+ cdef bint recover = self._parse_options & xmlparser.XML_PARSE_RECOVER
+ if isinstance(data, bytes):
+ if self._default_encoding is None:
+ c_encoding = NULL
+ else:
+ c_encoding = self._default_encoding
+ c_data = _cstr(data)
+ py_buffer_len = python.PyBytes_GET_SIZE(data)
+ elif isinstance(data, unicode):
+ if _UNICODE_ENCODING is NULL:
+ raise ParserError, \
+ u"Unicode parsing is not supported on this platform"
+ c_encoding = _UNICODE_ENCODING
+ c_data = python.PyUnicode_AS_DATA(data)
+ py_buffer_len = python.PyUnicode_GET_DATA_SIZE(data)
+ else:
+ raise TypeError, u"Parsing requires string data"
+
+ context = self._getPushParserContext()
+ pctxt = context._c_ctxt
+ error = 0
+ if not self._feed_parser_running:
+ context.prepare(set_document_loader=False)
+ self._feed_parser_running = 1
+ c_filename = (_cstr(self._filename)
+ if self._filename is not None else NULL)
+
+ # We have to give *mlCtxtResetPush() enough input to figure
+ # out the character encoding (at least four bytes),
+ # however if we give it all we got, we'll have nothing for
+ # *mlParseChunk() and things go wrong.
+ buffer_len = 4 if py_buffer_len > 4 else <int>py_buffer_len
+ orig_loader = _register_document_loader()
+ if self._for_html:
+ error = _htmlCtxtResetPush(
+ pctxt, c_data, buffer_len, c_filename, c_encoding,
+ self._parse_options)
+ else:
+ xmlparser.xmlCtxtUseOptions(pctxt, self._parse_options)
+ error = xmlparser.xmlCtxtResetPush(
+ pctxt, c_data, buffer_len, c_filename, c_encoding)
+ _reset_document_loader(orig_loader)
+ py_buffer_len -= buffer_len
+ c_data += buffer_len
+ if error:
+ raise MemoryError()
+ __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt)
+
+ #print pctxt.charset, 'NONE' if c_encoding is NULL else c_encoding
+
+ fixup_error = 0
+ while py_buffer_len > 0 and (error == 0 or recover):
+ with nogil:
+ if py_buffer_len > limits.INT_MAX:
+ buffer_len = limits.INT_MAX
+ else:
+ buffer_len = <int>py_buffer_len
+ if self._for_html:
+ c_node = pctxt.node # last node where the parser stopped
+ orig_loader = _register_document_loader()
+ error = htmlparser.htmlParseChunk(pctxt, c_data, buffer_len, 0)
+ _reset_document_loader(orig_loader)
+ # and now for the fun part: move node names to the dict
+ if pctxt.myDoc:
+ fixup_error = _fixHtmlDictSubtreeNames(
+ pctxt.dict, pctxt.myDoc, c_node)
+ if pctxt.myDoc.dict and pctxt.myDoc.dict is not pctxt.dict:
+ xmlparser.xmlDictFree(pctxt.myDoc.dict)
+ pctxt.myDoc.dict = pctxt.dict
+ xmlparser.xmlDictReference(pctxt.dict)
+ else:
+ orig_loader = _register_document_loader()
+ error = xmlparser.xmlParseChunk(pctxt, c_data, buffer_len, 0)
+ _reset_document_loader(orig_loader)
+ py_buffer_len -= buffer_len
+ c_data += buffer_len
+
+ if fixup_error:
+ context.store_exception(MemoryError())
+
+ if context._has_raised():
+ # propagate Python exceptions immediately
+ recover = 0
+ error = 1
+ break
+
+ if error and not pctxt.replaceEntities and not pctxt.validate:
+ # in this mode, we ignore errors about undefined entities
+ for entry in context._error_log.filter_from_errors():
+ if entry.type != ErrorTypes.WAR_UNDECLARED_ENTITY and \
+ entry.type != ErrorTypes.ERR_UNDECLARED_ENTITY:
+ break
+ else:
+ error = 0
+
+ if not pctxt.wellFormed and pctxt.disableSAX and context._has_raised():
+ # propagate Python exceptions immediately
+ recover = 0
+ error = 1
+
+ if fixup_error or not recover and (error or not pctxt.wellFormed):
+ self._feed_parser_running = 0
+ try:
+ context._handleParseResult(self, pctxt.myDoc, None)
+ finally:
+ context.cleanup()
+
+ cpdef close(self):
+ u"""close(self)
+
+ Terminates feeding data to this parser. This tells the parser to
+ process any remaining data in the feed buffer, and then returns the
+ root Element of the tree that was parsed.
+
+ This method must be called after passing the last chunk of data into
+ the ``feed()`` method. It should only be called when using the feed
+ parser interface; all other usage is undefined.
+ """
+ if not self._feed_parser_running:
+ raise XMLSyntaxError(u"no element found",
+ xmlerror.XML_ERR_INTERNAL_ERROR, 0, 0,
+ self._filename)
+
+ context = self._getPushParserContext()
+ pctxt = context._c_ctxt
+
+ self._feed_parser_running = 0
+ if self._for_html:
+ htmlparser.htmlParseChunk(pctxt, NULL, 0, 1)
+ else:
+ xmlparser.xmlParseChunk(pctxt, NULL, 0, 1)
+
+ if (pctxt.recovery and not pctxt.disableSAX and
+ isinstance(context, _SaxParserContext)):
+ # apply any left-over 'end' events
+ (<_SaxParserContext>context).flushEvents()
+
+ try:
+ result = context._handleParseResult(self, pctxt.myDoc, None)
+ finally:
+ context.cleanup()
+
+ if isinstance(result, _Document):
+ return (<_Document>result).getroot()
+ else:
+ return result
+
+
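+# A hedged usage sketch of the feed interface documented in feed()/close()
+# above (the chunk boundaries are arbitrary, shown for illustration only):
+#
+#     from lxml import etree
+#
+#     parser = etree.XMLParser()
+#     for chunk in (b'<root><a', b'>text</a', b'></root>'):
+#         parser.feed(chunk)
+#     root = parser.close()   # returns the root Element of the parsed tree
+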
+cdef int _htmlCtxtResetPush(xmlparser.xmlParserCtxt* c_ctxt,
+ const_char* c_data, int buffer_len,
+ const_char* c_filename, const_char* c_encoding,
+ int parse_options) except -1:
+ cdef xmlparser.xmlParserInput* c_input_stream
+ # libxml2 lacks an HTML push parser setup function
+ error = xmlparser.xmlCtxtResetPush(
+ c_ctxt, c_data, buffer_len, c_filename, c_encoding)
+ if error:
+ return error
+
+ # fix libxml2 setup for HTML
+ c_ctxt.progressive = 1
+ c_ctxt.html = 1
+ htmlparser.htmlCtxtUseOptions(c_ctxt, parse_options)
+
+ return 0
+
+
+############################################################
+## XML parser
+############################################################
+
+cdef int _XML_DEFAULT_PARSE_OPTIONS
+_XML_DEFAULT_PARSE_OPTIONS = (
+ xmlparser.XML_PARSE_NOENT |
+ xmlparser.XML_PARSE_NOCDATA |
+ xmlparser.XML_PARSE_NONET |
+ xmlparser.XML_PARSE_COMPACT |
+ xmlparser.XML_PARSE_BIG_LINES
+ )
+
+cdef class XMLParser(_FeedParser):
+ u"""XMLParser(self, encoding=None, attribute_defaults=False, dtd_validation=False, load_dtd=False, no_network=True, ns_clean=False, recover=False, schema: XMLSchema =None, huge_tree=False, remove_blank_text=False, resolve_entities=True, remove_comments=False, remove_pis=False, strip_cdata=True, collect_ids=True, target=None, compact=True)
+
+ The XML parser.
+
+ Parsers can be supplied as an additional argument to various parse
+ functions of the lxml API. A default parser is always available
+ and can be replaced by a call to the global function
+ 'set_default_parser'. New parsers can be created at any time
+ without a major run-time overhead.
+
+ The keyword arguments in the constructor are mainly based on the
+ libxml2 parser configuration. A DTD will also be loaded if DTD
+ validation or attribute default values are requested (unless you
+ additionally provide an XMLSchema from which the default
+ attributes can be read).
+
+ Available boolean keyword arguments:
+
+ - attribute_defaults - inject default attributes from DTD or XMLSchema
+ - dtd_validation - validate against a DTD referenced by the document
+ - load_dtd - use DTD for parsing
+ - no_network - prevent network access for related files (default: True)
+ - ns_clean - clean up redundant namespace declarations
+ - recover - try hard to parse through broken XML
+ - remove_blank_text - discard blank text nodes that appear ignorable
+ - remove_comments - discard comments
+ - remove_pis - discard processing instructions
+ - strip_cdata - replace CDATA sections by normal text content (default: True)
+ - compact - save memory for short text content (default: True)
+ - collect_ids - use a hash table of XML IDs for fast access (default: True, always True with DTD validation)
+ - resolve_entities - replace entities by their text value (default: True)
+ - huge_tree - disable security restrictions and support very deep trees
+ and very long text content (only affects libxml2 2.7+)
+
+ Other keyword arguments:
+
+ - encoding - override the document encoding
+ - target - a parser target object that will receive the parse events
+ - schema - an XMLSchema to validate against
+
+ Note that you should avoid sharing parsers between threads. While this is
+ not harmful, it is more efficient to use separate parsers. This does not
+ apply to the default parser.
+ """
+ def __init__(self, *, encoding=None, attribute_defaults=False,
+ dtd_validation=False, load_dtd=False, no_network=True,
+ ns_clean=False, recover=False, XMLSchema schema=None,
+ huge_tree=False, remove_blank_text=False, resolve_entities=True,
+ remove_comments=False, remove_pis=False, strip_cdata=True,
+ collect_ids=True, target=None, compact=True):
+ cdef int parse_options
+ parse_options = _XML_DEFAULT_PARSE_OPTIONS
+ if load_dtd:
+ parse_options = parse_options | xmlparser.XML_PARSE_DTDLOAD
+ if dtd_validation:
+ parse_options = parse_options | xmlparser.XML_PARSE_DTDVALID | \
+ xmlparser.XML_PARSE_DTDLOAD
+ if attribute_defaults:
+ parse_options = parse_options | xmlparser.XML_PARSE_DTDATTR
+ if schema is None:
+ parse_options = parse_options | xmlparser.XML_PARSE_DTDLOAD
+ if ns_clean:
+ parse_options = parse_options | xmlparser.XML_PARSE_NSCLEAN
+ if recover:
+ parse_options = parse_options | xmlparser.XML_PARSE_RECOVER
+ if remove_blank_text:
+ parse_options = parse_options | xmlparser.XML_PARSE_NOBLANKS
+ if huge_tree:
+ parse_options = parse_options | xmlparser.XML_PARSE_HUGE
+ if not no_network:
+ parse_options = parse_options ^ xmlparser.XML_PARSE_NONET
+ if not compact:
+ parse_options = parse_options ^ xmlparser.XML_PARSE_COMPACT
+ if not resolve_entities:
+ parse_options = parse_options ^ xmlparser.XML_PARSE_NOENT
+ if not strip_cdata:
+ parse_options = parse_options ^ xmlparser.XML_PARSE_NOCDATA
+
+ _BaseParser.__init__(self, parse_options, 0, schema,
+ remove_comments, remove_pis, strip_cdata,
+ collect_ids, target, encoding)
+
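+# A short, hedged example of the keyword options described in the XMLParser
+# docstring above (illustrative only; the XML snippet is made up):
+#
+#     from lxml import etree
+#
+#     parser = etree.XMLParser(remove_blank_text=True, resolve_entities=False)
+#     root = etree.fromstring(b'<doc>\n  <item/>\n</doc>', parser)
+#     # ignorable whitespace-only text nodes have been dropped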
+
+cdef class XMLPullParser(XMLParser):
+ """XMLPullParser(self, events=None, *, tag=None, base_url=None, **kwargs)
+
+ XML parser that collects parse events in an iterator.
+
+ The collected events are the same as for iterparse(), but the
+ parser itself is non-blocking in the sense that it receives
+ data chunks incrementally through its .feed() method, instead
+ of reading them directly from a file(-like) object all by itself.
+
+ By default, it collects Element end events. To change that,
+ pass any subset of the available events into the ``events``
+ argument: ``'start'``, ``'end'``, ``'start-ns'``,
+ ``'end-ns'``, ``'comment'``, ``'pi'``.
+
+ To support loading external dependencies relative to the input
+ source, you can pass the ``base_url``.
+ """
+ def __init__(self, events=None, *, tag=None, base_url=None, **kwargs):
+ XMLParser.__init__(self, **kwargs)
+ if events is None:
+ events = ('end',)
+ self._setBaseURL(base_url)
+ self._collectEvents(events, tag)
+
+ def read_events(self):
+ return (<_SaxParserContext?>self._getPushParserContext()).events_iterator
+
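+# A hedged sketch of the pull-parser pattern described in the docstring above
+# (event names follow the documented set; the input is made up):
+#
+#     from lxml import etree
+#
+#     pull = etree.XMLPullParser(events=('start', 'end'))
+#     pull.feed(b'<root><child>data</child></root>')
+#     for event, element in pull.read_events():
+#         print(event, element.tag)
+#     root = pull.close()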
+
+cdef class ETCompatXMLParser(XMLParser):
+ u"""ETCompatXMLParser(self, encoding=None, attribute_defaults=False, \
+ dtd_validation=False, load_dtd=False, no_network=True, \
+ ns_clean=False, recover=False, schema=None, \
+ huge_tree=False, remove_blank_text=False, resolve_entities=True, \
+ remove_comments=True, remove_pis=True, strip_cdata=True, \
+ target=None, compact=True)
+
+ An XML parser with an ElementTree compatible default setup.
+
+ See the XMLParser class for details.
+
+ This parser has ``remove_comments`` and ``remove_pis`` enabled by default
+ and thus ignores comments and processing instructions.
+ """
+ def __init__(self, *, encoding=None, attribute_defaults=False,
+ dtd_validation=False, load_dtd=False, no_network=True,
+ ns_clean=False, recover=False, schema=None,
+ huge_tree=False, remove_blank_text=False, resolve_entities=True,
+ remove_comments=True, remove_pis=True, strip_cdata=True,
+ target=None, compact=True):
+ XMLParser.__init__(self,
+ attribute_defaults=attribute_defaults,
+ dtd_validation=dtd_validation,
+ load_dtd=load_dtd,
+ no_network=no_network,
+ ns_clean=ns_clean,
+ recover=recover,
+ remove_blank_text=remove_blank_text,
+ huge_tree=huge_tree,
+ compact=compact,
+ resolve_entities=resolve_entities,
+ remove_comments=remove_comments,
+ remove_pis=remove_pis,
+ strip_cdata=strip_cdata,
+ target=target,
+ encoding=encoding,
+ schema=schema)
+
+# ET 1.2 compatible name
+XMLTreeBuilder = ETCompatXMLParser
+
+
+cdef XMLParser __DEFAULT_XML_PARSER
+__DEFAULT_XML_PARSER = XMLParser()
+
+__GLOBAL_PARSER_CONTEXT.setDefaultParser(__DEFAULT_XML_PARSER)
+
+def set_default_parser(_BaseParser parser=None):
+ u"""set_default_parser(parser=None)
+
+ Set a default parser for the current thread. This parser is used
+ globally whenever no parser is supplied to the various parse functions of
+ the lxml API. If this function is called without a parser (or if it is
+ None), the default parser is reset to the original configuration.
+
+ Note that the pre-installed default parser is not thread-safe. Avoid the
+ default parser in multi-threaded environments. You can create a separate
+ parser for each thread explicitly or use a parser pool.
+ """
+ if parser is None:
+ parser = __DEFAULT_XML_PARSER
+ __GLOBAL_PARSER_CONTEXT.setDefaultParser(parser)
+
+def get_default_parser():
+ u"get_default_parser()"
+ return __GLOBAL_PARSER_CONTEXT.getDefaultParser()
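+
+# A hedged example of swapping the thread-local default parser described
+# above (the chosen option is illustrative only):
+#
+#     from lxml import etree
+#
+#     parser = etree.XMLParser(remove_comments=True)
+#     etree.set_default_parser(parser)
+#     root = etree.fromstring(b'<doc><!-- dropped --></doc>')
+#     etree.set_default_parser()   # reset to the original default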
+
+############################################################
+## HTML parser
+############################################################
+
+cdef int _HTML_DEFAULT_PARSE_OPTIONS
+_HTML_DEFAULT_PARSE_OPTIONS = (
+ htmlparser.HTML_PARSE_RECOVER |
+ htmlparser.HTML_PARSE_NONET |
+ htmlparser.HTML_PARSE_COMPACT
+ )
+
+cdef class HTMLParser(_FeedParser):
+ u"""HTMLParser(self, encoding=None, remove_blank_text=False, \
+ remove_comments=False, remove_pis=False, strip_cdata=True, \
+ no_network=True, target=None, schema: XMLSchema =None, \
+ recover=True, compact=True, collect_ids=True, huge_tree=False)
+
+ The HTML parser.
+
+ This parser allows reading HTML into a normal XML tree. By
+ default, it can read broken (non well-formed) HTML, depending on
+ the capabilities of libxml2. Use the 'recover' option to switch
+ this off.
+
+ Available boolean keyword arguments:
+
+ - recover - try hard to parse through broken HTML (default: True)
+ - no_network - prevent network access for related files (default: True)
+ - remove_blank_text - discard empty text nodes that are ignorable (i.e. not actual text content)
+ - remove_comments - discard comments
+ - remove_pis - discard processing instructions
+ - strip_cdata - replace CDATA sections by normal text content (default: True)
+ - compact - save memory for short text content (default: True)
+ - default_doctype - add a default doctype even if it is not found in the HTML (default: True)
+ - collect_ids - use a hash table of XML IDs for fast access (default: True)
+ - huge_tree - disable security restrictions and support very deep trees
+ and very long text content (only affects libxml2 2.7+)
+
+ Other keyword arguments:
+
+ - encoding - override the document encoding
+ - target - a parser target object that will receive the parse events
+ - schema - an XMLSchema to validate against
+
+ Note that you should avoid sharing parsers between threads for performance
+ reasons.
+ """
+ def __init__(self, *, encoding=None, remove_blank_text=False,
+ remove_comments=False, remove_pis=False, strip_cdata=True,
+ no_network=True, target=None, XMLSchema schema=None,
+ recover=True, compact=True, default_doctype=True,
+ collect_ids=True, huge_tree=False):
+ cdef int parse_options
+ parse_options = _HTML_DEFAULT_PARSE_OPTIONS
+ if remove_blank_text:
+ parse_options = parse_options | htmlparser.HTML_PARSE_NOBLANKS
+ if not recover:
+ parse_options = parse_options ^ htmlparser.HTML_PARSE_RECOVER
+ if not no_network:
+ parse_options = parse_options ^ htmlparser.HTML_PARSE_NONET
+ if not compact:
+ parse_options = parse_options ^ htmlparser.HTML_PARSE_COMPACT
+ if not default_doctype:
+ parse_options = parse_options ^ htmlparser.HTML_PARSE_NODEFDTD
+ if huge_tree:
+ parse_options = parse_options | xmlparser.XML_PARSE_HUGE
+
+ _BaseParser.__init__(self, parse_options, 1, schema,
+ remove_comments, remove_pis, strip_cdata,
+ collect_ids, target, encoding)
+
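+# A hedged example of the HTML parser described above (the broken markup is
+# made up; recovery is enabled by default):
+#
+#     from lxml import etree
+#
+#     parser = etree.HTMLParser(remove_comments=True)
+#     root = etree.fromstring(b'<p>unclosed <b>tag', parser)
+#     # libxml2's recovery mode completes this into a full <html> tree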
+
+cdef HTMLParser __DEFAULT_HTML_PARSER
+__DEFAULT_HTML_PARSER = HTMLParser()
+
+
+cdef class HTMLPullParser(HTMLParser):
+ """HTMLPullParser(self, events=None, *, tag=None, base_url=None, **kwargs)
+
+ HTML parser that collects parse events in an iterator.
+
+ The collected events are the same as for iterparse(), but the
+ parser itself is non-blocking in the sense that it receives
+ data chunks incrementally through its .feed() method, instead
+ of reading them directly from a file(-like) object all by itself.
+
+ By default, it collects Element end events. To change that,
+ pass any subset of the available events into the ``events``
+ argument: ``'start'``, ``'end'``, ``'start-ns'``,
+ ``'end-ns'``, ``'comment'``, ``'pi'``.
+
+ To support loading external dependencies relative to the input
+ source, you can pass the ``base_url``.
+ """
+ def __init__(self, events=None, *, tag=None, base_url=None, **kwargs):
+ HTMLParser.__init__(self, **kwargs)
+ if events is None:
+ events = ('end',)
+ self._setBaseURL(base_url)
+ self._collectEvents(events, tag)
+
+ def read_events(self):
+ return (<_SaxParserContext?>self._getPushParserContext()).events_iterator
+
+
+############################################################
+## helper functions for document creation
+############################################################
+
+cdef xmlDoc* _parseDoc(text, filename, _BaseParser parser) except NULL:
+ cdef char* c_filename
+ cdef char* c_text
+ cdef Py_ssize_t c_len
+ cdef bint is_pep393_string
+ if parser is None:
+ parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
+ if not filename:
+ c_filename = NULL
+ else:
+ filename_utf = _encodeFilenameUTF8(filename)
+ c_filename = _cstr(filename_utf)
+ if isinstance(text, unicode):
+ is_pep393_string = (
+ python.PEP393_ENABLED and python.PyUnicode_IS_READY(text))
+ if is_pep393_string:
+ c_len = python.PyUnicode_GET_LENGTH(text) * python.PyUnicode_KIND(text)
+ else:
+ c_len = python.PyUnicode_GET_DATA_SIZE(text)
+ if c_len > limits.INT_MAX:
+ return (<_BaseParser>parser)._parseDocFromFilelike(
+ StringIO(text), filename, None)
+ if _UNICODE_ENCODING is NULL and not is_pep393_string:
+ text = (<unicode>text).encode('utf8')
+ return (<_BaseParser>parser)._parseDocFromFilelike(
+ BytesIO(text), filename, "UTF-8")
+ return (<_BaseParser>parser)._parseUnicodeDoc(text, c_filename)
+ else:
+ c_len = python.PyBytes_GET_SIZE(text)
+ if c_len > limits.INT_MAX:
+ return (<_BaseParser>parser)._parseDocFromFilelike(
+ BytesIO(text), filename, None)
+ c_text = _cstr(text)
+ return (<_BaseParser>parser)._parseDoc(c_text, c_len, c_filename)
+
+cdef xmlDoc* _parseDocFromFile(filename8, _BaseParser parser) except NULL:
+ if parser is None:
+ parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
+ return (<_BaseParser>parser)._parseDocFromFile(_cstr(filename8))
+
+cdef xmlDoc* _parseDocFromFilelike(source, filename,
+ _BaseParser parser) except NULL:
+ if parser is None:
+ parser = __GLOBAL_PARSER_CONTEXT.getDefaultParser()
+ return (<_BaseParser>parser)._parseDocFromFilelike(source, filename, None)
+
+cdef xmlDoc* _newXMLDoc() except NULL:
+ cdef xmlDoc* result
+ result = tree.xmlNewDoc(NULL)
+ if result is NULL:
+ raise MemoryError()
+ if result.encoding is NULL:
+ result.encoding = tree.xmlStrdup(<unsigned char*>"UTF-8")
+ __GLOBAL_PARSER_CONTEXT.initDocDict(result)
+ return result
+
+cdef xmlDoc* _newHTMLDoc() except NULL:
+ cdef xmlDoc* result
+ result = tree.htmlNewDoc(NULL, NULL)
+ if result is NULL:
+ raise MemoryError()
+ __GLOBAL_PARSER_CONTEXT.initDocDict(result)
+ return result
+
+cdef xmlDoc* _copyDoc(xmlDoc* c_doc, int recursive) except NULL:
+ cdef xmlDoc* result
+ if recursive:
+ with nogil:
+ result = tree.xmlCopyDoc(c_doc, recursive)
+ else:
+ result = tree.xmlCopyDoc(c_doc, 0)
+ if result is NULL:
+ raise MemoryError()
+ __GLOBAL_PARSER_CONTEXT.initDocDict(result)
+ return result
+
+cdef xmlDoc* _copyDocRoot(xmlDoc* c_doc, xmlNode* c_new_root) except NULL:
+ u"Recursively copy the document and make c_new_root the new root node."
+ cdef xmlDoc* result
+ cdef xmlNode* c_node
+ result = tree.xmlCopyDoc(c_doc, 0) # non recursive
+ __GLOBAL_PARSER_CONTEXT.initDocDict(result)
+ with nogil:
+ c_node = tree.xmlDocCopyNode(c_new_root, result, 1) # recursive
+ if c_node is NULL:
+ raise MemoryError()
+ tree.xmlDocSetRootElement(result, c_node)
+ _copyTail(c_new_root.next, c_node)
+ return result
+
+cdef xmlNode* _copyNodeToDoc(xmlNode* c_node, xmlDoc* c_doc) except NULL:
+ u"Recursively copy the element into the document. c_doc is not modified."
+ cdef xmlNode* c_root
+ c_root = tree.xmlDocCopyNode(c_node, c_doc, 1) # recursive
+ if c_root is NULL:
+ raise MemoryError()
+ _copyTail(c_node.next, c_root)
+ return c_root
+
+
+############################################################
+## API level helper functions for _Document creation
+############################################################
+
+cdef _Document _parseDocument(source, _BaseParser parser, base_url):
+ cdef _Document doc
+ if _isString(source):
+ # parse the file directly from the filesystem
+ doc = _parseDocumentFromURL(_encodeFilename(source), parser)
+ # fix base URL if requested
+ if base_url is not None:
+ base_url = _encodeFilenameUTF8(base_url)
+ if doc._c_doc.URL is not NULL:
+ tree.xmlFree(<char*>doc._c_doc.URL)
+ doc._c_doc.URL = tree.xmlStrdup(_xcstr(base_url))
+ return doc
+
+ if base_url is not None:
+ url = base_url
+ else:
+ url = _getFilenameForFile(source)
+
+ if hasattr(source, u'getvalue') and hasattr(source, u'tell'):
+ # StringIO - reading from start?
+ if source.tell() == 0:
+ return _parseMemoryDocument(source.getvalue(), url, parser)
+
+ # Support for file-like objects (urlgrabber.urlopen, ...)
+ if hasattr(source, u'read'):
+ return _parseFilelikeDocument(source, url, parser)
+
+ raise TypeError, f"cannot parse from '{python._fqtypename(source).decode('UTF-8')}'"
+
+cdef _Document _parseDocumentFromURL(url, _BaseParser parser):
+ c_doc = _parseDocFromFile(url, parser)
+ return _documentFactory(c_doc, parser)
+
+cdef _Document _parseMemoryDocument(text, url, _BaseParser parser):
+ if isinstance(text, unicode):
+ if _hasEncodingDeclaration(text):
+ raise ValueError(
+ u"Unicode strings with encoding declaration are not supported. "
+ u"Please use bytes input or XML fragments without declaration.")
+ elif not isinstance(text, bytes):
+ raise ValueError, u"can only parse strings"
+ c_doc = _parseDoc(text, url, parser)
+ return _documentFactory(c_doc, parser)
+
+cdef _Document _parseFilelikeDocument(source, url, _BaseParser parser):
+ c_doc = _parseDocFromFilelike(source, url, parser)
+ return _documentFactory(c_doc, parser)
diff --git a/src/lxml/parsertarget.pxi b/src/lxml/parsertarget.pxi
new file mode 100644
index 0000000..941e032
--- /dev/null
+++ b/src/lxml/parsertarget.pxi
@@ -0,0 +1,194 @@
+# Parser target context (ET target interface)
+
+cdef object inspect_getargspec
+try:
+ from inspect import getfullargspec as inspect_getargspec
+except ImportError:
+ from inspect import getargspec as inspect_getargspec
+
+
+class _TargetParserResult(Exception):
+ # Admittedly, this is somewhat ugly, but it's the easiest way
+ # to push the Python level parser result through the parser
+ # machinery towards the API level functions
+ def __init__(self, result):
+ self.result = result
+
+
+@cython.final
+@cython.internal
+cdef class _PythonSaxParserTarget(_SaxParserTarget):
+ cdef object _target_start
+ cdef object _target_end
+ cdef object _target_data
+ cdef object _target_start_ns
+ cdef object _target_end_ns
+ cdef object _target_doctype
+ cdef object _target_pi
+ cdef object _target_comment
+ cdef bint _start_takes_nsmap
+
+ def __cinit__(self, target):
+ cdef int event_filter
+ event_filter = 0
+ self._start_takes_nsmap = 0
+ try:
+ self._target_start = target.start
+ if self._target_start is not None:
+ event_filter |= SAX_EVENT_START
+ except AttributeError:
+ pass
+ else:
+ try:
+ arguments = inspect_getargspec(self._target_start)
+ if len(arguments[0]) > 3 or arguments[1] is not None:
+ self._start_takes_nsmap = 1
+ except TypeError:
+ pass
+ try:
+ self._target_end = target.end
+ if self._target_end is not None:
+ event_filter |= SAX_EVENT_END
+ except AttributeError:
+ pass
+ try:
+ self._target_start_ns = target.start_ns
+ if self._target_start_ns is not None:
+ event_filter |= SAX_EVENT_START_NS
+ except AttributeError:
+ pass
+ try:
+ self._target_end_ns = target.end_ns
+ if self._target_end_ns is not None:
+ event_filter |= SAX_EVENT_END_NS
+ except AttributeError:
+ pass
+ try:
+ self._target_data = target.data
+ if self._target_data is not None:
+ event_filter |= SAX_EVENT_DATA
+ except AttributeError:
+ pass
+ try:
+ self._target_doctype = target.doctype
+ if self._target_doctype is not None:
+ event_filter |= SAX_EVENT_DOCTYPE
+ except AttributeError:
+ pass
+ try:
+ self._target_pi = target.pi
+ if self._target_pi is not None:
+ event_filter |= SAX_EVENT_PI
+ except AttributeError:
+ pass
+ try:
+ self._target_comment = target.comment
+ if self._target_comment is not None:
+ event_filter |= SAX_EVENT_COMMENT
+ except AttributeError:
+ pass
+ self._sax_event_filter = event_filter
+
+ cdef _handleSaxStart(self, tag, attrib, nsmap):
+ if self._start_takes_nsmap:
+ return self._target_start(tag, attrib, nsmap)
+ else:
+ return self._target_start(tag, attrib)
+
+ cdef _handleSaxEnd(self, tag):
+ return self._target_end(tag)
+
+ cdef _handleSaxStartNs(self, prefix, uri):
+ return self._target_start_ns(prefix, uri)
+
+ cdef _handleSaxEndNs(self, prefix):
+ return self._target_end_ns(prefix)
+
+ cdef int _handleSaxData(self, data) except -1:
+ self._target_data(data)
+
+ cdef int _handleSaxDoctype(self, root_tag, public_id, system_id) except -1:
+ self._target_doctype(root_tag, public_id, system_id)
+
+ cdef _handleSaxPi(self, target, data):
+ return self._target_pi(target, data)
+
+ cdef _handleSaxComment(self, comment):
+ return self._target_comment(comment)
+
+
+@cython.final
+@cython.internal
+@cython.no_gc_clear # Required because parent class uses it - Cython bug.
+cdef class _TargetParserContext(_SaxParserContext):
+ u"""This class maps SAX2 events to the ET parser target interface.
+ """
+ cdef object _python_target
+ cdef int _setTarget(self, target) except -1:
+ self._python_target = target
+ if not isinstance(target, _SaxParserTarget) or \
+ hasattr(target, u'__dict__'):
+ target = _PythonSaxParserTarget(target)
+ self._setSaxParserTarget(target)
+ return 0
+
+ cdef _ParserContext _copy(self):
+ cdef _TargetParserContext context
+ context = _ParserContext._copy(self)
+ context._setTarget(self._python_target)
+ return context
+
+ cdef void _cleanupTargetParserContext(self, xmlDoc* result):
+ if self._c_ctxt.myDoc is not NULL:
+ if self._c_ctxt.myDoc is not result and \
+ self._c_ctxt.myDoc._private is NULL:
+ # no _Document proxy => orphan
+ tree.xmlFreeDoc(self._c_ctxt.myDoc)
+ self._c_ctxt.myDoc = NULL
+
+ cdef object _handleParseResult(self, _BaseParser parser, xmlDoc* result,
+ filename):
+ cdef bint recover
+ recover = parser._parse_options & xmlparser.XML_PARSE_RECOVER
+ try:
+ if self._has_raised():
+ self._cleanupTargetParserContext(result)
+ self._raise_if_stored()
+ if not self._c_ctxt.wellFormed and not recover:
+ _raiseParseError(self._c_ctxt, filename, self._error_log)
+ except:
+ if python.IS_PYTHON2:
+ exc = sys.exc_info()
+ # Python 2 can't chain exceptions
+ try: self._python_target.close()
+ except: pass
+ raise exc[0], exc[1], exc[2]
+ else:
+ self._python_target.close()
+ raise
+ return self._python_target.close()
+
+ cdef xmlDoc* _handleParseResultDoc(self, _BaseParser parser,
+ xmlDoc* result, filename) except NULL:
+ cdef bint recover
+ recover = parser._parse_options & xmlparser.XML_PARSE_RECOVER
+ if result is not NULL and result._private is NULL:
+ # no _Document proxy => orphan
+ tree.xmlFreeDoc(result)
+ try:
+ self._cleanupTargetParserContext(result)
+ self._raise_if_stored()
+ if not self._c_ctxt.wellFormed and not recover:
+ _raiseParseError(self._c_ctxt, filename, self._error_log)
+ except:
+ if python.IS_PYTHON2:
+ exc = sys.exc_info()
+ # Python 2 can't chain exceptions
+ try: self._python_target.close()
+ except: pass
+ raise exc[0], exc[1], exc[2]
+ else:
+ self._python_target.close()
+ raise
+ parse_result = self._python_target.close()
+ raise _TargetParserResult(parse_result)
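
The two context classes above implement the ElementTree "target" parser interface: _PythonSaxParserTarget forwards the SAX callbacks to whatever start/end/data/comment/pi/close methods the user's target object defines, and _TargetParserResult carries the return value of close() back out as the result of the parse call. A small usage sketch against the public API:

    from lxml import etree

    class EchoTarget:
        def start(self, tag, attrib):
            print("start", tag, dict(attrib))
        def end(self, tag):
            print("end", tag)
        def data(self, text):
            print("data", repr(text))
        def comment(self, text):
            print("comment", text)
        def close(self):
            # the return value of close() becomes the parse result
            return "closed!"

    parser = etree.XMLParser(target=EchoTarget())
    result = etree.XML(b"<root>hi<!-- note --></root>", parser)
    print(result)   # -> closed!
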
diff --git a/src/lxml/proxy.pxi b/src/lxml/proxy.pxi
new file mode 100644
index 0000000..3c6e306
--- /dev/null
+++ b/src/lxml/proxy.pxi
@@ -0,0 +1,619 @@
+# Proxy functions and low level node allocation stuff
+
+# Proxies represent elements, their reference is stored in the C
+# structure of the respective node to avoid multiple instantiation of
+# the Python class.
+
+@cython.linetrace(False)
+@cython.profile(False)
+cdef inline _Element getProxy(xmlNode* c_node):
+ u"""Get a proxy for a given node.
+ """
+ #print "getProxy for:", <int>c_node
+ if c_node is not NULL and c_node._private is not NULL:
+ return <_Element>c_node._private
+ else:
+ return None
+
+
+@cython.linetrace(False)
+@cython.profile(False)
+cdef inline bint hasProxy(xmlNode* c_node):
+ if c_node._private is NULL:
+ return False
+ return True
+
+
+@cython.linetrace(False)
+@cython.profile(False)
+cdef inline int _registerProxy(_Element proxy, _Document doc,
+ xmlNode* c_node) except -1:
+ u"""Register a proxy and type for the node it's proxying for.
+ """
+ #print "registering for:", <int>proxy._c_node
+ assert not hasProxy(c_node), u"double registering proxy!"
+ proxy._doc = doc
+ proxy._c_node = c_node
+ c_node._private = <void*>proxy
+ return 0
+
+
+@cython.linetrace(False)
+@cython.profile(False)
+cdef inline int _unregisterProxy(_Element proxy) except -1:
+ u"""Unregister a proxy for the node it's proxying for.
+ """
+ cdef xmlNode* c_node = proxy._c_node
+ assert c_node._private is <void*>proxy, u"Tried to unregister unknown proxy"
+ c_node._private = NULL
+ return 0
+
+
+################################################################################
+# temporarily make a node the root node of its document
+
+cdef xmlDoc* _fakeRootDoc(xmlDoc* c_base_doc, xmlNode* c_node) except NULL:
+ return _plainFakeRootDoc(c_base_doc, c_node, 1)
+
+cdef xmlDoc* _plainFakeRootDoc(xmlDoc* c_base_doc, xmlNode* c_node,
+ bint with_siblings) except NULL:
+ # build a temporary document that has the given node as root node
+ # note that copy and original must not be modified during its lifetime!!
+ # always call _destroyFakeDoc() after use!
+ cdef xmlNode* c_child
+ cdef xmlNode* c_root
+ cdef xmlNode* c_new_root
+ cdef xmlDoc* c_doc
+ if with_siblings or (c_node.prev is NULL and c_node.next is NULL):
+ c_root = tree.xmlDocGetRootElement(c_base_doc)
+ if c_root is c_node:
+ # already the root node, no siblings
+ return c_base_doc
+
+ c_doc = _copyDoc(c_base_doc, 0) # non recursive!
+ c_new_root = tree.xmlDocCopyNode(c_node, c_doc, 2) # non recursive!
+ tree.xmlDocSetRootElement(c_doc, c_new_root)
+ _copyParentNamespaces(c_node, c_new_root)
+
+ c_new_root.children = c_node.children
+ c_new_root.last = c_node.last
+ c_new_root.next = c_new_root.prev = NULL
+
+ # store original node
+ c_doc._private = c_node
+
+ # divert parent pointers of children
+ c_child = c_new_root.children
+ while c_child is not NULL:
+ c_child.parent = c_new_root
+ c_child = c_child.next
+
+ c_doc.children = c_new_root
+ return c_doc
+
+cdef void _destroyFakeDoc(xmlDoc* c_base_doc, xmlDoc* c_doc):
+ # delete a temporary document
+ cdef xmlNode* c_child
+ cdef xmlNode* c_parent
+ cdef xmlNode* c_root
+ if c_doc is c_base_doc:
+ return
+ c_root = tree.xmlDocGetRootElement(c_doc)
+
+ # restore parent pointers of children
+ c_parent = <xmlNode*>c_doc._private
+ c_child = c_root.children
+ while c_child is not NULL:
+ c_child.parent = c_parent
+ c_child = c_child.next
+
+ # prevent recursive removal of children
+ c_root.children = c_root.last = NULL
+ tree.xmlFreeDoc(c_doc)
+
+cdef _Element _fakeDocElementFactory(_Document doc, xmlNode* c_element):
+ u"""Special element factory for cases where we need to create a fake
+ root document, but still need to instantiate arbitrary nodes from
+ it. If we instantiate the fake root node, things will turn bad
+ when it's destroyed.
+
+ Instead, if we are asked to instantiate the fake root node, we
+ instantiate the original node instead.
+ """
+ if c_element.doc is not doc._c_doc:
+ if c_element.doc._private is not NULL:
+ if c_element is c_element.doc.children:
+ c_element = <xmlNode*>c_element.doc._private
+ #assert c_element.type == tree.XML_ELEMENT_NODE
+ return _elementFactory(doc, c_element)
+
+################################################################################
+# support for freeing tree elements when proxy objects are destroyed
+
+cdef int attemptDeallocation(xmlNode* c_node):
+ u"""Attempt deallocation of c_node (or higher up in tree).
+ """
+ cdef xmlNode* c_top
+ # could be we actually aren't referring to the tree at all
+ if c_node is NULL:
+ #print "not freeing, node is NULL"
+ return 0
+ c_top = getDeallocationTop(c_node)
+ if c_top is not NULL:
+ #print "freeing:", c_top.name
+ _removeText(c_top.next) # tail
+ tree.xmlFreeNode(c_top)
+ return 1
+ return 0
+
+cdef xmlNode* getDeallocationTop(xmlNode* c_node):
+ u"""Return the top of the tree that can be deallocated, or NULL.
+ """
+ cdef xmlNode* c_next
+ #print "trying to do deallocating:", c_node.type
+ if hasProxy(c_node):
+ #print "Not freeing: proxies still exist"
+ return NULL
+ while c_node.parent is not NULL:
+ c_node = c_node.parent
+ #print "checking:", c_current.type
+ if c_node.type == tree.XML_DOCUMENT_NODE or \
+ c_node.type == tree.XML_HTML_DOCUMENT_NODE:
+ #print "not freeing: still in doc"
+ return NULL
+ # if we're still attached to the document, don't deallocate
+ if hasProxy(c_node):
+ #print "Not freeing: proxies still exist"
+ return NULL
+ # see whether we have children to deallocate
+ if not canDeallocateChildNodes(c_node):
+ return NULL
+ # see whether we have siblings to deallocate
+ c_next = c_node.prev
+ while c_next:
+ if _isElement(c_next):
+ if hasProxy(c_next) or not canDeallocateChildNodes(c_next):
+ return NULL
+ c_next = c_next.prev
+ c_next = c_node.next
+ while c_next:
+ if _isElement(c_next):
+ if hasProxy(c_next) or not canDeallocateChildNodes(c_next):
+ return NULL
+ c_next = c_next.next
+ return c_node
+
+cdef int canDeallocateChildNodes(xmlNode* c_parent):
+ cdef xmlNode* c_node
+ c_node = c_parent.children
+ tree.BEGIN_FOR_EACH_ELEMENT_FROM(c_parent, c_node, 1)
+ if hasProxy(c_node):
+ return 0
+ tree.END_FOR_EACH_ELEMENT_FROM(c_node)
+ return 1
+
+################################################################################
+# fix _Document references and namespaces when a node changes documents
+
+cdef void _copyParentNamespaces(xmlNode* c_from_node, xmlNode* c_to_node) nogil:
+ u"""Copy the namespaces of all ancestors of c_from_node to c_to_node.
+ """
+ cdef xmlNode* c_parent
+ cdef xmlNs* c_ns
+ cdef xmlNs* c_new_ns
+ cdef int prefix_known
+ c_parent = c_from_node.parent
+ while c_parent and (tree._isElementOrXInclude(c_parent) or
+ c_parent.type == tree.XML_DOCUMENT_NODE):
+ c_new_ns = c_parent.nsDef
+ while c_new_ns:
+ # libxml2 will check if the prefix is already defined
+ tree.xmlNewNs(c_to_node, c_new_ns.href, c_new_ns.prefix)
+ c_new_ns = c_new_ns.next
+ c_parent = c_parent.parent
+
+
+ctypedef struct _ns_update_map:
+ xmlNs* old
+ xmlNs* new
+
+
+ctypedef struct _nscache:
+ _ns_update_map* ns_map
+ size_t size
+ size_t last
+
+
+cdef int _growNsCache(_nscache* c_ns_cache) except -1:
+ cdef _ns_update_map* ns_map_ptr
+ if c_ns_cache.size == 0:
+ c_ns_cache.size = 20
+ else:
+ c_ns_cache.size *= 2
+ ns_map_ptr = <_ns_update_map*> python.lxml_realloc(
+ c_ns_cache.ns_map, c_ns_cache.size, sizeof(_ns_update_map))
+ if not ns_map_ptr:
+ python.lxml_free(c_ns_cache.ns_map)
+ c_ns_cache.ns_map = NULL
+ raise MemoryError()
+ c_ns_cache.ns_map = ns_map_ptr
+ return 0
+
+
+cdef inline int _appendToNsCache(_nscache* c_ns_cache,
+ xmlNs* c_old_ns, xmlNs* c_new_ns) except -1:
+ if c_ns_cache.last >= c_ns_cache.size:
+ _growNsCache(c_ns_cache)
+ c_ns_cache.ns_map[c_ns_cache.last] = _ns_update_map(old=c_old_ns, new=c_new_ns)
+ c_ns_cache.last += 1
+
+
+cdef int _stripRedundantNamespaceDeclarations(xmlNode* c_element, _nscache* c_ns_cache,
+ xmlNs** c_del_ns_list) except -1:
+ u"""Removes namespace declarations from an element that are already
+ defined in its parents. Does not free the xmlNs's, just prepends
+ them to the c_del_ns_list.
+ """
+ cdef xmlNs* c_ns
+ cdef xmlNs* c_ns_next
+ cdef xmlNs** c_nsdef
+ # use a xmlNs** to handle assignments to "c_element.nsDef" correctly
+ c_nsdef = &c_element.nsDef
+ while c_nsdef[0] is not NULL:
+ c_ns = tree.xmlSearchNsByHref(
+ c_element.doc, c_element.parent, c_nsdef[0].href)
+ if c_ns is NULL:
+ # new namespace href => keep and cache the ns declaration
+ _appendToNsCache(c_ns_cache, c_nsdef[0], c_nsdef[0])
+ c_nsdef = &c_nsdef[0].next
+ else:
+ # known namespace href => cache mapping and strip old ns
+ _appendToNsCache(c_ns_cache, c_nsdef[0], c_ns)
+ # cut out c_nsdef.next and prepend it to garbage chain
+ c_ns_next = c_nsdef[0].next
+ c_nsdef[0].next = c_del_ns_list[0]
+ c_del_ns_list[0] = c_nsdef[0]
+ c_nsdef[0] = c_ns_next
+ return 0
+
+
+cdef void _cleanUpFromNamespaceAdaptation(xmlNode* c_start_node,
+ _nscache* c_ns_cache, xmlNs* c_del_ns_list):
+ # Try to recover from exceptions with really bad timing. We were in the middle
+ # of ripping out xmlNS-es and likely ran out of memory. Try to fix up the tree
+ # by re-adding the original xmlNs declarations (which might still be used in some
+ # places).
+ if c_ns_cache.ns_map:
+ python.lxml_free(c_ns_cache.ns_map)
+ if c_del_ns_list:
+ if not c_start_node.nsDef:
+ c_start_node.nsDef = c_del_ns_list
+ else:
+ c_ns = c_start_node.nsDef
+ while c_ns.next:
+ c_ns = c_ns.next
+ c_ns.next = c_del_ns_list
+
+
+cdef int moveNodeToDocument(_Document doc, xmlDoc* c_source_doc,
+ xmlNode* c_element) except -1:
+ u"""Fix the xmlNs pointers of a node and its subtree that were moved.
+
+ Originally copied from libxml2's xmlReconciliateNs(). Expects
+ libxml2 doc pointers of node to be correct already, but fixes
+ _Document references.
+
+ For each node in the subtree, we do this:
+
+ 1) Remove redundant declarations of namespace that are already
+ defined in its parents.
+
+ 2) Replace namespaces that are *not* defined on the node or its
+ parents by the equivalent namespace declarations that *are*
+ defined on the node or its parents (possibly using a different
+ prefix). If a namespace is unknown, declare a new one on the
+ node.
+
+ 3) Reassign the names of tags and attributes from the dict of the
+ target document *iff* it is different from the dict used in the
+ source subtree.
+
+ 4) Set the Document reference to the new Document (if different).
+ This is done on backtracking to keep the original Document
+ alive as long as possible, until all its elements are updated.
+
+ Note that the namespace declarations are removed from the tree in
+ step 1), but freed only after the complete subtree was traversed
+ and all occurrences were replaced by tree-internal pointers.
+ """
+ cdef xmlNode* c_start_node
+ cdef xmlNode* c_node
+ cdef xmlDoc* c_doc = doc._c_doc
+ cdef tree.xmlAttr* c_attr
+ cdef char* c_name
+ cdef _nscache c_ns_cache = [NULL, 0, 0]
+ cdef xmlNs* c_del_ns_list = NULL
+ cdef proxy_count = 0
+
+ if not tree._isElementOrXInclude(c_element):
+ return 0
+
+ c_start_node = c_element
+
+ tree.BEGIN_FOR_EACH_FROM(c_element, c_element, 1)
+ if tree._isElementOrXInclude(c_element):
+ if hasProxy(c_element):
+ proxy_count += 1
+
+ # 1) cut out namespaces defined here that are already known by
+ # the ancestors
+ if c_element.nsDef is not NULL:
+ try:
+ _stripRedundantNamespaceDeclarations(c_element, &c_ns_cache, &c_del_ns_list)
+ except:
+ _cleanUpFromNamespaceAdaptation(c_start_node, &c_ns_cache, c_del_ns_list)
+ raise
+
+ # 2) make sure the namespaces of an element and its attributes
+ # are declared in this document (i.e. on the node or its parents)
+ if c_element.ns is not NULL:
+ _fixCNs(doc, c_start_node, c_element, &c_ns_cache, c_del_ns_list)
+
+ c_node = <xmlNode*>c_element.properties
+ while c_node is not NULL:
+ if c_node.ns is not NULL:
+ _fixCNs(doc, c_start_node, c_node, &c_ns_cache, c_del_ns_list)
+ c_node = c_node.next
+
+ tree.END_FOR_EACH_FROM(c_element)
+
+ # free now unused namespace declarations
+ if c_del_ns_list is not NULL:
+ tree.xmlFreeNsList(c_del_ns_list)
+
+ # cleanup
+ if c_ns_cache.ns_map is not NULL:
+ python.lxml_free(c_ns_cache.ns_map)
+
+ # 3) fix the names in the tree if we moved it from a different thread
+ if doc._c_doc.dict is not c_source_doc.dict:
+ fixThreadDictNames(c_start_node, c_source_doc.dict, doc._c_doc.dict)
+
+ # 4) fix _Document references
+ # (and potentially deallocate the source document)
+ if proxy_count > 0:
+ if proxy_count == 1 and c_start_node._private is not NULL:
+ proxy = getProxy(c_start_node)
+ if proxy is not None:
+ if proxy._doc is not doc:
+ proxy._doc = doc
+ else:
+ fixElementDocument(c_start_node, doc, proxy_count)
+ else:
+ fixElementDocument(c_start_node, doc, proxy_count)
+
+ return 0
+
+
+cdef void _setTreeDoc(xmlNode* c_node, xmlDoc* c_doc):
+ """Adaptation of 'xmlSetTreeDoc()' that deep-fixes the document links iteratively.
+ It avoids https://gitlab.gnome.org/GNOME/libxml2/issues/42
+ """
+ tree.BEGIN_FOR_EACH_FROM(c_node, c_node, 1)
+ if c_node.type == tree.XML_ELEMENT_NODE:
+ c_attr = <tree.xmlAttr*>c_node.properties
+ while c_attr:
+ if c_attr.atype == tree.XML_ATTRIBUTE_ID:
+ tree.xmlRemoveID(c_node.doc, c_attr)
+ c_attr.doc = c_doc
+ _fixDocChildren(c_attr.children, c_doc)
+ c_attr = c_attr.next
+ # Set doc link for all nodes, not only elements.
+ c_node.doc = c_doc
+ tree.END_FOR_EACH_FROM(c_node)
+
+
+cdef inline void _fixDocChildren(xmlNode* c_child, xmlDoc* c_doc):
+ while c_child:
+ c_child.doc = c_doc
+ if c_child.children:
+ _fixDocChildren(c_child.children, c_doc)
+ c_child = c_child.next
+
+
+cdef int _fixCNs(_Document doc, xmlNode* c_start_node, xmlNode* c_node,
+ _nscache* c_ns_cache, xmlNs* c_del_ns_list) except -1:
+ cdef xmlNs* c_ns = NULL
+ cdef bint is_prefixed_attr = (c_node.type == tree.XML_ATTRIBUTE_NODE and c_node.ns.prefix)
+
+ for ns_map in c_ns_cache.ns_map[:c_ns_cache.last]:
+ if c_node.ns is ns_map.old:
+ if is_prefixed_attr and not ns_map.new.prefix:
+ # avoid dropping prefix from attributes
+ continue
+ c_ns = ns_map.new
+ break
+
+ if c_ns:
+ c_node.ns = c_ns
+ else:
+ # not in cache or not acceptable
+ # => find a replacement from this document
+ try:
+ c_ns = doc._findOrBuildNodeNs(
+ c_start_node, c_node.ns.href, c_node.ns.prefix,
+ c_node.type == tree.XML_ATTRIBUTE_NODE)
+ c_node.ns = c_ns
+ _appendToNsCache(c_ns_cache, c_node.ns, c_ns)
+ except:
+ _cleanUpFromNamespaceAdaptation(c_start_node, c_ns_cache, c_del_ns_list)
+ raise
+ return 0
+
+
+cdef void fixElementDocument(xmlNode* c_element, _Document doc,
+ size_t proxy_count):
+ cdef xmlNode* c_node = c_element
+ cdef _Element proxy = None # init-to-None required due to fake-loop below
+ tree.BEGIN_FOR_EACH_FROM(c_element, c_node, 1)
+ if c_node._private is not NULL:
+ proxy = getProxy(c_node)
+ if proxy is not None:
+ if proxy._doc is not doc:
+ proxy._doc = doc
+ proxy_count -= 1
+ if proxy_count == 0:
+ return
+ tree.END_FOR_EACH_FROM(c_node)
+
+
+cdef void fixThreadDictNames(xmlNode* c_element,
+ tree.xmlDict* c_src_dict,
+ tree.xmlDict* c_dict) nogil:
+ # re-assign the names of tags and attributes
+ #
+ # this should only be called when the element is based on a
+ # different libxml2 tag name dictionary
+ if c_element.type == tree.XML_DOCUMENT_NODE or \
+ c_element.type == tree.XML_HTML_DOCUMENT_NODE:
+ # may define "xml" namespace
+ fixThreadDictNsForNode(c_element, c_src_dict, c_dict)
+ if c_element.doc.extSubset:
+ fixThreadDictNamesForDtd(c_element.doc.extSubset, c_src_dict, c_dict)
+ if c_element.doc.intSubset:
+ fixThreadDictNamesForDtd(c_element.doc.intSubset, c_src_dict, c_dict)
+ c_element = c_element.children
+ while c_element is not NULL:
+ fixThreadDictNamesForNode(c_element, c_src_dict, c_dict)
+ c_element = c_element.next
+ elif tree._isElementOrXInclude(c_element):
+ fixThreadDictNamesForNode(c_element, c_src_dict, c_dict)
+
+
+cdef inline void _fixThreadDictPtr(const_xmlChar** c_ptr,
+ tree.xmlDict* c_src_dict,
+ tree.xmlDict* c_dict) nogil:
+ c_str = c_ptr[0]
+ if c_str and c_src_dict and tree.xmlDictOwns(c_src_dict, c_str):
+ # return value can be NULL on memory error, but we don't handle that here
+ c_str = tree.xmlDictLookup(c_dict, c_str, -1)
+ if c_str:
+ c_ptr[0] = c_str
+
+
+cdef void fixThreadDictNamesForNode(xmlNode* c_element,
+ tree.xmlDict* c_src_dict,
+ tree.xmlDict* c_dict) nogil:
+ cdef xmlNode* c_node = c_element
+ tree.BEGIN_FOR_EACH_FROM(c_element, c_node, 1)
+ if c_node.type in (tree.XML_ELEMENT_NODE, tree.XML_XINCLUDE_START):
+ fixThreadDictNamesForAttributes(
+ c_node.properties, c_src_dict, c_dict)
+ fixThreadDictNsForNode(c_node, c_src_dict, c_dict)
+ _fixThreadDictPtr(&c_node.name, c_src_dict, c_dict)
+ elif c_node.type == tree.XML_TEXT_NODE:
+ # libxml2's SAX2 parser interns some indentation space
+ fixThreadDictContentForNode(c_node, c_src_dict, c_dict)
+ elif c_node.type == tree.XML_COMMENT_NODE:
+ pass # don't touch c_node.name
+ else:
+ _fixThreadDictPtr(&c_node.name, c_src_dict, c_dict)
+ tree.END_FOR_EACH_FROM(c_node)
+
+
+cdef inline void fixThreadDictNamesForAttributes(tree.xmlAttr* c_attr,
+ tree.xmlDict* c_src_dict,
+ tree.xmlDict* c_dict) nogil:
+ cdef xmlNode* c_child
+ cdef xmlNode* c_node = <xmlNode*>c_attr
+ while c_node is not NULL:
+ if c_node.type not in (tree.XML_TEXT_NODE, tree.XML_COMMENT_NODE):
+ _fixThreadDictPtr(&c_node.name, c_src_dict, c_dict)
+ # libxml2 keeps some (!) attribute values in the dict
+ c_child = c_node.children
+ while c_child is not NULL:
+ fixThreadDictContentForNode(c_child, c_src_dict, c_dict)
+ c_child = c_child.next
+ c_node = c_node.next
+
+
+cdef inline void fixThreadDictContentForNode(xmlNode* c_node,
+ tree.xmlDict* c_src_dict,
+ tree.xmlDict* c_dict) nogil:
+ if c_node.content is not NULL and \
+ c_node.content is not <xmlChar*>&c_node.properties:
+ if tree.xmlDictOwns(c_src_dict, c_node.content):
+ # result can be NULL on memory error, but we don't handle that here
+ c_node.content = <xmlChar*>tree.xmlDictLookup(c_dict, c_node.content, -1)
+
+
+cdef inline void fixThreadDictNsForNode(xmlNode* c_node,
+ tree.xmlDict* c_src_dict,
+ tree.xmlDict* c_dict) nogil:
+ cdef xmlNs* c_ns = c_node.nsDef
+ while c_ns is not NULL:
+ _fixThreadDictPtr(&c_ns.href, c_src_dict, c_dict)
+ _fixThreadDictPtr(&c_ns.prefix, c_src_dict, c_dict)
+ c_ns = c_ns.next
+
+
+cdef void fixThreadDictNamesForDtd(tree.xmlDtd* c_dtd,
+ tree.xmlDict* c_src_dict,
+ tree.xmlDict* c_dict) nogil:
+ cdef xmlNode* c_node
+ cdef tree.xmlElement* c_element
+ cdef tree.xmlAttribute* c_attribute
+ cdef tree.xmlEntity* c_entity
+
+ c_node = c_dtd.children
+ while c_node:
+ if c_node.type == tree.XML_ELEMENT_DECL:
+ c_element = <tree.xmlElement*>c_node
+ if c_element.content:
+ _fixThreadDictPtr(&c_element.content.name, c_src_dict, c_dict)
+ _fixThreadDictPtr(&c_element.content.prefix, c_src_dict, c_dict)
+ c_attribute = c_element.attributes
+ while c_attribute:
+ _fixThreadDictPtr(&c_attribute.defaultValue, c_src_dict, c_dict)
+ _fixThreadDictPtr(&c_attribute.name, c_src_dict, c_dict)
+ _fixThreadDictPtr(&c_attribute.prefix, c_src_dict, c_dict)
+ _fixThreadDictPtr(&c_attribute.elem, c_src_dict, c_dict)
+ c_attribute = c_attribute.nexth
+ elif c_node.type == tree.XML_ENTITY_DECL:
+ c_entity = <tree.xmlEntity*>c_node
+ _fixThreadDictPtr(&c_entity.name, c_src_dict, c_dict)
+ _fixThreadDictPtr(&c_entity.ExternalID, c_src_dict, c_dict)
+ _fixThreadDictPtr(&c_entity.SystemID, c_src_dict, c_dict)
+ _fixThreadDictPtr(<const_xmlChar**>&c_entity.content, c_src_dict, c_dict)
+ c_node = c_node.next
+
+
+################################################################################
+# adopt an xmlDoc from an external libxml2 document source
+
+cdef _Document _adoptForeignDoc(xmlDoc* c_doc, _BaseParser parser=None, bint is_owned=True):
+ """Convert and wrap an externally produced xmlDoc for use in lxml.
+ Assures that all '_private' pointers are NULL to prevent accidental
+ dereference into lxml proxy objects.
+ """
+ if c_doc is NULL:
+ raise ValueError("Illegal document provided: NULL")
+ if c_doc.type not in (tree.XML_DOCUMENT_NODE, tree.XML_HTML_DOCUMENT_NODE):
+ doc_type = c_doc.type
+ if is_owned:
+ tree.xmlFreeDoc(c_doc)
+ raise ValueError(f"Illegal document provided: expected XML or HTML, found {doc_type}")
+
+ cdef xmlNode* c_node = <xmlNode*>c_doc
+
+ if is_owned:
+ tree.BEGIN_FOR_EACH_FROM(<xmlNode*>c_doc, c_node, 1)
+ c_node._private = NULL
+ tree.END_FOR_EACH_FROM(c_node)
+ else:
+ # create a fresh copy that lxml owns
+ c_doc = tree.xmlCopyDoc(c_doc, 1)
+ if c_doc is NULL:
+ raise MemoryError()
+
+ return _documentFactory(c_doc, parser)
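
moveNodeToDocument() above is what keeps namespaces consistent when a subtree is moved between documents: redundant declarations are stripped, namespaces unknown in the new tree are re-declared, and the libxml2 name dictionary and _Document references are fixed up afterwards. The effect is visible from Python whenever an element is appended into another tree, roughly as in this sketch:

    from lxml import etree

    NS = "http://example.com/ns"
    root = etree.Element("{%s}root" % NS, nsmap={"p": NS})
    child = etree.SubElement(root, "{%s}child" % NS)

    other = etree.Element("{%s}other" % NS, nsmap={"p": NS})
    other.append(child)   # moves the subtree into the other document

    # The moved element now shares the declaration on its new parent
    # instead of carrying a duplicate one; no namespace information is lost.
    print(etree.tostring(other))
    print(child.nsmap)    # {'p': 'http://example.com/ns'}
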
diff --git a/src/lxml/public-api.pxi b/src/lxml/public-api.pxi
new file mode 100644
index 0000000..1c4a552
--- /dev/null
+++ b/src/lxml/public-api.pxi
@@ -0,0 +1,178 @@
+# Public C API for lxml.etree
+
+cdef public api _Element deepcopyNodeToDocument(_Document doc, xmlNode* c_root):
+ u"Recursively copy the element into the document. doc is not modified."
+ cdef xmlNode* c_node
+ c_node = _copyNodeToDoc(c_root, doc._c_doc)
+ return _elementFactory(doc, c_node)
+
+cdef public api _ElementTree elementTreeFactory(_Element context_node):
+ _assertValidNode(context_node)
+ return newElementTree(context_node, _ElementTree)
+
+cdef public api _ElementTree newElementTree(_Element context_node,
+ object subclass):
+ if <void*>context_node is NULL or context_node is None:
+ raise TypeError
+ _assertValidNode(context_node)
+ return _newElementTree(context_node._doc, context_node, subclass)
+
+cdef public api _ElementTree adoptExternalDocument(xmlDoc* c_doc, parser, bint is_owned):
+ if c_doc is NULL:
+ raise TypeError
+ doc = _adoptForeignDoc(c_doc, parser, is_owned)
+ return _elementTreeFactory(doc, None)
+
+cdef public api _Element elementFactory(_Document doc, xmlNode* c_node):
+ if c_node is NULL or doc is None:
+ raise TypeError
+ return _elementFactory(doc, c_node)
+
+cdef public api _Element makeElement(tag, _Document doc, parser,
+ text, tail, attrib, nsmap):
+ return _makeElement(tag, NULL, doc, parser, text, tail, attrib, nsmap, None)
+
+cdef public api _Element makeSubElement(_Element parent, tag, text, tail,
+ attrib, nsmap):
+ _assertValidNode(parent)
+ return _makeSubElement(parent, tag, text, tail, attrib, nsmap, None)
+
+cdef public api void setElementClassLookupFunction(
+ _element_class_lookup_function function, state):
+ _setElementClassLookupFunction(function, state)
+
+cdef public api object lookupDefaultElementClass(state, doc, xmlNode* c_node):
+ return _lookupDefaultElementClass(state, doc, c_node)
+
+cdef public api object lookupNamespaceElementClass(state, doc, xmlNode* c_node):
+ return _find_nselement_class(state, doc, c_node)
+
+cdef public api object callLookupFallback(FallbackElementClassLookup lookup,
+ _Document doc, xmlNode* c_node):
+ return _callLookupFallback(lookup, doc, c_node)
+
+cdef public api int tagMatches(xmlNode* c_node, const_xmlChar* c_href, const_xmlChar* c_name):
+ if c_node is NULL:
+ return -1
+ return _tagMatches(c_node, c_href, c_name)
+
+cdef public api _Document documentOrRaise(object input):
+ return _documentOrRaise(input)
+
+cdef public api _Element rootNodeOrRaise(object input):
+ return _rootNodeOrRaise(input)
+
+cdef public api bint hasText(xmlNode* c_node):
+ return _hasText(c_node)
+
+cdef public api bint hasTail(xmlNode* c_node):
+ return _hasTail(c_node)
+
+cdef public api object textOf(xmlNode* c_node):
+ if c_node is NULL:
+ return None
+ return _collectText(c_node.children)
+
+cdef public api object tailOf(xmlNode* c_node):
+ if c_node is NULL:
+ return None
+ return _collectText(c_node.next)
+
+cdef public api int setNodeText(xmlNode* c_node, text) except -1:
+ if c_node is NULL:
+ raise ValueError
+ return _setNodeText(c_node, text)
+
+cdef public api int setTailText(xmlNode* c_node, text) except -1:
+ if c_node is NULL:
+ raise ValueError
+ return _setTailText(c_node, text)
+
+cdef public api object attributeValue(xmlNode* c_element, xmlAttr* c_attrib_node):
+ return _attributeValue(c_element, c_attrib_node)
+
+cdef public api object attributeValueFromNsName(xmlNode* c_element,
+ const_xmlChar* ns, const_xmlChar* name):
+ return _attributeValueFromNsName(c_element, ns, name)
+
+cdef public api object getAttributeValue(_Element element, key, default):
+ _assertValidNode(element)
+ return _getAttributeValue(element, key, default)
+
+cdef public api object iterattributes(_Element element, int keysvalues):
+ _assertValidNode(element)
+ return _attributeIteratorFactory(element, keysvalues)
+
+cdef public api list collectAttributes(xmlNode* c_element, int keysvalues):
+ return _collectAttributes(c_element, keysvalues)
+
+cdef public api int setAttributeValue(_Element element, key, value) except -1:
+ _assertValidNode(element)
+ return _setAttributeValue(element, key, value)
+
+cdef public api int delAttribute(_Element element, key) except -1:
+ _assertValidNode(element)
+ return _delAttribute(element, key)
+
+cdef public api int delAttributeFromNsName(tree.xmlNode* c_element,
+ const_xmlChar* c_href, const_xmlChar* c_name):
+ return _delAttributeFromNsName(c_element, c_href, c_name)
+
+cdef public api bint hasChild(xmlNode* c_node):
+ return _hasChild(c_node)
+
+cdef public api xmlNode* findChild(xmlNode* c_node, Py_ssize_t index):
+ return _findChild(c_node, index)
+
+cdef public api xmlNode* findChildForwards(xmlNode* c_node, Py_ssize_t index):
+ return _findChildForwards(c_node, index)
+
+cdef public api xmlNode* findChildBackwards(xmlNode* c_node, Py_ssize_t index):
+ return _findChildBackwards(c_node, index)
+
+cdef public api xmlNode* nextElement(xmlNode* c_node):
+ return _nextElement(c_node)
+
+cdef public api xmlNode* previousElement(xmlNode* c_node):
+ return _previousElement(c_node)
+
+cdef public api void appendChild(_Element parent, _Element child):
+ # deprecated, use appendChildToElement() instead!
+ _appendChild(parent, child)
+
+cdef public api int appendChildToElement(_Element parent, _Element child) except -1:
+ return _appendChild(parent, child)
+
+cdef public api object pyunicode(const_xmlChar* s):
+ if s is NULL:
+ raise TypeError
+ return funicode(s)
+
+cdef public api bytes utf8(object s):
+ return _utf8(s)
+
+cdef public api tuple getNsTag(object tag):
+ return _getNsTag(tag)
+
+cdef public api tuple getNsTagWithEmptyNs(object tag):
+ return _getNsTagWithEmptyNs(tag)
+
+cdef public api object namespacedName(xmlNode* c_node):
+ return _namespacedName(c_node)
+
+cdef public api object namespacedNameFromNsName(const_xmlChar* href, const_xmlChar* name):
+ return _namespacedNameFromNsName(href, name)
+
+cdef public api void iteratorStoreNext(_ElementIterator iterator, _Element node):
+ # deprecated!
+ iterator._storeNext(node)
+
+cdef public api void initTagMatch(_ElementTagMatcher matcher, tag):
+ # deprecated!
+ matcher._initTagMatch(tag)
+
+cdef public api tree.xmlNs* findOrBuildNodeNsPrefix(
+ _Document doc, xmlNode* c_node, const_xmlChar* href, const_xmlChar* prefix) except NULL:
+ if doc is None:
+ raise TypeError
+ return doc._findOrBuildNodeNs(c_node, href, prefix, 0)
diff --git a/src/lxml/pyclasslookup.py b/src/lxml/pyclasslookup.py
new file mode 100644
index 0000000..9e1496d
--- /dev/null
+++ b/src/lxml/pyclasslookup.py
@@ -0,0 +1,3 @@
+# dummy module for backwards compatibility
+
+from lxml.etree import PythonElementClassLookup
diff --git a/src/lxml/python.pxd b/src/lxml/python.pxd
new file mode 100644
index 0000000..0d26cdd
--- /dev/null
+++ b/src/lxml/python.pxd
@@ -0,0 +1,132 @@
+from libc cimport stdio
+from libc.string cimport const_char
+cimport cython
+
+cdef extern from *:
+ cdef bint PEP393_ENABLED "CYTHON_PEP393_ENABLED"
+
+cdef extern from "Python.h":
+ ctypedef struct PyObject
+ cdef int PY_SSIZE_T_MAX
+ cdef int PY_VERSION_HEX
+
+ cdef void Py_INCREF(object o)
+ cdef void Py_DECREF(object o)
+ cdef void Py_XDECREF(PyObject* o)
+
+ cdef stdio.FILE* PyFile_AsFile(object p)
+
+ # PEP 393
+ cdef bint PyUnicode_IS_READY(object u)
+ cdef Py_ssize_t PyUnicode_GET_LENGTH(object u)
+ cdef int PyUnicode_KIND(object u)
+ cdef void* PyUnicode_DATA(object u)
+
+ cdef bytes PyUnicode_AsEncodedString(object u, char* encoding,
+ char* errors)
+ cdef cython.unicode PyUnicode_FromFormat(char* format, ...) # Python 3
+ cdef cython.unicode PyUnicode_Decode(char* s, Py_ssize_t size,
+ char* encoding, char* errors)
+ cdef cython.unicode PyUnicode_DecodeUTF8(char* s, Py_ssize_t size, char* errors)
+ cdef cython.unicode PyUnicode_DecodeLatin1(char* s, Py_ssize_t size, char* errors)
+ cdef object PyUnicode_RichCompare(object o1, object o2, int op)
+ cdef bytes PyUnicode_AsUTF8String(object ustring)
+ cdef bytes PyUnicode_AsASCIIString(object ustring)
+ cdef char* PyUnicode_AS_DATA(object ustring)
+ cdef Py_ssize_t PyUnicode_GET_DATA_SIZE(object ustring)
+ cdef Py_ssize_t PyUnicode_GET_SIZE(object ustring)
+ cdef bytes PyBytes_FromStringAndSize(char* s, Py_ssize_t size)
+ cdef bytes PyBytes_FromFormat(char* format, ...)
+ cdef Py_ssize_t PyBytes_GET_SIZE(object s)
+
+ cdef object PyNumber_Int(object value)
+ cdef Py_ssize_t PyInt_AsSsize_t(object value)
+
+ cdef Py_ssize_t PyTuple_GET_SIZE(object t)
+ cdef object PyTuple_GET_ITEM(object o, Py_ssize_t pos)
+
+ cdef object PyList_New(Py_ssize_t index)
+ cdef Py_ssize_t PyList_GET_SIZE(object l)
+ cdef object PyList_GET_ITEM(object l, Py_ssize_t index)
+ cdef void PyList_SET_ITEM(object l, Py_ssize_t index, object value)
+ cdef int PyList_Insert(object l, Py_ssize_t index, object o) except -1
+ cdef object PyList_AsTuple(object l)
+ cdef void PyList_Clear(object l)
+
+ cdef PyObject* PyDict_GetItemString(object d, char* key)
+ cdef PyObject* PyDict_GetItem(object d, object key)
+ cdef void PyDict_Clear(object d)
+ cdef object PyDictProxy_New(object d)
+ cdef Py_ssize_t PyDict_Size(object d)
+ cdef object PySequence_List(object o)
+ cdef object PySequence_Tuple(object o)
+
+ cdef bint PyNumber_Check(object instance)
+ cdef bint PySequence_Check(object instance)
+ cdef bint PyType_Check(object instance)
+ cdef bint PyTuple_CheckExact(object instance)
+
+ cdef int _PyEval_SliceIndex(object value, Py_ssize_t* index) except 0
+ cdef int PySlice_GetIndicesEx "_lx_PySlice_GetIndicesEx" (
+ object slice, Py_ssize_t length,
+ Py_ssize_t *start, Py_ssize_t *stop, Py_ssize_t *step,
+ Py_ssize_t *slicelength) except -1
+
+ cdef object PyObject_RichCompare(object o1, object o2, int op)
+ cdef int PyObject_RichCompareBool(object o1, object o2, int op)
+
+ PyObject* PyWeakref_NewRef(object ob, PyObject* callback) except NULL # used for PyPy only
+ object PyWeakref_LockObject(PyObject* ob) # PyPy only
+
+ cdef void* PyMem_Malloc(size_t size)
+ cdef void* PyMem_Realloc(void* p, size_t size)
+ cdef void PyMem_Free(void* p)
+
+ # always returns NULL to pass on the exception
+ cdef object PyErr_SetFromErrno(object type)
+
+ cdef PyObject* PyThreadState_GetDict()
+
+ # some handy functions
+ cdef char* _cstr "PyBytes_AS_STRING" (object s)
+ cdef char* __cstr "PyBytes_AS_STRING" (PyObject* s)
+
+ # Py_buffer related flags
+ cdef int PyBUF_SIMPLE
+ cdef int PyBUF_WRITABLE
+ cdef int PyBUF_LOCK
+ cdef int PyBUF_FORMAT
+ cdef int PyBUF_ND
+ cdef int PyBUF_STRIDES
+ cdef int PyBUF_C_CONTIGUOUS
+ cdef int PyBUF_F_CONTIGUOUS
+ cdef int PyBUF_ANY_CONTIGUOUS
+ cdef int PyBUF_INDIRECT
+
+cdef extern from "pythread.h":
+ ctypedef void* PyThread_type_lock
+ cdef PyThread_type_lock PyThread_allocate_lock()
+ cdef void PyThread_free_lock(PyThread_type_lock lock)
+ cdef int PyThread_acquire_lock(PyThread_type_lock lock, int mode) nogil
+ cdef void PyThread_release_lock(PyThread_type_lock lock)
+ cdef long PyThread_get_thread_ident()
+
+ ctypedef enum __WaitLock:
+ WAIT_LOCK
+ NOWAIT_LOCK
+
+cdef extern from "includes/etree_defs.h": # redefines some functions as macros
+ cdef void* lxml_malloc(size_t count, size_t item_size)
+ cdef void* lxml_realloc(void* mem, size_t count, size_t item_size)
+ cdef void lxml_free(void* mem)
+ cdef void* lxml_unpack_xmldoc_capsule(object capsule, bint* is_owned) except? NULL
+ cdef bint _isString(object obj)
+ cdef const_char* _fqtypename(object t)
+ cdef object PY_NEW(object t)
+ cdef bint LXML_UNICODE_STRINGS
+ cdef bint IS_PYTHON2
+ cdef bint IS_PYTHON3 # legacy, avoid
+ cdef bint IS_PYPY
+
+cdef extern from "lxml_endian.h":
+ cdef bint PY_BIG_ENDIAN # defined in later Py3.x versions
diff --git a/src/lxml/readonlytree.pxi b/src/lxml/readonlytree.pxi
new file mode 100644
index 0000000..cc25f98
--- /dev/null
+++ b/src/lxml/readonlytree.pxi
@@ -0,0 +1,565 @@
+# read-only tree implementation
+
+@cython.internal
+cdef class _ReadOnlyProxy:
+ u"A read-only proxy class suitable for PIs/Comments (for internal use only!)."
+ cdef bint _free_after_use
+ cdef xmlNode* _c_node
+ cdef _ReadOnlyProxy _source_proxy
+ cdef list _dependent_proxies
+ def __cinit__(self):
+ self._c_node = NULL
+ self._free_after_use = 0
+
+ cdef int _assertNode(self) except -1:
+ u"""This is our way of saying: this proxy is invalid!
+ """
+ if not self._c_node:
+ raise ReferenceError("Proxy invalidated!")
+ return 0
+
+ cdef int _raise_unsupported_type(self) except -1:
+ raise TypeError(f"Unsupported node type: {self._c_node.type}")
+
+ cdef void free_after_use(self):
+ u"""Should the xmlNode* be freed when releasing the proxy?
+ """
+ self._free_after_use = 1
+
+ @property
+ def tag(self):
+ """Element tag
+ """
+ self._assertNode()
+ if self._c_node.type == tree.XML_ELEMENT_NODE:
+ return _namespacedName(self._c_node)
+ elif self._c_node.type == tree.XML_PI_NODE:
+ return ProcessingInstruction
+ elif self._c_node.type == tree.XML_COMMENT_NODE:
+ return Comment
+ elif self._c_node.type == tree.XML_ENTITY_REF_NODE:
+ return Entity
+ else:
+ self._raise_unsupported_type()
+
+ @property
+ def text(self):
+ """Text before the first subelement. This is either a string or
+ the value None, if there was no text.
+ """
+ self._assertNode()
+ if self._c_node.type == tree.XML_ELEMENT_NODE:
+ return _collectText(self._c_node.children)
+ elif self._c_node.type in (tree.XML_PI_NODE,
+ tree.XML_COMMENT_NODE):
+ if self._c_node.content is NULL:
+ return ''
+ else:
+ return funicode(self._c_node.content)
+ elif self._c_node.type == tree.XML_ENTITY_REF_NODE:
+ return f'&{funicode(self._c_node.name)};'
+ else:
+ self._raise_unsupported_type()
+
+ @property
+ def tail(self):
+ """Text after this element's end tag, but before the next sibling
+ element's start tag. This is either a string or the value None, if
+ there was no text.
+ """
+ self._assertNode()
+ return _collectText(self._c_node.next)
+
+ @property
+ def sourceline(self):
+ """Original line number as found by the parser or None if unknown.
+ """
+ cdef long line
+ self._assertNode()
+ line = tree.xmlGetLineNo(self._c_node)
+ if line > 0:
+ return line
+ else:
+ return None
+
+ def __repr__(self):
+ self._assertNode()
+ if self._c_node.type == tree.XML_ELEMENT_NODE:
+ return "<Element %s at 0x%x>" % (strrepr(self.tag), id(self))
+ elif self._c_node.type == tree.XML_COMMENT_NODE:
+ return "<!--%s-->" % strrepr(self.text)
+ elif self._c_node.type == tree.XML_ENTITY_NODE:
+ return "&%s;" % strrepr(funicode(self._c_node.name))
+ elif self._c_node.type == tree.XML_PI_NODE:
+ text = self.text
+ if text:
+ return "<?%s %s?>" % (strrepr(self.target), text)
+ else:
+ return "<?%s?>" % strrepr(self.target)
+ else:
+ self._raise_unsupported_type()
+
+ def __getitem__(self, x):
+ u"""Returns the subelement at the given position or the requested
+ slice.
+ """
+ cdef xmlNode* c_node = NULL
+ cdef Py_ssize_t step = 0, slicelength = 0
+ cdef Py_ssize_t c, i
+ cdef _node_to_node_function next_element
+ cdef list result
+ self._assertNode()
+ if isinstance(x, slice):
+ # slicing
+ if _isFullSlice(<slice>x):
+ return _collectChildren(self)
+ _findChildSlice(<slice>x, self._c_node, &c_node, &step, &slicelength)
+ if c_node is NULL:
+ return []
+ if step > 0:
+ next_element = _nextElement
+ else:
+ step = -step
+ next_element = _previousElement
+ result = []
+ c = 0
+ while c_node is not NULL and c < slicelength:
+ result.append(_newReadOnlyProxy(self._source_proxy, c_node))
+ c = c + 1
+ for i from 0 <= i < step:
+ c_node = next_element(c_node)
+ return result
+ else:
+ # indexing
+ c_node = _findChild(self._c_node, x)
+ if c_node is NULL:
+ raise IndexError, u"list index out of range"
+ return _newReadOnlyProxy(self._source_proxy, c_node)
+
+ def __len__(self):
+ u"""Returns the number of subelements.
+ """
+ cdef Py_ssize_t c
+ cdef xmlNode* c_node
+ self._assertNode()
+ c = 0
+ c_node = self._c_node.children
+ while c_node is not NULL:
+ if tree._isElement(c_node):
+ c = c + 1
+ c_node = c_node.next
+ return c
+
+ def __nonzero__(self):
+ cdef xmlNode* c_node
+ self._assertNode()
+ c_node = _findChildBackwards(self._c_node, 0)
+ return c_node != NULL
+
+ def __deepcopy__(self, memo):
+ u"__deepcopy__(self, memo)"
+ return self.__copy__()
+
+ cpdef __copy__(self):
+ u"__copy__(self)"
+ cdef xmlDoc* c_doc
+ cdef xmlNode* c_node
+ cdef _Document new_doc
+ if self._c_node is NULL:
+ return self
+ c_doc = _copyDocRoot(self._c_node.doc, self._c_node) # recursive
+ new_doc = _documentFactory(c_doc, None)
+ root = new_doc.getroot()
+ if root is not None:
+ return root
+ # Comment/PI
+ c_node = c_doc.children
+ while c_node is not NULL and c_node.type != self._c_node.type:
+ c_node = c_node.next
+ if c_node is NULL:
+ return None
+ return _elementFactory(new_doc, c_node)
+
+ def __iter__(self):
+ return iter(self.getchildren())
+
+ def iterchildren(self, tag=None, *, reversed=False):
+ u"""iterchildren(self, tag=None, reversed=False)
+
+ Iterate over the children of this element.
+ """
+ children = self.getchildren()
+ if tag is not None and tag != '*':
+ children = [ el for el in children if el.tag == tag ]
+ if reversed:
+ children = children[::-1]
+ return iter(children)
+
+ cpdef getchildren(self):
+ u"""Returns all subelements. The elements are returned in document
+ order.
+ """
+ cdef xmlNode* c_node
+ cdef list result
+ self._assertNode()
+ result = []
+ c_node = self._c_node.children
+ while c_node is not NULL:
+ if tree._isElement(c_node):
+ result.append(_newReadOnlyProxy(self._source_proxy, c_node))
+ c_node = c_node.next
+ return result
+
+ def getparent(self):
+ u"""Returns the parent of this element or None for the root element.
+ """
+ cdef xmlNode* c_parent
+ self._assertNode()
+ c_parent = self._c_node.parent
+ if c_parent is NULL or not tree._isElement(c_parent):
+ return None
+ else:
+ return _newReadOnlyProxy(self._source_proxy, c_parent)
+
+ def getnext(self):
+ u"""Returns the following sibling of this element or None.
+ """
+ cdef xmlNode* c_node
+ self._assertNode()
+ c_node = _nextElement(self._c_node)
+ if c_node is not NULL:
+ return _newReadOnlyProxy(self._source_proxy, c_node)
+ return None
+
+ def getprevious(self):
+ u"""Returns the preceding sibling of this element or None.
+ """
+ cdef xmlNode* c_node
+ self._assertNode()
+ c_node = _previousElement(self._c_node)
+ if c_node is not NULL:
+ return _newReadOnlyProxy(self._source_proxy, c_node)
+ return None
+
+
+@cython.final
+@cython.internal
+cdef class _ReadOnlyPIProxy(_ReadOnlyProxy):
+ """A read-only proxy for processing instructions (for internal use only!)"""
+ @property
+ def target(self):
+ self._assertNode()
+ return funicode(self._c_node.name)
+
+@cython.final
+@cython.internal
+cdef class _ReadOnlyEntityProxy(_ReadOnlyProxy):
+ """A read-only proxy for entity references (for internal use only!)"""
+ property name:
+ def __get__(self):
+ return funicode(self._c_node.name)
+
+ def __set__(self, value):
+ value_utf = _utf8(value)
+ if u'&' in value or u';' in value:
+ raise ValueError(f"Invalid entity name '{value}'")
+ tree.xmlNodeSetName(self._c_node, _xcstr(value_utf))
+
+ @property
+ def text(self):
+ return f'&{funicode(self._c_node.name)};'
+
+
+@cython.internal
+cdef class _ReadOnlyElementProxy(_ReadOnlyProxy):
+ """The main read-only Element proxy class (for internal use only!)."""
+
+ @property
+ def attrib(self):
+ self._assertNode()
+ return dict(_collectAttributes(self._c_node, 3))
+
+ @property
+ def prefix(self):
+ """Namespace prefix or None.
+ """
+ self._assertNode()
+ if self._c_node.ns is not NULL:
+ if self._c_node.ns.prefix is not NULL:
+ return funicode(self._c_node.ns.prefix)
+ return None
+
+ @property
+ def nsmap(self):
+ """Namespace prefix->URI mapping known in the context of this
+ Element. This includes all namespace declarations of the
+ parents.
+
+ Note that changing the returned dict has no effect on the Element.
+ """
+ self._assertNode()
+ return _build_nsmap(self._c_node)
+
+ def get(self, key, default=None):
+ u"""Gets an element attribute.
+ """
+ self._assertNode()
+ return _getNodeAttributeValue(self._c_node, key, default)
+
+ def keys(self):
+ u"""Gets a list of attribute names. The names are returned in an
+ arbitrary order (just like for an ordinary Python dictionary).
+ """
+ self._assertNode()
+ return _collectAttributes(self._c_node, 1)
+
+ def values(self):
+ u"""Gets element attributes, as a sequence. The attributes are returned
+ in an arbitrary order.
+ """
+ self._assertNode()
+ return _collectAttributes(self._c_node, 2)
+
+ def items(self):
+ u"""Gets element attributes, as a sequence. The attributes are returned
+ in an arbitrary order.
+ """
+ self._assertNode()
+ return _collectAttributes(self._c_node, 3)
+
+cdef _ReadOnlyProxy _newReadOnlyProxy(
+ _ReadOnlyProxy source_proxy, xmlNode* c_node):
+ cdef _ReadOnlyProxy el
+ if c_node.type == tree.XML_ELEMENT_NODE:
+ el = _ReadOnlyElementProxy.__new__(_ReadOnlyElementProxy)
+ elif c_node.type == tree.XML_PI_NODE:
+ el = _ReadOnlyPIProxy.__new__(_ReadOnlyPIProxy)
+ elif c_node.type in (tree.XML_COMMENT_NODE,
+ tree.XML_ENTITY_REF_NODE):
+ el = _ReadOnlyProxy.__new__(_ReadOnlyProxy)
+ else:
+ raise TypeError(f"Unsupported element type: {c_node.type}")
+ el._c_node = c_node
+ _initReadOnlyProxy(el, source_proxy)
+ return el
+
+cdef inline _initReadOnlyProxy(_ReadOnlyProxy el,
+ _ReadOnlyProxy source_proxy):
+ if source_proxy is None:
+ el._source_proxy = el
+ el._dependent_proxies = [el]
+ else:
+ el._source_proxy = source_proxy
+ source_proxy._dependent_proxies.append(el)
+
+cdef _freeReadOnlyProxies(_ReadOnlyProxy sourceProxy):
+ cdef xmlNode* c_node
+ cdef _ReadOnlyProxy el
+ if sourceProxy is None:
+ return
+ if sourceProxy._dependent_proxies is None:
+ return
+ for el in sourceProxy._dependent_proxies:
+ c_node = el._c_node
+ el._c_node = NULL
+ if el._free_after_use:
+ tree.xmlFreeNode(c_node)
+ del sourceProxy._dependent_proxies[:]
+
+# opaque wrapper around non-element nodes, e.g. the document node
+#
+# This class does not imply any restrictions on modifiability or
+# read-only status of the node, so use with caution.
+
+@cython.internal
+cdef class _OpaqueNodeWrapper:
+ cdef tree.xmlNode* _c_node
+ def __init__(self):
+ raise TypeError, u"This type cannot be instantiated from Python"
+
+@cython.final
+@cython.internal
+cdef class _OpaqueDocumentWrapper(_OpaqueNodeWrapper):
+ cdef int _assertNode(self) except -1:
+ u"""This is our way of saying: this proxy is invalid!
+ """
+ assert self._c_node is not NULL, u"Proxy invalidated!"
+ return 0
+
+ cpdef append(self, other_element):
+ u"""Append a copy of an Element to the list of children.
+ """
+ cdef xmlNode* c_next
+ cdef xmlNode* c_node
+ self._assertNode()
+ c_node = _roNodeOf(other_element)
+ if c_node.type == tree.XML_ELEMENT_NODE:
+ if tree.xmlDocGetRootElement(<tree.xmlDoc*>self._c_node) is not NULL:
+ raise ValueError, u"cannot append, document already has a root element"
+ elif c_node.type not in (tree.XML_PI_NODE, tree.XML_COMMENT_NODE):
+ raise TypeError, f"unsupported element type for top-level node: {c_node.type}"
+ c_node = _copyNodeToDoc(c_node, <tree.xmlDoc*>self._c_node)
+ c_next = c_node.next
+ tree.xmlAddChild(self._c_node, c_node)
+ _moveTail(c_next, c_node)
+
+ def extend(self, elements):
+ u"""Append a copy of all Elements from a sequence to the list of
+ children.
+ """
+ self._assertNode()
+ for element in elements:
+ self.append(element)
+
+cdef _OpaqueNodeWrapper _newOpaqueAppendOnlyNodeWrapper(xmlNode* c_node):
+ cdef _OpaqueNodeWrapper node
+ if c_node.type in (tree.XML_DOCUMENT_NODE, tree.XML_HTML_DOCUMENT_NODE):
+ node = _OpaqueDocumentWrapper.__new__(_OpaqueDocumentWrapper)
+ else:
+ node = _OpaqueNodeWrapper.__new__(_OpaqueNodeWrapper)
+ node._c_node = c_node
+ return node
+
+# element proxies that allow restricted modification
+
+@cython.internal
+cdef class _ModifyContentOnlyProxy(_ReadOnlyProxy):
+ u"""A read-only proxy that allows changing the text content.
+ """
+ property text:
+ def __get__(self):
+ self._assertNode()
+ if self._c_node.content is NULL:
+ return ''
+ else:
+ return funicode(self._c_node.content)
+
+ def __set__(self, value):
+ cdef tree.xmlDict* c_dict
+ self._assertNode()
+ if value is None:
+ c_text = <const_xmlChar*>NULL
+ else:
+ value = _utf8(value)
+ c_text = _xcstr(value)
+ tree.xmlNodeSetContent(self._c_node, c_text)
+
+@cython.final
+@cython.internal
+cdef class _ModifyContentOnlyPIProxy(_ModifyContentOnlyProxy):
+ """A read-only proxy that allows changing the text/target content of a
+ processing instruction.
+ """
+ property target:
+ def __get__(self):
+ self._assertNode()
+ return funicode(self._c_node.name)
+
+ def __set__(self, value):
+ self._assertNode()
+ value = _utf8(value)
+ c_text = _xcstr(value)
+ tree.xmlNodeSetName(self._c_node, c_text)
+
+@cython.final
+@cython.internal
+cdef class _ModifyContentOnlyEntityProxy(_ModifyContentOnlyProxy):
+ "A read-only proxy for entity references (for internal use only!)"
+ property name:
+ def __get__(self):
+ return funicode(self._c_node.name)
+
+ def __set__(self, value):
+ value = _utf8(value)
+ assert u'&' not in value and u';' not in value, \
+ f"Invalid entity name '{value}'"
+ c_text = _xcstr(value)
+ tree.xmlNodeSetName(self._c_node, c_text)
+
+
+@cython.final
+@cython.internal
+cdef class _AppendOnlyElementProxy(_ReadOnlyElementProxy):
+ u"""A read-only element that allows adding children and changing the
+ text content (i.e. everything that adds to the subtree).
+ """
+ cpdef append(self, other_element):
+ u"""Append a copy of an Element to the list of children.
+ """
+ cdef xmlNode* c_next
+ cdef xmlNode* c_node
+ self._assertNode()
+ c_node = _roNodeOf(other_element)
+ c_node = _copyNodeToDoc(c_node, self._c_node.doc)
+ c_next = c_node.next
+ tree.xmlAddChild(self._c_node, c_node)
+ _moveTail(c_next, c_node)
+
+ def extend(self, elements):
+ u"""Append a copy of all Elements from a sequence to the list of
+ children.
+ """
+ self._assertNode()
+ for element in elements:
+ self.append(element)
+
+ property text:
+ """Text before the first subelement. This is either a string or the
+ value None, if there was no text.
+ """
+ def __get__(self):
+ self._assertNode()
+ return _collectText(self._c_node.children)
+
+ def __set__(self, value):
+ self._assertNode()
+ if isinstance(value, QName):
+ value = _resolveQNameText(self, value).decode('utf8')
+ _setNodeText(self._c_node, value)
+
+
+cdef _ReadOnlyProxy _newAppendOnlyProxy(
+ _ReadOnlyProxy source_proxy, xmlNode* c_node):
+ cdef _ReadOnlyProxy el
+ if c_node.type == tree.XML_ELEMENT_NODE:
+ el = _AppendOnlyElementProxy.__new__(_AppendOnlyElementProxy)
+ elif c_node.type == tree.XML_PI_NODE:
+ el = _ModifyContentOnlyPIProxy.__new__(_ModifyContentOnlyPIProxy)
+ elif c_node.type == tree.XML_COMMENT_NODE:
+ el = _ModifyContentOnlyProxy.__new__(_ModifyContentOnlyProxy)
+ else:
+ raise TypeError(f"Unsupported element type: {c_node.type}")
+ el._c_node = c_node
+ _initReadOnlyProxy(el, source_proxy)
+ return el
+
+cdef xmlNode* _roNodeOf(element) except NULL:
+ cdef xmlNode* c_node
+ if isinstance(element, _Element):
+ c_node = (<_Element>element)._c_node
+ elif isinstance(element, _ReadOnlyProxy):
+ c_node = (<_ReadOnlyProxy>element)._c_node
+ elif isinstance(element, _OpaqueNodeWrapper):
+ c_node = (<_OpaqueNodeWrapper>element)._c_node
+ else:
+ raise TypeError, f"invalid argument type {type(element)}"
+
+ if c_node is NULL:
+ raise TypeError, u"invalid element"
+ return c_node
+
+cdef xmlNode* _nonRoNodeOf(element) except NULL:
+ cdef xmlNode* c_node
+ if isinstance(element, _Element):
+ c_node = (<_Element>element)._c_node
+ elif isinstance(element, _AppendOnlyElementProxy):
+ c_node = (<_AppendOnlyElementProxy>element)._c_node
+ elif isinstance(element, _OpaqueNodeWrapper):
+ c_node = (<_OpaqueNodeWrapper>element)._c_node
+ else:
+ raise TypeError, f"invalid argument type {type(element)}"
+
+ if c_node is NULL:
+ raise TypeError, u"invalid element"
+ return c_node
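
These read-only proxies are what lxml hands to user code in places where the tree must not be modified, most prominently PythonElementClassLookup: its lookup() method may inspect the element's tag, attributes and children through a _ReadOnlyElementProxy, but cannot change them. A short sketch of that use, following the documented class-lookup API:

    from lxml import etree

    class HonkElement(etree.ElementBase):
        @property
        def honking(self):
            return self.get("honking") == "true"

    class MyLookup(etree.PythonElementClassLookup):
        def lookup(self, document, element):
            # 'element' is a read-only proxy here: tag, attrib and
            # children can be read, but the tree cannot be modified.
            if element.tag == "honk":
                return HonkElement
            return None   # fall back to the default element class

    parser = etree.XMLParser()
    parser.set_element_class_lookup(MyLookup())
    root = etree.fromstring(b'<root><honk honking="true"/></root>', parser)
    print(root[0].honking)   # True
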
diff --git a/src/lxml/relaxng.pxi b/src/lxml/relaxng.pxi
new file mode 100644
index 0000000..6a82a29
--- /dev/null
+++ b/src/lxml/relaxng.pxi
@@ -0,0 +1,163 @@
+# support for RelaxNG validation
+from lxml.includes cimport relaxng
+
+cdef object _rnc2rng
+try:
+ import rnc2rng as _rnc2rng
+except ImportError:
+ _rnc2rng = None
+
+
+cdef int _require_rnc2rng() except -1:
+ if _rnc2rng is None:
+ raise RelaxNGParseError(
+ 'compact syntax not supported (please install rnc2rng)')
+ return 0
+
+
+cdef class RelaxNGError(LxmlError):
+ """Base class for RelaxNG errors.
+ """
+
+cdef class RelaxNGParseError(RelaxNGError):
+ """Error while parsing an XML document as RelaxNG.
+ """
+
+cdef class RelaxNGValidateError(RelaxNGError):
+ """Error while validating an XML document with a RelaxNG schema.
+ """
+
+
+################################################################################
+# RelaxNG
+
+cdef class RelaxNG(_Validator):
+ u"""RelaxNG(self, etree=None, file=None)
+ Turn a document into a Relax NG validator.
+
+ Either pass a schema as Element or ElementTree, or pass a file or
+ filename through the ``file`` keyword argument.
+ """
+ cdef relaxng.xmlRelaxNG* _c_schema
+ def __cinit__(self):
+ self._c_schema = NULL
+
+ def __init__(self, etree=None, *, file=None):
+ cdef _Document doc
+ cdef _Element root_node
+ cdef xmlDoc* fake_c_doc = NULL
+ cdef relaxng.xmlRelaxNGParserCtxt* parser_ctxt = NULL
+ _Validator.__init__(self)
+ if etree is not None:
+ doc = _documentOrRaise(etree)
+ root_node = _rootNodeOrRaise(etree)
+ fake_c_doc = _fakeRootDoc(doc._c_doc, root_node._c_node)
+ parser_ctxt = relaxng.xmlRelaxNGNewDocParserCtxt(fake_c_doc)
+ elif file is not None:
+ if _isString(file):
+ if file[-4:].lower() == '.rnc':
+ _require_rnc2rng()
+ rng_data_utf8 = _utf8(_rnc2rng.dumps(_rnc2rng.load(file)))
+ doc = _parseMemoryDocument(rng_data_utf8, parser=None, url=file)
+ parser_ctxt = relaxng.xmlRelaxNGNewDocParserCtxt(doc._c_doc)
+ else:
+ doc = None
+ filename = _encodeFilename(file)
+ with self._error_log:
+ orig_loader = _register_document_loader()
+ parser_ctxt = relaxng.xmlRelaxNGNewParserCtxt(_cstr(filename))
+ _reset_document_loader(orig_loader)
+ elif (_getFilenameForFile(file) or '')[-4:].lower() == '.rnc':
+ _require_rnc2rng()
+ rng_data_utf8 = _utf8(_rnc2rng.dumps(_rnc2rng.load(file)))
+ doc = _parseMemoryDocument(
+ rng_data_utf8, parser=None, url=_getFilenameForFile(file))
+ parser_ctxt = relaxng.xmlRelaxNGNewDocParserCtxt(doc._c_doc)
+ else:
+ doc = _parseDocument(file, parser=None, base_url=None)
+ parser_ctxt = relaxng.xmlRelaxNGNewDocParserCtxt(doc._c_doc)
+ else:
+ raise RelaxNGParseError, u"No tree or file given"
+
+ if parser_ctxt is NULL:
+ if fake_c_doc is not NULL:
+ _destroyFakeDoc(doc._c_doc, fake_c_doc)
+ raise RelaxNGParseError(
+ self._error_log._buildExceptionMessage(
+ u"Document is not parsable as Relax NG"),
+ self._error_log)
+
+ relaxng.xmlRelaxNGSetParserStructuredErrors(
+ parser_ctxt, _receiveError, <void*>self._error_log)
+ _connectGenericErrorLog(self._error_log, xmlerror.XML_FROM_RELAXNGP)
+ self._c_schema = relaxng.xmlRelaxNGParse(parser_ctxt)
+ _connectGenericErrorLog(None)
+
+ relaxng.xmlRelaxNGFreeParserCtxt(parser_ctxt)
+ if self._c_schema is NULL:
+ if fake_c_doc is not NULL:
+ _destroyFakeDoc(doc._c_doc, fake_c_doc)
+ raise RelaxNGParseError(
+ self._error_log._buildExceptionMessage(
+ u"Document is not valid Relax NG"),
+ self._error_log)
+ if fake_c_doc is not NULL:
+ _destroyFakeDoc(doc._c_doc, fake_c_doc)
+
+ def __dealloc__(self):
+ relaxng.xmlRelaxNGFree(self._c_schema)
+
+ def __call__(self, etree):
+ u"""__call__(self, etree)
+
+ Validate doc using Relax NG.
+
+ Returns true if document is valid, false if not."""
+ cdef _Document doc
+ cdef _Element root_node
+ cdef xmlDoc* c_doc
+ cdef relaxng.xmlRelaxNGValidCtxt* valid_ctxt
+ cdef int ret
+
+ assert self._c_schema is not NULL, "RelaxNG instance not initialised"
+ doc = _documentOrRaise(etree)
+ root_node = _rootNodeOrRaise(etree)
+
+ valid_ctxt = relaxng.xmlRelaxNGNewValidCtxt(self._c_schema)
+ if valid_ctxt is NULL:
+ raise MemoryError()
+
+ try:
+ self._error_log.clear()
+ relaxng.xmlRelaxNGSetValidStructuredErrors(
+ valid_ctxt, _receiveError, <void*>self._error_log)
+ _connectGenericErrorLog(self._error_log, xmlerror.XML_FROM_RELAXNGV)
+ c_doc = _fakeRootDoc(doc._c_doc, root_node._c_node)
+ with nogil:
+ ret = relaxng.xmlRelaxNGValidateDoc(valid_ctxt, c_doc)
+ _destroyFakeDoc(doc._c_doc, c_doc)
+ finally:
+ _connectGenericErrorLog(None)
+ relaxng.xmlRelaxNGFreeValidCtxt(valid_ctxt)
+
+ if ret == -1:
+ raise RelaxNGValidateError(
+ u"Internal error in Relax NG validation",
+ self._error_log)
+ if ret == 0:
+ return True
+ else:
+ return False
+
+ @classmethod
+ def from_rnc_string(cls, src, base_url=None):
+ """Parse a RelaxNG schema in compact syntax from a text string
+
+ Requires the rnc2rng package to be installed.
+
+ Passing the source URL or file path as 'base_url' will enable
+ resolving resource references relative to the source.
+ """
+ _require_rnc2rng()
+ rng_str = _utf8(_rnc2rng.dumps(_rnc2rng.loads(src)))
+ return cls(_parseMemoryDocument(rng_str, parser=None, url=base_url))
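+
+
+ # A rough sketch of ``from_rnc_string`` usage (illustrative only; the
+ # compact-syntax schema below is an assumption and requires rnc2rng):
+ #
+ #     rnc = "element doc { element item { text }* }"
+ #     relaxng = RelaxNG.from_rnc_string(rnc)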
diff --git a/src/lxml/sax.pxd b/src/lxml/sax.pxd
new file mode 100644
index 0000000..b1b7d2a
--- /dev/null
+++ b/src/lxml/sax.pxd
@@ -0,0 +1,16 @@
+# cython: language_level=2
+
+cimport cython
+
+cdef tuple _getNsTag(tag)
+
+cdef class ElementTreeProducer:
+ cdef _element
+ cdef _content_handler
+ cdef _attr_class
+ cdef _empty_attributes
+
+ @cython.locals(element_nsmap=dict)
+ cdef inline _recursive_saxify(self, element, dict parent_nsmap)
+
+ cdef inline _build_qname(self, ns_uri, local_name, dict nsmap, preferred_prefix, bint is_attribute)
diff --git a/src/lxml/sax.py b/src/lxml/sax.py
new file mode 100644
index 0000000..02ee3bf
--- /dev/null
+++ b/src/lxml/sax.py
@@ -0,0 +1,278 @@
+# cython: language_level=2
+
+"""
+SAX-based adapter to copy trees from/to the Python standard library.
+
+Use the `ElementTreeContentHandler` class to build an ElementTree from
+SAX events.
+
+Use the `ElementTreeProducer` class or the `saxify()` function to fire
+the SAX events of an ElementTree against a SAX ContentHandler.
+
+See https://lxml.de/sax.html
+"""
+
+from __future__ import absolute_import
+
+from xml.sax.handler import ContentHandler
+from lxml import etree
+from lxml.etree import ElementTree, SubElement
+from lxml.etree import Comment, ProcessingInstruction
+
+
+class SaxError(etree.LxmlError):
+ """General SAX error.
+ """
+
+
+def _getNsTag(tag):
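+ # e.g. '{http://example.org/ns}tag' -> ('http://example.org/ns', 'tag'),
+ # and a plain 'tag' -> (None, 'tag')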
+ if tag[0] == '{':
+ return tuple(tag[1:].split('}', 1))
+ else:
+ return None, tag
+
+
+class ElementTreeContentHandler(ContentHandler):
+ """Build an lxml ElementTree from SAX events.
+ """
+ def __init__(self, makeelement=None):
+ ContentHandler.__init__(self)
+ self._root = None
+ self._root_siblings = []
+ self._element_stack = []
+ self._default_ns = None
+ self._ns_mapping = { None : [None] }
+ self._new_mappings = {}
+ if makeelement is None:
+ makeelement = etree.Element
+ self._makeelement = makeelement
+
+ def _get_etree(self):
+ "Contains the generated ElementTree after parsing is finished."
+ return ElementTree(self._root)
+
+ etree = property(_get_etree, doc=_get_etree.__doc__)
+
+ def setDocumentLocator(self, locator):
+ pass
+
+ def startDocument(self):
+ pass
+
+ def endDocument(self):
+ pass
+
+ def startPrefixMapping(self, prefix, uri):
+ self._new_mappings[prefix] = uri
+ try:
+ self._ns_mapping[prefix].append(uri)
+ except KeyError:
+ self._ns_mapping[prefix] = [uri]
+ if prefix is None:
+ self._default_ns = uri
+
+ def endPrefixMapping(self, prefix):
+ ns_uri_list = self._ns_mapping[prefix]
+ ns_uri_list.pop()
+ if prefix is None:
+ self._default_ns = ns_uri_list[-1]
+
+ def _buildTag(self, ns_name_tuple):
+ ns_uri, local_name = ns_name_tuple
+ if ns_uri:
+ el_tag = "{%s}%s" % ns_name_tuple
+ elif self._default_ns:
+ el_tag = "{%s}%s" % (self._default_ns, local_name)
+ else:
+ el_tag = local_name
+ return el_tag
+
+ def startElementNS(self, ns_name, qname, attributes=None):
+ el_name = self._buildTag(ns_name)
+ if attributes:
+ attrs = {}
+ try:
+ iter_attributes = attributes.iteritems()
+ except AttributeError:
+ iter_attributes = attributes.items()
+
+ for name_tuple, value in iter_attributes:
+ if name_tuple[0]:
+ attr_name = "{%s}%s" % name_tuple
+ else:
+ attr_name = name_tuple[1]
+ attrs[attr_name] = value
+ else:
+ attrs = None
+
+ element_stack = self._element_stack
+ if self._root is None:
+ element = self._root = \
+ self._makeelement(el_name, attrs, self._new_mappings)
+ if self._root_siblings and hasattr(element, 'addprevious'):
+ for sibling in self._root_siblings:
+ element.addprevious(sibling)
+ del self._root_siblings[:]
+ else:
+ element = SubElement(element_stack[-1], el_name,
+ attrs, self._new_mappings)
+ element_stack.append(element)
+
+ self._new_mappings.clear()
+
+ def processingInstruction(self, target, data):
+ pi = ProcessingInstruction(target, data)
+ if self._root is None:
+ self._root_siblings.append(pi)
+ else:
+ self._element_stack[-1].append(pi)
+
+ def endElementNS(self, ns_name, qname):
+ element = self._element_stack.pop()
+ el_tag = self._buildTag(ns_name)
+ if el_tag != element.tag:
+ raise SaxError("Unexpected element closed: " + el_tag)
+
+ def startElement(self, name, attributes=None):
+ if attributes:
+ attributes = dict(
+ [((None, k), v) for k, v in attributes.items()]
+ )
+ self.startElementNS((None, name), name, attributes)
+
+ def endElement(self, name):
+ self.endElementNS((None, name), name)
+
+ def characters(self, data):
+ last_element = self._element_stack[-1]
+ try:
+ # if there already is a child element, we must append to its tail
+ last_element = last_element[-1]
+ last_element.tail = (last_element.tail or '') + data
+ except IndexError:
+ # otherwise: append to the text
+ last_element.text = (last_element.text or '') + data
+
+ ignorableWhitespace = characters
+
+
+class ElementTreeProducer(object):
+ """Produces SAX events for an element and children.
+ """
+ def __init__(self, element_or_tree, content_handler):
+ try:
+ element = element_or_tree.getroot()
+ except AttributeError:
+ element = element_or_tree
+ self._element = element
+ self._content_handler = content_handler
+ from xml.sax.xmlreader import AttributesNSImpl as attr_class
+ self._attr_class = attr_class
+ self._empty_attributes = attr_class({}, {})
+
+ def saxify(self):
+ self._content_handler.startDocument()
+
+ element = self._element
+ if hasattr(element, 'getprevious'):
+ siblings = []
+ sibling = element.getprevious()
+ while getattr(sibling, 'tag', None) is ProcessingInstruction:
+ siblings.append(sibling)
+ sibling = sibling.getprevious()
+ for sibling in siblings[::-1]:
+ self._recursive_saxify(sibling, {})
+
+ self._recursive_saxify(element, {})
+
+ if hasattr(element, 'getnext'):
+ sibling = element.getnext()
+ while getattr(sibling, 'tag', None) is ProcessingInstruction:
+ self._recursive_saxify(sibling, {})
+ sibling = sibling.getnext()
+
+ self._content_handler.endDocument()
+
+ def _recursive_saxify(self, element, parent_nsmap):
+ content_handler = self._content_handler
+ tag = element.tag
+ if tag is Comment or tag is ProcessingInstruction:
+ if tag is ProcessingInstruction:
+ content_handler.processingInstruction(
+ element.target, element.text)
+ tail = element.tail
+ if tail:
+ content_handler.characters(tail)
+ return
+
+ element_nsmap = element.nsmap
+ new_prefixes = []
+ if element_nsmap != parent_nsmap:
+ # There have been updates to the namespace
+ for prefix, ns_uri in element_nsmap.items():
+ if parent_nsmap.get(prefix) != ns_uri:
+ new_prefixes.append( (prefix, ns_uri) )
+
+ attribs = element.items()
+ if attribs:
+ attr_values = {}
+ attr_qnames = {}
+ for attr_ns_name, value in attribs:
+ attr_ns_tuple = _getNsTag(attr_ns_name)
+ attr_values[attr_ns_tuple] = value
+ attr_qnames[attr_ns_tuple] = self._build_qname(
+ attr_ns_tuple[0], attr_ns_tuple[1], element_nsmap,
+ preferred_prefix=None, is_attribute=True)
+ sax_attributes = self._attr_class(attr_values, attr_qnames)
+ else:
+ sax_attributes = self._empty_attributes
+
+ ns_uri, local_name = _getNsTag(tag)
+ qname = self._build_qname(
+ ns_uri, local_name, element_nsmap, element.prefix, is_attribute=False)
+
+ for prefix, uri in new_prefixes:
+ content_handler.startPrefixMapping(prefix, uri)
+ content_handler.startElementNS(
+ (ns_uri, local_name), qname, sax_attributes)
+ text = element.text
+ if text:
+ content_handler.characters(text)
+ for child in element:
+ self._recursive_saxify(child, element_nsmap)
+ content_handler.endElementNS((ns_uri, local_name), qname)
+ for prefix, uri in new_prefixes:
+ content_handler.endPrefixMapping(prefix)
+ tail = element.tail
+ if tail:
+ content_handler.characters(tail)
+
+ def _build_qname(self, ns_uri, local_name, nsmap, preferred_prefix, is_attribute):
+ if ns_uri is None:
+ return local_name
+
+ if not is_attribute and nsmap.get(preferred_prefix) == ns_uri:
+ prefix = preferred_prefix
+ else:
+ # Pick the first matching prefix, in alphabetical order.
+ candidates = [
+ pfx for (pfx, uri) in nsmap.items()
+ if pfx is not None and uri == ns_uri
+ ]
+ prefix = (
+ candidates[0] if len(candidates) == 1
+ else min(candidates) if candidates
+ else None
+ )
+
+ if prefix is None:
+ # Default namespace
+ return local_name
+ return prefix + ':' + local_name
+
+
+def saxify(element_or_tree, content_handler):
+ """One-shot helper to generate SAX events from an XML tree and fire
+ them against a SAX ContentHandler.
+ """
+ return ElementTreeProducer(element_or_tree, content_handler).saxify()
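+
+
+ # A minimal sketch of firing SAX events from an lxml tree (illustrative
+ # only; the XML snippet and output stream are assumptions):
+ #
+ #     import sys
+ #     from xml.sax.saxutils import XMLGenerator
+ #     root = etree.fromstring('<root xmlns="urn:example"><child/></root>')
+ #     saxify(root, XMLGenerator(sys.stdout))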
diff --git a/src/lxml/saxparser.pxi b/src/lxml/saxparser.pxi
new file mode 100644
index 0000000..49e72be
--- /dev/null
+++ b/src/lxml/saxparser.pxi
@@ -0,0 +1,867 @@
+# SAX-like interfaces
+
+class XMLSyntaxAssertionError(XMLSyntaxError, AssertionError):
+ """
+ An XMLSyntaxError that additionally inherits from AssertionError for
+ ElementTree / backwards compatibility reasons.
+
+ This class may get replaced by a plain XMLSyntaxError in a future version.
+ """
+
+
+ctypedef enum _SaxParserEvents:
+ SAX_EVENT_START = 1 << 0
+ SAX_EVENT_END = 1 << 1
+ SAX_EVENT_DATA = 1 << 2
+ SAX_EVENT_DOCTYPE = 1 << 3
+ SAX_EVENT_PI = 1 << 4
+ SAX_EVENT_COMMENT = 1 << 5
+ SAX_EVENT_START_NS = 1 << 6
+ SAX_EVENT_END_NS = 1 << 7
+
+ctypedef enum _ParseEventFilter:
+ PARSE_EVENT_FILTER_START = 1 << 0
+ PARSE_EVENT_FILTER_END = 1 << 1
+ PARSE_EVENT_FILTER_START_NS = 1 << 2
+ PARSE_EVENT_FILTER_END_NS = 1 << 3
+ PARSE_EVENT_FILTER_COMMENT = 1 << 4
+ PARSE_EVENT_FILTER_PI = 1 << 5
+
+
+cdef int _buildParseEventFilter(events) except -1:
+ cdef int event_filter
+ event_filter = 0
+ for event in events:
+ if event == 'start':
+ event_filter |= PARSE_EVENT_FILTER_START
+ elif event == 'end':
+ event_filter |= PARSE_EVENT_FILTER_END
+ elif event == 'start-ns':
+ event_filter |= PARSE_EVENT_FILTER_START_NS
+ elif event == 'end-ns':
+ event_filter |= PARSE_EVENT_FILTER_END_NS
+ elif event == 'comment':
+ event_filter |= PARSE_EVENT_FILTER_COMMENT
+ elif event == 'pi':
+ event_filter |= PARSE_EVENT_FILTER_PI
+ else:
+ raise ValueError, f"invalid event name '{event}'"
+ return event_filter
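+ # For example (illustrative), the iterparse-style event tuple
+ # ('start', 'end-ns') maps to
+ # PARSE_EVENT_FILTER_START | PARSE_EVENT_FILTER_END_NS.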
+
+
+cdef class _SaxParserTarget:
+ cdef int _sax_event_filter
+ def __cinit__(self):
+ self._sax_event_filter = 0
+
+ cdef _handleSaxStart(self, tag, attrib, nsmap):
+ return None
+ cdef _handleSaxEnd(self, tag):
+ return None
+ cdef int _handleSaxData(self, data) except -1:
+ return 0
+ cdef int _handleSaxDoctype(self, root_tag, public_id, system_id) except -1:
+ return 0
+ cdef _handleSaxPi(self, target, data):
+ return None
+ cdef _handleSaxComment(self, comment):
+ return None
+ cdef _handleSaxStartNs(self, prefix, uri):
+ return None
+ cdef _handleSaxEndNs(self, prefix):
+ return None
+
+
+#@cython.final
+@cython.internal
+@cython.no_gc_clear # Required because parent class uses it - Cython bug.
+cdef class _SaxParserContext(_ParserContext):
+ u"""This class maps SAX2 events to parser target events.
+ """
+ cdef _SaxParserTarget _target
+ cdef _BaseParser _parser
+ cdef xmlparser.startElementNsSAX2Func _origSaxStart
+ cdef xmlparser.endElementNsSAX2Func _origSaxEnd
+ cdef xmlparser.startElementSAXFunc _origSaxStartNoNs
+ cdef xmlparser.endElementSAXFunc _origSaxEndNoNs
+ cdef xmlparser.charactersSAXFunc _origSaxData
+ cdef xmlparser.cdataBlockSAXFunc _origSaxCData
+ cdef xmlparser.internalSubsetSAXFunc _origSaxDoctype
+ cdef xmlparser.commentSAXFunc _origSaxComment
+ cdef xmlparser.processingInstructionSAXFunc _origSaxPI
+ cdef xmlparser.startDocumentSAXFunc _origSaxStartDocument
+
+ # for event collecting
+ cdef int _event_filter
+ cdef list _ns_stack
+ cdef list _node_stack
+ cdef _ParseEventsIterator events_iterator
+
+ # for iterparse
+ cdef _Element _root
+ cdef _MultiTagMatcher _matcher
+
+ def __cinit__(self, _BaseParser parser):
+ self._ns_stack = []
+ self._node_stack = []
+ self._parser = parser
+ self.events_iterator = _ParseEventsIterator()
+
+ cdef void _setSaxParserTarget(self, _SaxParserTarget target):
+ self._target = target
+
+ cdef void _initParserContext(self, xmlparser.xmlParserCtxt* c_ctxt):
+ _ParserContext._initParserContext(self, c_ctxt)
+ if self._target is not None:
+ self._connectTarget(c_ctxt)
+ elif self._event_filter:
+ self._connectEvents(c_ctxt)
+
+ cdef void _connectTarget(self, xmlparser.xmlParserCtxt* c_ctxt):
+ """Wrap original SAX2 callbacks to call into parser target.
+ """
+ sax = c_ctxt.sax
+ self._origSaxStart = sax.startElementNs = NULL
+ self._origSaxStartNoNs = sax.startElement = NULL
+ if self._target._sax_event_filter & (SAX_EVENT_START |
+ SAX_EVENT_START_NS |
+ SAX_EVENT_END_NS):
+ # intercept => overwrite orig callback
+ # FIXME: also intercept when collecting END events
+ if sax.initialized == xmlparser.XML_SAX2_MAGIC:
+ sax.startElementNs = _handleSaxTargetStart
+ if self._target._sax_event_filter & SAX_EVENT_START:
+ sax.startElement = _handleSaxTargetStartNoNs
+
+ self._origSaxEnd = sax.endElementNs = NULL
+ self._origSaxEndNoNs = sax.endElement = NULL
+ if self._target._sax_event_filter & (SAX_EVENT_END |
+ SAX_EVENT_END_NS):
+ if sax.initialized == xmlparser.XML_SAX2_MAGIC:
+ sax.endElementNs = _handleSaxEnd
+ if self._target._sax_event_filter & SAX_EVENT_END:
+ sax.endElement = _handleSaxEndNoNs
+
+ self._origSaxData = sax.characters = sax.cdataBlock = NULL
+ if self._target._sax_event_filter & SAX_EVENT_DATA:
+ sax.characters = sax.cdataBlock = _handleSaxData
+
+ # doctype propagation is always required for entity replacement
+ self._origSaxDoctype = sax.internalSubset
+ if self._target._sax_event_filter & SAX_EVENT_DOCTYPE:
+ sax.internalSubset = _handleSaxTargetDoctype
+
+ self._origSaxPI = sax.processingInstruction = NULL
+ if self._target._sax_event_filter & SAX_EVENT_PI:
+ sax.processingInstruction = _handleSaxTargetPI
+
+ self._origSaxComment = sax.comment = NULL
+ if self._target._sax_event_filter & SAX_EVENT_COMMENT:
+ sax.comment = _handleSaxTargetComment
+
+ # enforce entity replacement
+ sax.reference = NULL
+ c_ctxt.replaceEntities = 1
+
+ cdef void _connectEvents(self, xmlparser.xmlParserCtxt* c_ctxt):
+ """Wrap original SAX2 callbacks to collect parse events without parser target.
+ """
+ sax = c_ctxt.sax
+ self._origSaxStartDocument = sax.startDocument
+ sax.startDocument = _handleSaxStartDocument
+
+ # only override "start" event handler if needed
+ self._origSaxStart = sax.startElementNs
+ if self._event_filter == 0 or c_ctxt.html or \
+ self._event_filter & (PARSE_EVENT_FILTER_START |
+ PARSE_EVENT_FILTER_END |
+ PARSE_EVENT_FILTER_START_NS |
+ PARSE_EVENT_FILTER_END_NS):
+ sax.startElementNs = <xmlparser.startElementNsSAX2Func>_handleSaxStart
+
+ self._origSaxStartNoNs = sax.startElement
+ if self._event_filter == 0 or c_ctxt.html or \
+ self._event_filter & (PARSE_EVENT_FILTER_START |
+ PARSE_EVENT_FILTER_END):
+ sax.startElement = <xmlparser.startElementSAXFunc>_handleSaxStartNoNs
+
+ # only override "end" event handler if needed
+ self._origSaxEnd = sax.endElementNs
+ if self._event_filter == 0 or \
+ self._event_filter & (PARSE_EVENT_FILTER_END |
+ PARSE_EVENT_FILTER_END_NS):
+ sax.endElementNs = <xmlparser.endElementNsSAX2Func>_handleSaxEnd
+
+ self._origSaxEndNoNs = sax.endElement
+ if self._event_filter == 0 or \
+ self._event_filter & PARSE_EVENT_FILTER_END:
+ sax.endElement = <xmlparser.endElementSAXFunc>_handleSaxEndNoNs
+
+ self._origSaxComment = sax.comment
+ if self._event_filter & PARSE_EVENT_FILTER_COMMENT:
+ sax.comment = <xmlparser.commentSAXFunc>_handleSaxComment
+
+ self._origSaxPI = sax.processingInstruction
+ if self._event_filter & PARSE_EVENT_FILTER_PI:
+ sax.processingInstruction = <xmlparser.processingInstructionSAXFunc>_handleSaxPIEvent
+
+ cdef _setEventFilter(self, events, tag):
+ self._event_filter = _buildParseEventFilter(events)
+ if not self._event_filter or tag is None or tag == '*':
+ self._matcher = None
+ else:
+ self._matcher = _MultiTagMatcher.__new__(_MultiTagMatcher, tag)
+
+ cdef int startDocument(self, xmlDoc* c_doc) except -1:
+ try:
+ self._doc = _documentFactory(c_doc, self._parser)
+ finally:
+ self._parser = None # clear circular reference ASAP
+ if self._matcher is not None:
+ self._matcher.cacheTags(self._doc, True) # force entry in libxml2 dict
+ return 0
+
+ cdef int pushEvent(self, event, xmlNode* c_node) except -1:
+ cdef _Element root
+ if self._root is None:
+ root = self._doc.getroot()
+ if root is not None and root._c_node.type == tree.XML_ELEMENT_NODE:
+ self._root = root
+ node = _elementFactory(self._doc, c_node)
+ self.events_iterator._events.append( (event, node) )
+ return 0
+
+ cdef int flushEvents(self) except -1:
+ events = self.events_iterator._events
+ while self._node_stack:
+ events.append( ('end', self._node_stack.pop()) )
+ _pushSaxNsEndEvents(self)
+ while self._ns_stack:
+ _pushSaxNsEndEvents(self)
+
+ cdef void _handleSaxException(self, xmlparser.xmlParserCtxt* c_ctxt):
+ if c_ctxt.errNo == xmlerror.XML_ERR_OK:
+ c_ctxt.errNo = xmlerror.XML_ERR_INTERNAL_ERROR
+ # stop parsing immediately
+ c_ctxt.wellFormed = 0
+ c_ctxt.disableSAX = 1
+ c_ctxt.instate = xmlparser.XML_PARSER_EOF
+ self._store_raised()
+
+
+@cython.final
+@cython.internal
+cdef class _ParseEventsIterator:
+ """A reusable parse events iterator"""
+ cdef list _events
+ cdef int _event_index
+
+ def __cinit__(self):
+ self._events = []
+ self._event_index = 0
+
+ def __iter__(self):
+ return self
+
+ def __next__(self):
+ cdef int event_index = self._event_index
+ events = self._events
+ if event_index >= 2**10 or event_index * 2 >= len(events):
+ if event_index:
+ # clean up from time to time
+ del events[:event_index]
+ self._event_index = event_index = 0
+ if event_index >= len(events):
+ raise StopIteration
+ item = events[event_index]
+ self._event_index = event_index + 1
+ return item
+
+
+cdef list _build_prefix_uri_list(_SaxParserContext context, int c_nb_namespaces,
+ const_xmlChar** c_namespaces):
+ "Build [(prefix, uri)] list of declared namespaces."
+ cdef int i
+ namespaces = []
+ for i in xrange(c_nb_namespaces):
+ namespaces.append((funicodeOrEmpty(c_namespaces[0]), funicode(c_namespaces[1])))
+ c_namespaces += 2
+ return namespaces
+
+
+cdef void _handleSaxStart(
+ void* ctxt, const_xmlChar* c_localname, const_xmlChar* c_prefix,
+ const_xmlChar* c_namespace, int c_nb_namespaces,
+ const_xmlChar** c_namespaces,
+ int c_nb_attributes, int c_nb_defaulted,
+ const_xmlChar** c_attributes) with gil:
+ cdef int i
+ cdef size_t c_len
+ c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
+ if c_ctxt._private is NULL or c_ctxt.disableSAX:
+ return
+ context = <_SaxParserContext>c_ctxt._private
+ cdef int event_filter = context._event_filter
+ try:
+ if (c_nb_namespaces and
+ event_filter & (PARSE_EVENT_FILTER_START_NS |
+ PARSE_EVENT_FILTER_END_NS)):
+ declared_namespaces = _build_prefix_uri_list(
+ context, c_nb_namespaces, c_namespaces)
+ if event_filter & PARSE_EVENT_FILTER_START_NS:
+ for prefix_uri_tuple in declared_namespaces:
+ context.events_iterator._events.append(("start-ns", prefix_uri_tuple))
+ else:
+ declared_namespaces = None
+
+ context._origSaxStart(c_ctxt, c_localname, c_prefix, c_namespace,
+ c_nb_namespaces, c_namespaces, c_nb_attributes,
+ c_nb_defaulted, c_attributes)
+ if c_ctxt.html:
+ _fixHtmlDictNodeNames(c_ctxt.dict, c_ctxt.node)
+
+ if event_filter & PARSE_EVENT_FILTER_END_NS:
+ context._ns_stack.append(declared_namespaces)
+ if event_filter & (PARSE_EVENT_FILTER_END |
+ PARSE_EVENT_FILTER_START):
+ _pushSaxStartEvent(context, c_ctxt, c_namespace, c_localname, None)
+ except:
+ context._handleSaxException(c_ctxt)
+ finally:
+ return # swallow any further exceptions
+
+
+cdef void _handleSaxTargetStart(
+ void* ctxt, const_xmlChar* c_localname, const_xmlChar* c_prefix,
+ const_xmlChar* c_namespace, int c_nb_namespaces,
+ const_xmlChar** c_namespaces,
+ int c_nb_attributes, int c_nb_defaulted,
+ const_xmlChar** c_attributes) with gil:
+ cdef int i
+ cdef size_t c_len
+ c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
+ if c_ctxt._private is NULL or c_ctxt.disableSAX:
+ return
+ context = <_SaxParserContext>c_ctxt._private
+
+ cdef int event_filter = context._event_filter
+ cdef int sax_event_filter = context._target._sax_event_filter
+ try:
+ if c_nb_namespaces:
+ declared_namespaces = _build_prefix_uri_list(
+ context, c_nb_namespaces, c_namespaces)
+
+ if event_filter & PARSE_EVENT_FILTER_START_NS:
+ for prefix_uri_tuple in declared_namespaces:
+ context.events_iterator._events.append(("start-ns", prefix_uri_tuple))
+
+ if sax_event_filter & SAX_EVENT_START_NS:
+ for prefix, uri in declared_namespaces:
+ context._target._handleSaxStartNs(prefix, uri)
+ #if not context._target._sax_event_filter & SAX_EVENT_START:
+ # # *Only* collecting start-ns events.
+ # return
+ else:
+ declared_namespaces = None
+
+ if sax_event_filter & SAX_EVENT_START:
+ if c_nb_defaulted > 0:
+ # only add default attributes if we asked for them
+ if c_ctxt.loadsubset & xmlparser.XML_COMPLETE_ATTRS == 0:
+ c_nb_attributes -= c_nb_defaulted
+ if c_nb_attributes == 0:
+ attrib = IMMUTABLE_EMPTY_MAPPING
+ else:
+ attrib = {}
+ for i in xrange(c_nb_attributes):
+ name = _namespacedNameFromNsName(
+ c_attributes[2], c_attributes[0])
+ if c_attributes[3] is NULL:
+ value = ''
+ else:
+ c_len = c_attributes[4] - c_attributes[3]
+ value = c_attributes[3][:c_len].decode('utf8')
+ attrib[name] = value
+ c_attributes += 5
+
+ nsmap = dict(declared_namespaces) if c_nb_namespaces else IMMUTABLE_EMPTY_MAPPING
+
+ element = _callTargetSaxStart(
+ context, c_ctxt,
+ _namespacedNameFromNsName(c_namespace, c_localname),
+ attrib, nsmap)
+ else:
+ element = None
+
+ if (event_filter & PARSE_EVENT_FILTER_END_NS or
+ sax_event_filter & SAX_EVENT_END_NS):
+ context._ns_stack.append(declared_namespaces)
+ if event_filter & (PARSE_EVENT_FILTER_END |
+ PARSE_EVENT_FILTER_START):
+ _pushSaxStartEvent(context, c_ctxt, c_namespace,
+ c_localname, element)
+ except:
+ context._handleSaxException(c_ctxt)
+ finally:
+ return # swallow any further exceptions
+
+
+cdef void _handleSaxStartNoNs(void* ctxt, const_xmlChar* c_name,
+ const_xmlChar** c_attributes) with gil:
+ c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
+ if c_ctxt._private is NULL or c_ctxt.disableSAX:
+ return
+ context = <_SaxParserContext>c_ctxt._private
+ try:
+ context._origSaxStartNoNs(c_ctxt, c_name, c_attributes)
+ if c_ctxt.html:
+ _fixHtmlDictNodeNames(c_ctxt.dict, c_ctxt.node)
+ if context._event_filter & (PARSE_EVENT_FILTER_END |
+ PARSE_EVENT_FILTER_START):
+ _pushSaxStartEvent(context, c_ctxt, NULL, c_name, None)
+ except:
+ context._handleSaxException(c_ctxt)
+ finally:
+ return # swallow any further exceptions
+
+
+cdef void _handleSaxTargetStartNoNs(void* ctxt, const_xmlChar* c_name,
+ const_xmlChar** c_attributes) with gil:
+ c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
+ if c_ctxt._private is NULL or c_ctxt.disableSAX:
+ return
+ context = <_SaxParserContext>c_ctxt._private
+ try:
+ if c_attributes is NULL:
+ attrib = IMMUTABLE_EMPTY_MAPPING
+ else:
+ attrib = {}
+ while c_attributes[0] is not NULL:
+ name = funicode(c_attributes[0])
+ attrib[name] = funicodeOrEmpty(c_attributes[1])
+ c_attributes += 2
+ element = _callTargetSaxStart(
+ context, c_ctxt, funicode(c_name),
+ attrib, IMMUTABLE_EMPTY_MAPPING)
+ if context._event_filter & (PARSE_EVENT_FILTER_END |
+ PARSE_EVENT_FILTER_START):
+ _pushSaxStartEvent(context, c_ctxt, NULL, c_name, element)
+ except:
+ context._handleSaxException(c_ctxt)
+ finally:
+ return # swallow any further exceptions
+
+
+cdef _callTargetSaxStart(_SaxParserContext context,
+ xmlparser.xmlParserCtxt* c_ctxt,
+ tag, attrib, nsmap):
+ element = context._target._handleSaxStart(tag, attrib, nsmap)
+ if element is not None and c_ctxt.input is not NULL:
+ if isinstance(element, _Element):
+ (<_Element>element)._c_node.line = (
+ <unsigned short>c_ctxt.input.line
+ if c_ctxt.input.line < 65535 else 65535)
+ return element
+
+
+cdef int _pushSaxStartEvent(_SaxParserContext context,
+ xmlparser.xmlParserCtxt* c_ctxt,
+ const_xmlChar* c_href,
+ const_xmlChar* c_name, node) except -1:
+ if (context._matcher is None or
+ context._matcher.matchesNsTag(c_href, c_name)):
+ if node is None and context._target is None:
+ assert context._doc is not None
+ node = _elementFactory(context._doc, c_ctxt.node)
+ if context._event_filter & PARSE_EVENT_FILTER_START:
+ context.events_iterator._events.append(('start', node))
+ if (context._target is None and
+ context._event_filter & PARSE_EVENT_FILTER_END):
+ context._node_stack.append(node)
+ return 0
+
+
+cdef void _handleSaxEnd(void* ctxt, const_xmlChar* c_localname,
+ const_xmlChar* c_prefix,
+ const_xmlChar* c_namespace) with gil:
+ c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
+ if c_ctxt._private is NULL or c_ctxt.disableSAX:
+ return
+ context = <_SaxParserContext>c_ctxt._private
+ try:
+ if context._target is not None:
+ if context._target._sax_event_filter & SAX_EVENT_END:
+ node = context._target._handleSaxEnd(
+ _namespacedNameFromNsName(c_namespace, c_localname))
+ else:
+ node = None
+ else:
+ context._origSaxEnd(c_ctxt, c_localname, c_prefix, c_namespace)
+ node = None
+ _pushSaxEndEvent(context, c_namespace, c_localname, node)
+ _pushSaxNsEndEvents(context)
+ except:
+ context._handleSaxException(c_ctxt)
+ finally:
+ return # swallow any further exceptions
+
+
+cdef void _handleSaxEndNoNs(void* ctxt, const_xmlChar* c_name) with gil:
+ c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
+ if c_ctxt._private is NULL or c_ctxt.disableSAX:
+ return
+ context = <_SaxParserContext>c_ctxt._private
+ try:
+ if context._target is not None:
+ node = context._target._handleSaxEnd(funicode(c_name))
+ else:
+ context._origSaxEndNoNs(c_ctxt, c_name)
+ node = None
+ _pushSaxEndEvent(context, NULL, c_name, node)
+ except:
+ context._handleSaxException(c_ctxt)
+ finally:
+ return # swallow any further exceptions
+
+
+cdef int _pushSaxNsEndEvents(_SaxParserContext context) except -1:
+ cdef bint build_events = context._event_filter & PARSE_EVENT_FILTER_END_NS
+ cdef bint call_target = (
+ context._target is not None
+ and context._target._sax_event_filter & SAX_EVENT_END_NS)
+ if not build_events and not call_target:
+ return 0
+
+ cdef list declared_namespaces = context._ns_stack.pop()
+ if declared_namespaces is None:
+ return 0
+
+ cdef tuple prefix_uri
+ for prefix_uri in reversed(declared_namespaces):
+ if call_target:
+ context._target._handleSaxEndNs(prefix_uri[0])
+ if build_events:
+ context.events_iterator._events.append(('end-ns', None))
+
+ return 0
+
+
+cdef int _pushSaxEndEvent(_SaxParserContext context,
+ const_xmlChar* c_href,
+ const_xmlChar* c_name, node) except -1:
+ if context._event_filter & PARSE_EVENT_FILTER_END:
+ if (context._matcher is None or
+ context._matcher.matchesNsTag(c_href, c_name)):
+ if context._target is None:
+ node = context._node_stack.pop()
+ context.events_iterator._events.append(('end', node))
+ return 0
+
+
+cdef void _handleSaxData(void* ctxt, const_xmlChar* c_data, int data_len) with gil:
+ # can only be called if parsing with a target
+ c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
+ if c_ctxt._private is NULL or c_ctxt.disableSAX:
+ return
+ context = <_SaxParserContext>c_ctxt._private
+ try:
+ context._target._handleSaxData(
+ c_data[:data_len].decode('utf8'))
+ except:
+ context._handleSaxException(c_ctxt)
+ finally:
+ return # swallow any further exceptions
+
+
+cdef void _handleSaxTargetDoctype(void* ctxt, const_xmlChar* c_name,
+ const_xmlChar* c_public,
+ const_xmlChar* c_system) with gil:
+ # can only be called if parsing with a target
+ c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
+ if c_ctxt._private is NULL or c_ctxt.disableSAX:
+ return
+ context = <_SaxParserContext>c_ctxt._private
+ try:
+ context._target._handleSaxDoctype(
+ funicodeOrNone(c_name),
+ funicodeOrNone(c_public),
+ funicodeOrNone(c_system))
+ except:
+ context._handleSaxException(c_ctxt)
+ finally:
+ return # swallow any further exceptions
+
+
+cdef void _handleSaxStartDocument(void* ctxt) with gil:
+ c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
+ if c_ctxt._private is NULL or c_ctxt.disableSAX:
+ return
+ context = <_SaxParserContext>c_ctxt._private
+ context._origSaxStartDocument(ctxt)
+ c_doc = c_ctxt.myDoc
+ try:
+ context.startDocument(c_doc)
+ except:
+ context._handleSaxException(c_ctxt)
+ finally:
+ return # swallow any further exceptions
+
+
+cdef void _handleSaxTargetPI(void* ctxt, const_xmlChar* c_target,
+ const_xmlChar* c_data) with gil:
+ # can only be called if parsing with a target
+ c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
+ if c_ctxt._private is NULL or c_ctxt.disableSAX:
+ return
+ context = <_SaxParserContext>c_ctxt._private
+ try:
+ pi = context._target._handleSaxPi(
+ funicodeOrNone(c_target),
+ funicodeOrEmpty(c_data))
+ if context._event_filter & PARSE_EVENT_FILTER_PI:
+ context.events_iterator._events.append(('pi', pi))
+ except:
+ context._handleSaxException(c_ctxt)
+ finally:
+ return # swallow any further exceptions
+
+
+cdef void _handleSaxPIEvent(void* ctxt, const_xmlChar* target,
+ const_xmlChar* data) with gil:
+ # can only be called when collecting pi events
+ c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
+ if c_ctxt._private is NULL or c_ctxt.disableSAX:
+ return
+ context = <_SaxParserContext>c_ctxt._private
+ context._origSaxPI(ctxt, target, data)
+ c_node = _findLastEventNode(c_ctxt)
+ if c_node is NULL:
+ return
+ try:
+ context.pushEvent('pi', c_node)
+ except:
+ context._handleSaxException(c_ctxt)
+ finally:
+ return # swallow any further exceptions
+
+
+cdef void _handleSaxTargetComment(void* ctxt, const_xmlChar* c_data) with gil:
+ # can only be called if parsing with a target
+ c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
+ if c_ctxt._private is NULL or c_ctxt.disableSAX:
+ return
+ context = <_SaxParserContext>c_ctxt._private
+ try:
+ comment = context._target._handleSaxComment(funicodeOrEmpty(c_data))
+ if context._event_filter & PARSE_EVENT_FILTER_COMMENT:
+ context.events_iterator._events.append(('comment', comment))
+ except:
+ context._handleSaxException(c_ctxt)
+ finally:
+ return # swallow any further exceptions
+
+
+cdef void _handleSaxComment(void* ctxt, const_xmlChar* text) with gil:
+ # can only be called when collecting comment events
+ c_ctxt = <xmlparser.xmlParserCtxt*>ctxt
+ if c_ctxt._private is NULL or c_ctxt.disableSAX:
+ return
+ context = <_SaxParserContext>c_ctxt._private
+ context._origSaxComment(ctxt, text)
+ c_node = _findLastEventNode(c_ctxt)
+ if c_node is NULL:
+ return
+ try:
+ context.pushEvent('comment', c_node)
+ except:
+ context._handleSaxException(c_ctxt)
+ finally:
+ return # swallow any further exceptions
+
+
+cdef inline xmlNode* _findLastEventNode(xmlparser.xmlParserCtxt* c_ctxt):
+ # this mimics what libxml2 creates for comments/PIs
+ if c_ctxt.inSubset == 1:
+ return c_ctxt.myDoc.intSubset.last
+ elif c_ctxt.inSubset == 2:
+ return c_ctxt.myDoc.extSubset.last
+ elif c_ctxt.node is NULL:
+ return c_ctxt.myDoc.last
+ elif c_ctxt.node.type == tree.XML_ELEMENT_NODE:
+ return c_ctxt.node.last
+ else:
+ return c_ctxt.node.next
+
+
+############################################################
+## ET compatible XML tree builder
+############################################################
+
+cdef class TreeBuilder(_SaxParserTarget):
+ u"""TreeBuilder(self, element_factory=None, parser=None,
+ comment_factory=None, pi_factory=None,
+ insert_comments=True, insert_pis=True)
+
+ Parser target that builds a tree from parse event callbacks.
+
+ The factory arguments can be used to influence the creation of
+ elements, comments and processing instructions.
+
+ By default, comments and processing instructions are inserted into
+ the tree, but they can be ignored by passing the respective flags.
+
+ The final tree is returned by the ``close()`` method.
+ """
+ cdef _BaseParser _parser
+ cdef object _factory
+ cdef object _comment_factory
+ cdef object _pi_factory
+ cdef list _data
+ cdef list _element_stack
+ cdef object _element_stack_pop
+ cdef _Element _last # may be None
+ cdef bint _in_tail
+ cdef bint _insert_comments
+ cdef bint _insert_pis
+
+ def __init__(self, *, element_factory=None, parser=None,
+ comment_factory=None, pi_factory=None,
+ bint insert_comments=True, bint insert_pis=True):
+ self._sax_event_filter = \
+ SAX_EVENT_START | SAX_EVENT_END | SAX_EVENT_DATA | \
+ SAX_EVENT_PI | SAX_EVENT_COMMENT
+ self._data = [] # data collector
+ self._element_stack = [] # element stack
+ self._element_stack_pop = self._element_stack.pop
+ self._last = None # last element
+ self._in_tail = 0 # true if we're after an end tag
+ self._factory = element_factory
+ self._comment_factory = comment_factory if comment_factory is not None else Comment
+ self._pi_factory = pi_factory if pi_factory is not None else ProcessingInstruction
+ self._insert_comments = insert_comments
+ self._insert_pis = insert_pis
+ self._parser = parser
+
+ @cython.final
+ cdef int _flush(self) except -1:
+ if self._data:
+ if self._last is not None:
+ text = u"".join(self._data)
+ if self._in_tail:
+ assert self._last.tail is None, u"internal error (tail)"
+ self._last.tail = text
+ else:
+ assert self._last.text is None, u"internal error (text)"
+ self._last.text = text
+ del self._data[:]
+ return 0
+
+ # internal SAX event handlers
+
+ @cython.final
+ cdef _handleSaxStart(self, tag, attrib, nsmap):
+ self._flush()
+ if self._factory is not None:
+ self._last = self._factory(tag, attrib)
+ if self._element_stack:
+ _appendChild(self._element_stack[-1], self._last)
+ elif self._element_stack:
+ self._last = _makeSubElement(
+ self._element_stack[-1], tag, None, None, attrib, nsmap, None)
+ else:
+ self._last = _makeElement(
+ tag, NULL, None, self._parser, None, None, attrib, nsmap, None)
+ self._element_stack.append(self._last)
+ self._in_tail = 0
+ return self._last
+
+ @cython.final
+ cdef _handleSaxEnd(self, tag):
+ self._flush()
+ self._last = self._element_stack_pop()
+ self._in_tail = 1
+ return self._last
+
+ @cython.final
+ cdef int _handleSaxData(self, data) except -1:
+ self._data.append(data)
+
+ @cython.final
+ cdef _handleSaxPi(self, target, data):
+ elem = self._pi_factory(target, data)
+ if self._insert_pis:
+ self._flush()
+ self._last = elem
+ if self._element_stack:
+ _appendChild(self._element_stack[-1], self._last)
+ self._in_tail = 1
+ return self._last
+
+ @cython.final
+ cdef _handleSaxComment(self, comment):
+ elem = self._comment_factory(comment)
+ if self._insert_comments:
+ self._flush()
+ self._last = elem
+ if self._element_stack:
+ _appendChild(self._element_stack[-1], self._last)
+ self._in_tail = 1
+ return elem
+
+ # Python level event handlers
+
+ def close(self):
+ u"""close(self)
+
+ Flushes the builder buffers, and returns the toplevel document
+ element. Raises XMLSyntaxError on inconsistencies.
+ """
+ if self._element_stack:
+ raise XMLSyntaxAssertionError("missing end tags")
+ # TODO: this does not necessarily seem like an error case. Why not just return None?
+ if self._last is None:
+ raise XMLSyntaxAssertionError("missing toplevel element")
+ return self._last
+
+ def data(self, data):
+ u"""data(self, data)
+
+ Adds text to the current element. The value should be either an
+ 8-bit string containing ASCII text, or a Unicode string.
+ """
+ self._handleSaxData(data)
+
+ def start(self, tag, attrs, nsmap=None):
+ u"""start(self, tag, attrs, nsmap=None)
+
+ Opens a new element.
+ """
+ if nsmap is None:
+ nsmap = IMMUTABLE_EMPTY_MAPPING
+ return self._handleSaxStart(tag, attrs, nsmap)
+
+ def end(self, tag):
+ u"""end(self, tag)
+
+ Closes the current element.
+ """
+ element = self._handleSaxEnd(tag)
+ assert self._last.tag == tag,\
+ f"end tag mismatch (expected {self._last.tag}, got {tag})"
+ return element
+
+ def pi(self, target, data=None):
+ u"""pi(self, target, data=None)
+
+ Creates a processing instruction using the factory, appends it
+ (unless disabled) and returns it.
+ """
+ return self._handleSaxPi(target, data)
+
+ def comment(self, comment):
+ u"""comment(self, comment)
+
+ Creates a comment using the factory, appends it (unless disabled)
+ and returns it.
+ """
+ return self._handleSaxComment(comment)
diff --git a/src/lxml/schematron.pxi b/src/lxml/schematron.pxi
new file mode 100644
index 0000000..dfd2cc0
--- /dev/null
+++ b/src/lxml/schematron.pxi
@@ -0,0 +1,167 @@
+# support for Schematron validation
+from lxml.includes cimport schematron
+
+
+cdef class SchematronError(LxmlError):
+ """Base class of all Schematron errors.
+ """
+
+cdef class SchematronParseError(SchematronError):
+ """Error while parsing an XML document as Schematron schema.
+ """
+
+cdef class SchematronValidateError(SchematronError):
+ """Error while validating an XML document with a Schematron schema.
+ """
+
+
+################################################################################
+# Schematron
+
+cdef class Schematron(_Validator):
+ u"""Schematron(self, etree=None, file=None)
+ A Schematron validator.
+
+ Pass a root Element or an ElementTree to turn it into a validator.
+ Alternatively, pass a filename as keyword argument 'file' to parse from
+ the file system.
+
+ Schematron is a less well-known but very powerful schema language. Its main
+ idea is to use the capabilities of XPath to put restrictions on the structure
+ and the content of XML documents. Here is a simple example::
+
+ >>> schematron = Schematron(XML('''
+ ... <schema xmlns="http://www.ascc.net/xml/schematron" >
+ ... <pattern name="id is the only permitted attribute name">
+ ... <rule context="*">
+ ... <report test="@*[not(name()='id')]">Attribute
+ ... <name path="@*[not(name()='id')]"/> is forbidden<name/>
+ ... </report>
+ ... </rule>
+ ... </pattern>
+ ... </schema>
+ ... '''))
+
+ >>> xml = XML('''
+ ... <AAA name="aaa">
+ ... <BBB id="bbb"/>
+ ... <CCC color="ccc"/>
+ ... </AAA>
+ ... ''')
+
+ >>> schematron.validate(xml)
+ 0
+
+ >>> xml = XML('''
+ ... <AAA id="aaa">
+ ... <BBB id="bbb"/>
+ ... <CCC/>
+ ... </AAA>
+ ... ''')
+
+ >>> schematron.validate(xml)
+ 1
+
+ Schematron support was added to libxml2 in version 2.6.21. Before version
+ 2.6.32, however, Schematron could only report errors to stderr, so libxml2
+ 2.6.32 or later is required to retrieve validation warnings and errors
+ through lxml.
+ """
+ cdef schematron.xmlSchematron* _c_schema
+ cdef xmlDoc* _c_schema_doc
+ def __cinit__(self):
+ self._c_schema = NULL
+ self._c_schema_doc = NULL
+
+ def __init__(self, etree=None, *, file=None):
+ cdef _Document doc
+ cdef _Element root_node
+ cdef xmlNode* c_node
+ cdef char* c_href
+ cdef schematron.xmlSchematronParserCtxt* parser_ctxt = NULL
+ _Validator.__init__(self)
+ if not config.ENABLE_SCHEMATRON:
+ raise SchematronError, \
+ u"lxml.etree was compiled without Schematron support."
+ if etree is not None:
+ doc = _documentOrRaise(etree)
+ root_node = _rootNodeOrRaise(etree)
+ self._c_schema_doc = _copyDocRoot(doc._c_doc, root_node._c_node)
+ parser_ctxt = schematron.xmlSchematronNewDocParserCtxt(self._c_schema_doc)
+ elif file is not None:
+ filename = _getFilenameForFile(file)
+ if filename is None:
+ # XXX assume a string object
+ filename = file
+ filename = _encodeFilename(filename)
+ with self._error_log:
+ orig_loader = _register_document_loader()
+ parser_ctxt = schematron.xmlSchematronNewParserCtxt(_cstr(filename))
+ _reset_document_loader(orig_loader)
+ else:
+ raise SchematronParseError, u"No tree or file given"
+
+ if parser_ctxt is NULL:
+ if self._c_schema_doc is not NULL:
+ tree.xmlFreeDoc(self._c_schema_doc)
+ self._c_schema_doc = NULL
+ raise MemoryError()
+
+ try:
+ with self._error_log:
+ orig_loader = _register_document_loader()
+ self._c_schema = schematron.xmlSchematronParse(parser_ctxt)
+ _reset_document_loader(orig_loader)
+ finally:
+ schematron.xmlSchematronFreeParserCtxt(parser_ctxt)
+
+ if self._c_schema is NULL:
+ raise SchematronParseError(
+ u"Document is not a valid Schematron schema",
+ self._error_log)
+
+ def __dealloc__(self):
+ schematron.xmlSchematronFree(self._c_schema)
+ if self._c_schema_doc is not NULL:
+ tree.xmlFreeDoc(self._c_schema_doc)
+
+ def __call__(self, etree):
+ u"""__call__(self, etree)
+
+ Validate doc using Schematron.
+
+ Returns true if the document is valid, false if not."""
+ cdef _Document doc
+ cdef _Element root_node
+ cdef xmlDoc* c_doc
+ cdef schematron.xmlSchematronValidCtxt* valid_ctxt
+ cdef int ret
+
+ assert self._c_schema is not NULL, "Schematron instance not initialised"
+ doc = _documentOrRaise(etree)
+ root_node = _rootNodeOrRaise(etree)
+
+ valid_ctxt = schematron.xmlSchematronNewValidCtxt(
+ self._c_schema, schematron.XML_SCHEMATRON_OUT_ERROR)
+ if valid_ctxt is NULL:
+ raise MemoryError()
+
+ try:
+ self._error_log.clear()
+ schematron.xmlSchematronSetValidStructuredErrors(
+ valid_ctxt, _receiveError, <void*>self._error_log)
+ c_doc = _fakeRootDoc(doc._c_doc, root_node._c_node)
+ with nogil:
+ ret = schematron.xmlSchematronValidateDoc(valid_ctxt, c_doc)
+ _destroyFakeDoc(doc._c_doc, c_doc)
+ finally:
+ schematron.xmlSchematronFreeValidCtxt(valid_ctxt)
+
+ if ret == -1:
+ raise SchematronValidateError(
+ u"Internal error in Schematron validation",
+ self._error_log)
+ if ret == 0:
+ return True
+ else:
+ return False
diff --git a/src/lxml/serializer.pxi b/src/lxml/serializer.pxi
new file mode 100644
index 0000000..d66f59a
--- /dev/null
+++ b/src/lxml/serializer.pxi
@@ -0,0 +1,1870 @@
+# XML serialization and output functions
+
+cdef object GzipFile
+from gzip import GzipFile
+
+
+cdef class SerialisationError(LxmlError):
+ """A libxml2 error that occurred during serialisation.
+ """
+
+
+cdef enum _OutputMethods:
+ OUTPUT_METHOD_XML
+ OUTPUT_METHOD_HTML
+ OUTPUT_METHOD_TEXT
+
+
+cdef int _findOutputMethod(method) except -1:
+ if method is None:
+ return OUTPUT_METHOD_XML
+ method = method.lower()
+ if method == "xml":
+ return OUTPUT_METHOD_XML
+ if method == "html":
+ return OUTPUT_METHOD_HTML
+ if method == "text":
+ return OUTPUT_METHOD_TEXT
+ raise ValueError(f"unknown output method {method!r}")
+
+
+cdef _textToString(xmlNode* c_node, encoding, bint with_tail):
+ cdef bint needs_conversion
+ cdef const_xmlChar* c_text
+ cdef xmlNode* c_text_node
+ cdef tree.xmlBuffer* c_buffer
+ cdef int error_result
+
+ c_buffer = tree.xmlBufferCreate()
+ if c_buffer is NULL:
+ raise MemoryError()
+
+ with nogil:
+ error_result = tree.xmlNodeBufGetContent(c_buffer, c_node)
+ if with_tail:
+ c_text_node = _textNodeOrSkip(c_node.next)
+ while c_text_node is not NULL:
+ tree.xmlBufferWriteChar(c_buffer, <const_char*>c_text_node.content)
+ c_text_node = _textNodeOrSkip(c_text_node.next)
+ c_text = tree.xmlBufferContent(c_buffer)
+
+ if error_result < 0 or c_text is NULL:
+ tree.xmlBufferFree(c_buffer)
+ raise SerialisationError, u"Error during serialisation (out of memory?)"
+
+ try:
+ needs_conversion = 0
+ if encoding is unicode:
+ needs_conversion = 1
+ elif encoding is not None:
+ # Python prefers lower case encoding names
+ encoding = encoding.lower()
+ if encoding not in (u'utf8', u'utf-8'):
+ if encoding == u'ascii':
+ if isutf8l(c_text, tree.xmlBufferLength(c_buffer)):
+ # will raise a decode error below
+ needs_conversion = 1
+ else:
+ needs_conversion = 1
+
+ if needs_conversion:
+ text = python.PyUnicode_DecodeUTF8(
+ <const_char*>c_text, tree.xmlBufferLength(c_buffer), 'strict')
+ if encoding is not unicode:
+ encoding = _utf8(encoding)
+ text = python.PyUnicode_AsEncodedString(
+ text, encoding, 'strict')
+ else:
+ text = (<unsigned char*>c_text)[:tree.xmlBufferLength(c_buffer)]
+ finally:
+ tree.xmlBufferFree(c_buffer)
+ return text
+
+
+cdef _tostring(_Element element, encoding, doctype, method,
+ bint write_xml_declaration, bint write_complete_document,
+ bint pretty_print, bint with_tail, int standalone):
+ u"""Serialize an element to an encoded string representation of its XML
+ tree.
+ """
+ cdef tree.xmlOutputBuffer* c_buffer
+ cdef tree.xmlBuf* c_result_buffer
+ cdef tree.xmlCharEncodingHandler* enchandler
+ cdef const_char* c_enc
+ cdef const_xmlChar* c_version
+ cdef const_xmlChar* c_doctype
+ cdef int c_method
+ cdef int error_result
+ if element is None:
+ return None
+ _assertValidNode(element)
+ c_method = _findOutputMethod(method)
+ if c_method == OUTPUT_METHOD_TEXT:
+ return _textToString(element._c_node, encoding, with_tail)
+ if encoding is None or encoding is unicode:
+ c_enc = NULL
+ else:
+ encoding = _utf8(encoding)
+ c_enc = _cstr(encoding)
+ if doctype is None:
+ c_doctype = NULL
+ else:
+ doctype = _utf8(doctype)
+ c_doctype = _xcstr(doctype)
+ # it is necessary to *both* find the encoding handler *and* use the
+ # encoding during output
+ enchandler = tree.xmlFindCharEncodingHandler(c_enc)
+ if enchandler is NULL and c_enc is not NULL:
+ if encoding is not None:
+ encoding = encoding.decode('UTF-8')
+ raise LookupError, f"unknown encoding: '{encoding}'"
+ c_buffer = tree.xmlAllocOutputBuffer(enchandler)
+ if c_buffer is NULL:
+ tree.xmlCharEncCloseFunc(enchandler)
+ raise MemoryError()
+
+ with nogil:
+ _writeNodeToBuffer(c_buffer, element._c_node, c_enc, c_doctype, c_method,
+ write_xml_declaration, write_complete_document,
+ pretty_print, with_tail, standalone)
+ tree.xmlOutputBufferFlush(c_buffer)
+ if c_buffer.conv is not NULL:
+ c_result_buffer = c_buffer.conv
+ else:
+ c_result_buffer = c_buffer.buffer
+
+ error_result = c_buffer.error
+ if error_result != xmlerror.XML_ERR_OK:
+ tree.xmlOutputBufferClose(c_buffer)
+ _raiseSerialisationError(error_result)
+
+ try:
+ if encoding is unicode:
+ result = (<unsigned char*>tree.xmlBufContent(
+ c_result_buffer))[:tree.xmlBufUse(c_result_buffer)].decode('UTF-8')
+ else:
+ result = <bytes>(<unsigned char*>tree.xmlBufContent(
+ c_result_buffer))[:tree.xmlBufUse(c_result_buffer)]
+ finally:
+ error_result = tree.xmlOutputBufferClose(c_buffer)
+ if error_result == -1:
+ _raiseSerialisationError(error_result)
+ return result
+
+cdef bytes _tostringC14N(element_or_tree, bint exclusive, bint with_comments, inclusive_ns_prefixes):
+ cdef xmlDoc* c_doc
+ cdef xmlChar* c_buffer = NULL
+ cdef int byte_count = -1
+ cdef bytes result
+ cdef _Document doc
+ cdef _Element element
+ cdef xmlChar **c_inclusive_ns_prefixes
+
+ if isinstance(element_or_tree, _Element):
+ _assertValidNode(<_Element>element_or_tree)
+ doc = (<_Element>element_or_tree)._doc
+ c_doc = _plainFakeRootDoc(doc._c_doc, (<_Element>element_or_tree)._c_node, 0)
+ else:
+ doc = _documentOrRaise(element_or_tree)
+ _assertValidDoc(doc)
+ c_doc = doc._c_doc
+
+ c_inclusive_ns_prefixes = _convert_ns_prefixes(c_doc.dict, inclusive_ns_prefixes) if inclusive_ns_prefixes else NULL
+ try:
+ with nogil:
+ byte_count = c14n.xmlC14NDocDumpMemory(
+ c_doc, NULL, exclusive, c_inclusive_ns_prefixes, with_comments, &c_buffer)
+
+ finally:
+ _destroyFakeDoc(doc._c_doc, c_doc)
+ if c_inclusive_ns_prefixes is not NULL:
+ python.lxml_free(c_inclusive_ns_prefixes)
+
+ if byte_count < 0 or c_buffer is NULL:
+ if c_buffer is not NULL:
+ tree.xmlFree(c_buffer)
+ raise C14NError, u"C14N failed"
+ try:
+ result = c_buffer[:byte_count]
+ finally:
+ tree.xmlFree(c_buffer)
+ return result
+
+cdef _raiseSerialisationError(int error_result):
+ if error_result == xmlerror.XML_ERR_NO_MEMORY:
+ raise MemoryError()
+ message = ErrorTypes._getName(error_result)
+ if message is None:
+ message = f"unknown error {error_result}"
+ raise SerialisationError, message
+
+############################################################
+# low-level serialisation functions
+
+cdef void _writeDoctype(tree.xmlOutputBuffer* c_buffer,
+ const_xmlChar* c_doctype) nogil:
+ tree.xmlOutputBufferWrite(c_buffer, tree.xmlStrlen(c_doctype),
+ <const_char*>c_doctype)
+ tree.xmlOutputBufferWriteString(c_buffer, "\n")
+
+cdef void _writeNodeToBuffer(tree.xmlOutputBuffer* c_buffer,
+ xmlNode* c_node, const_char* encoding, const_xmlChar* c_doctype,
+ int c_method, bint write_xml_declaration,
+ bint write_complete_document,
+ bint pretty_print, bint with_tail,
+ int standalone) nogil:
+ cdef xmlNode* c_nsdecl_node
+ cdef xmlDoc* c_doc = c_node.doc
+ if write_xml_declaration and c_method == OUTPUT_METHOD_XML:
+ _writeDeclarationToBuffer(c_buffer, c_doc.version, encoding, standalone)
+
+ # comments/processing instructions before doctype declaration
+ if write_complete_document and not c_buffer.error and c_doc.intSubset:
+ _writePrevSiblings(c_buffer, <xmlNode*>c_doc.intSubset, encoding, pretty_print)
+
+ if c_doctype:
+ _writeDoctype(c_buffer, c_doctype)
+ # write internal DTD subset, preceding PIs/comments, etc.
+ if write_complete_document and not c_buffer.error:
+ if c_doctype is NULL:
+ _writeDtdToBuffer(c_buffer, c_doc, c_node.name, c_method, encoding)
+ _writePrevSiblings(c_buffer, c_node, encoding, pretty_print)
+
+ c_nsdecl_node = c_node
+ if not c_node.parent or c_node.parent.type != tree.XML_DOCUMENT_NODE:
+ # copy the node and add namespaces from parents
+ # this is required to make libxml write them
+ c_nsdecl_node = tree.xmlCopyNode(c_node, 2)
+ if not c_nsdecl_node:
+ c_buffer.error = xmlerror.XML_ERR_NO_MEMORY
+ return
+ _copyParentNamespaces(c_node, c_nsdecl_node)
+
+ c_nsdecl_node.parent = c_node.parent
+ c_nsdecl_node.children = c_node.children
+ c_nsdecl_node.last = c_node.last
+
+ # write node
+ if c_method == OUTPUT_METHOD_HTML:
+ tree.htmlNodeDumpFormatOutput(
+ c_buffer, c_doc, c_nsdecl_node, encoding, pretty_print)
+ else:
+ tree.xmlNodeDumpOutput(
+ c_buffer, c_doc, c_nsdecl_node, 0, pretty_print, encoding)
+
+ if c_nsdecl_node is not c_node:
+ # clean up
+ c_nsdecl_node.children = c_nsdecl_node.last = NULL
+ tree.xmlFreeNode(c_nsdecl_node)
+
+ if c_buffer.error:
+ return
+
+ # write tail, trailing comments, etc.
+ if with_tail:
+ _writeTail(c_buffer, c_node, encoding, c_method, pretty_print)
+ if write_complete_document:
+ _writeNextSiblings(c_buffer, c_node, encoding, pretty_print)
+ if pretty_print:
+ tree.xmlOutputBufferWrite(c_buffer, 1, "\n")
+
+cdef void _writeDeclarationToBuffer(tree.xmlOutputBuffer* c_buffer,
+ const_xmlChar* version, const_char* encoding,
+ int standalone) nogil:
+ if version is NULL:
+ version = <unsigned char*>"1.0"
+ tree.xmlOutputBufferWrite(c_buffer, 15, "<?xml version='")
+ tree.xmlOutputBufferWriteString(c_buffer, <const_char*>version)
+ tree.xmlOutputBufferWrite(c_buffer, 12, "' encoding='")
+ tree.xmlOutputBufferWriteString(c_buffer, encoding)
+ if standalone == 0:
+ tree.xmlOutputBufferWrite(c_buffer, 20, "' standalone='no'?>\n")
+ elif standalone == 1:
+ tree.xmlOutputBufferWrite(c_buffer, 21, "' standalone='yes'?>\n")
+ else:
+ tree.xmlOutputBufferWrite(c_buffer, 4, "'?>\n")
+
+cdef void _writeDtdToBuffer(tree.xmlOutputBuffer* c_buffer,
+ xmlDoc* c_doc, const_xmlChar* c_root_name,
+ int c_method, const_char* encoding) nogil:
+ cdef tree.xmlDtd* c_dtd
+ cdef xmlNode* c_node
+ cdef char* quotechar
+ c_dtd = c_doc.intSubset
+ if not c_dtd or not c_dtd.name:
+ return
+
+ # Name in document type declaration must match the root element tag.
+ # For XML, case sensitive match, for HTML insensitive.
+ if c_method == OUTPUT_METHOD_HTML:
+ if tree.xmlStrcasecmp(c_root_name, c_dtd.name) != 0:
+ return
+ else:
+ if tree.xmlStrcmp(c_root_name, c_dtd.name) != 0:
+ return
+
+ tree.xmlOutputBufferWrite(c_buffer, 10, "<!DOCTYPE ")
+ tree.xmlOutputBufferWriteString(c_buffer, <const_char*>c_dtd.name)
+
+ cdef const_xmlChar* public_id = c_dtd.ExternalID
+ cdef const_xmlChar* sys_url = c_dtd.SystemID
+ if public_id and public_id[0] == b'\0':
+ public_id = NULL
+ if sys_url and sys_url[0] == b'\0':
+ sys_url = NULL
+
+ if public_id:
+ tree.xmlOutputBufferWrite(c_buffer, 9, ' PUBLIC "')
+ tree.xmlOutputBufferWriteString(c_buffer, <const_char*>public_id)
+ if sys_url:
+ tree.xmlOutputBufferWrite(c_buffer, 2, '" ')
+ else:
+ tree.xmlOutputBufferWrite(c_buffer, 1, '"')
+ elif sys_url:
+ tree.xmlOutputBufferWrite(c_buffer, 8, ' SYSTEM ')
+
+ if sys_url:
+ if tree.xmlStrchr(sys_url, b'"'):
+ quotechar = '\''
+ else:
+ quotechar = '"'
+ tree.xmlOutputBufferWrite(c_buffer, 1, quotechar)
+ tree.xmlOutputBufferWriteString(c_buffer, <const_char*>sys_url)
+ tree.xmlOutputBufferWrite(c_buffer, 1, quotechar)
+
+ if (not c_dtd.entities and not c_dtd.elements and
+ not c_dtd.attributes and not c_dtd.notations and
+ not c_dtd.pentities):
+ tree.xmlOutputBufferWrite(c_buffer, 2, '>\n')
+ return
+
+ tree.xmlOutputBufferWrite(c_buffer, 3, ' [\n')
+ if c_dtd.notations and not c_buffer.error:
+ c_buf = tree.xmlBufferCreate()
+ if not c_buf:
+ c_buffer.error = xmlerror.XML_ERR_NO_MEMORY
+ return
+ tree.xmlDumpNotationTable(c_buf, <tree.xmlNotationTable*>c_dtd.notations)
+ tree.xmlOutputBufferWrite(
+ c_buffer, tree.xmlBufferLength(c_buf),
+ <const_char*>tree.xmlBufferContent(c_buf))
+ tree.xmlBufferFree(c_buf)
+ c_node = c_dtd.children
+ while c_node and not c_buffer.error:
+ tree.xmlNodeDumpOutput(c_buffer, c_node.doc, c_node, 0, 0, encoding)
+ c_node = c_node.next
+ tree.xmlOutputBufferWrite(c_buffer, 3, "]>\n")
+
+cdef void _writeTail(tree.xmlOutputBuffer* c_buffer, xmlNode* c_node,
+ const_char* encoding, int c_method, bint pretty_print) nogil:
+ u"Write the element tail."
+ c_node = c_node.next
+ while c_node and not c_buffer.error and c_node.type in (
+ tree.XML_TEXT_NODE, tree.XML_CDATA_SECTION_NODE):
+ if c_method == OUTPUT_METHOD_HTML:
+ tree.htmlNodeDumpFormatOutput(
+ c_buffer, c_node.doc, c_node, encoding, pretty_print)
+ else:
+ tree.xmlNodeDumpOutput(
+ c_buffer, c_node.doc, c_node, 0, pretty_print, encoding)
+ c_node = c_node.next
+
+cdef void _writePrevSiblings(tree.xmlOutputBuffer* c_buffer, xmlNode* c_node,
+ const_char* encoding, bint pretty_print) nogil:
+ cdef xmlNode* c_sibling
+ if c_node.parent and _isElement(c_node.parent):
+ return
+ # we are at a root node, so add PI and comment siblings
+ c_sibling = c_node
+ while c_sibling.prev and \
+ (c_sibling.prev.type == tree.XML_PI_NODE or
+ c_sibling.prev.type == tree.XML_COMMENT_NODE):
+ c_sibling = c_sibling.prev
+ while c_sibling is not c_node and not c_buffer.error:
+ tree.xmlNodeDumpOutput(c_buffer, c_node.doc, c_sibling, 0,
+ pretty_print, encoding)
+ if pretty_print:
+ tree.xmlOutputBufferWriteString(c_buffer, "\n")
+ c_sibling = c_sibling.next
+
+cdef void _writeNextSiblings(tree.xmlOutputBuffer* c_buffer, xmlNode* c_node,
+ const_char* encoding, bint pretty_print) nogil:
+ cdef xmlNode* c_sibling
+ if c_node.parent and _isElement(c_node.parent):
+ return
+ # we are at a root node, so add PI and comment siblings
+ c_sibling = c_node.next
+ while not c_buffer.error and c_sibling and \
+ (c_sibling.type == tree.XML_PI_NODE or
+ c_sibling.type == tree.XML_COMMENT_NODE):
+ if pretty_print:
+ tree.xmlOutputBufferWriteString(c_buffer, "\n")
+ tree.xmlNodeDumpOutput(c_buffer, c_node.doc, c_sibling, 0,
+ pretty_print, encoding)
+ c_sibling = c_sibling.next
+
+
+# copied and adapted from libxml2
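+ # Writes a hexadecimal character reference for ``val`` into ``out``,
+ # e.g. val == 0xA9 produces the bytes "&#xA9;" (illustrative).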
+cdef unsigned char *xmlSerializeHexCharRef(unsigned char *out, int val):
+ cdef xmlChar *ptr
+ cdef xmlChar c
+
+ out[0] = '&'
+ out += 1
+
+ out[0] = '#'
+ out += 1
+
+ out[0] = 'x'
+ out += 1
+
+ if val < 0x10:
+ ptr = out
+ elif val < 0x100:
+ ptr = out + 1
+ elif val < 0x1000:
+ ptr = out + 2
+ elif val < 0x10000:
+ ptr = out + 3
+ elif val < 0x100000:
+ ptr = out + 4
+ else:
+ ptr = out + 5
+
+ out = ptr + 1
+ while val > 0:
+ c = (val & 0xF)
+
+ if c == 0:
+ ptr[0] = '0'
+ elif c == 1:
+ ptr[0] = '1'
+ elif c == 2:
+ ptr[0] = '2'
+ elif c == 3:
+ ptr[0] = '3'
+ elif c == 4:
+ ptr[0] = '4'
+ elif c == 5:
+ ptr[0] = '5'
+ elif c == 6:
+ ptr[0] = '6'
+ elif c == 7:
+ ptr[0] = '7'
+ elif c == 8:
+ ptr[0] = '8'
+ elif c == 9:
+ ptr[0] = '9'
+ elif c == 0xA:
+ ptr[0] = 'A'
+ elif c == 0xB:
+ ptr[0] = 'B'
+ elif c == 0xC:
+ ptr[0] = 'C'
+ elif c == 0xD:
+ ptr[0] = 'D'
+ elif c == 0xE:
+ ptr[0] = 'E'
+ elif c == 0xF:
+ ptr[0] = 'F'
+ else:
+ ptr[0] = '0'
+
+ ptr -= 1
+
+ val >>= 4
+
+ out[0] = ';'
+ out += 1
+ out[0] = 0
+
+ return out
+
+
+# copied and adapted from libxml2 (xmlBufAttrSerializeTxtContent())
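+ # Escapes attribute text for serialisation, e.g. (illustrative) the input
+ # 'a<b & "c"\n' is written out as 'a&lt;b &amp; &quot;c&quot;&#10;'.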
+cdef _write_attr_string(tree.xmlOutputBuffer* buf, const char *string):
+ cdef const char *base
+ cdef const char *cur
+ cdef const unsigned char *ucur
+
+ cdef unsigned char tmp[12]
+ cdef int val = 0
+ cdef int l
+
+ if string == NULL:
+ return
+
+ base = cur = <const char*>string
+ while cur[0] != 0:
+ if cur[0] == '\n':
+ if base != cur:
+ tree.xmlOutputBufferWrite(buf, cur - base, base)
+
+ tree.xmlOutputBufferWrite(buf, 5, "&#10;")
+ cur += 1
+ base = cur
+
+ elif cur[0] == '\r':
+ if base != cur:
+ tree.xmlOutputBufferWrite(buf, cur - base, base)
+
+ tree.xmlOutputBufferWrite(buf, 5, "&#13;")
+ cur += 1
+ base = cur
+
+ elif cur[0] == '\t':
+ if base != cur:
+ tree.xmlOutputBufferWrite(buf, cur - base, base)
+
+ tree.xmlOutputBufferWrite(buf, 4, "&#9;")
+ cur += 1
+ base = cur
+
+ elif cur[0] == '"':
+ if base != cur:
+ tree.xmlOutputBufferWrite(buf, cur - base, base)
+
+ tree.xmlOutputBufferWrite(buf, 6, "&quot;")
+ cur += 1
+ base = cur
+
+ elif cur[0] == '<':
+ if base != cur:
+ tree.xmlOutputBufferWrite(buf, cur - base, base)
+
+ tree.xmlOutputBufferWrite(buf, 4, "&lt;")
+ cur += 1
+ base = cur
+
+ elif cur[0] == '>':
+ if base != cur:
+ tree.xmlOutputBufferWrite(buf, cur - base, base)
+
+ tree.xmlOutputBufferWrite(buf, 4, "&gt;")
+ cur += 1
+ base = cur
+ elif cur[0] == '&':
+ if base != cur:
+ tree.xmlOutputBufferWrite(buf, cur - base, base)
+
+ tree.xmlOutputBufferWrite(buf, 5, "&amp;")
+ cur += 1
+ base = cur
+
+ elif (<const unsigned char>cur[0] >= 0x80) and (cur[1] != 0):
+
+ if base != cur:
+ tree.xmlOutputBufferWrite(buf, cur - base, base)
+
+ ucur = <const unsigned char *>cur
+
+ if ucur[0] < 0xC0:
+ # invalid UTF-8 sequence
+ val = ucur[0]
+ l = 1
+
+ elif ucur[0] < 0xE0:
+ val = (ucur[0]) & 0x1F
+ val <<= 6
+ val |= (ucur[1]) & 0x3F
+ l = 2
+
+ elif (ucur[0] < 0xF0) and (ucur[2] != 0):
+ val = (ucur[0]) & 0x0F
+ val <<= 6
+ val |= (ucur[1]) & 0x3F
+ val <<= 6
+ val |= (ucur[2]) & 0x3F
+ l = 3
+
+ elif (ucur[0] < 0xF8) and (ucur[2] != 0) and (ucur[3] != 0):
+ val = (ucur[0]) & 0x07
+ val <<= 6
+ val |= (ucur[1]) & 0x3F
+ val <<= 6
+ val |= (ucur[2]) & 0x3F
+ val <<= 6
+ val |= (ucur[3]) & 0x3F
+ l = 4
+ else:
+ # invalid UTF-8 sequence
+ val = ucur[0]
+ l = 1
+
+ if (l == 1) or (not tree.xmlIsCharQ(val)):
+ raise ValueError(f"Invalid character: {val:X}")
+
+ # We could do multiple things here. Just save
+ # as a char ref
+ xmlSerializeHexCharRef(tmp, val)
+ tree.xmlOutputBufferWrite(buf, len(tmp), <const char*> tmp)
+ cur += l
+ base = cur
+
+ else:
+ cur += 1
+
+ if base != cur:
+ tree.xmlOutputBufferWrite(buf, cur - base, base)
+
+
+############################################################
+# output to file-like objects
+
+cdef object io_open
+from io import open as io_open
+
+cdef object gzip
+import gzip
+
+cdef object getwriter
+from codecs import getwriter
+cdef object utf8_writer = getwriter('utf8')
+
+cdef object contextmanager
+from contextlib import contextmanager
+
+cdef object _open_utf8_file
+
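+# Context manager that opens 'file' (a path or file-like object) for writing
+# and yields a UTF-8 text writer, optionally gzip-compressing the output.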
+@contextmanager
+def _open_utf8_file(file, compression=0):
+ if _isString(file):
+ if compression:
+ with gzip.GzipFile(file, mode='wb', compresslevel=compression) as zf:
+ yield utf8_writer(zf)
+ else:
+ with io_open(file, 'w', encoding='utf8') as f:
+ yield f
+ else:
+ if compression:
+ with gzip.GzipFile(fileobj=file, mode='wb', compresslevel=compression) as zf:
+ yield utf8_writer(zf)
+ else:
+ yield utf8_writer(file)
+
+
+@cython.final
+@cython.internal
+cdef class _FilelikeWriter:
+ cdef object _filelike
+ cdef object _close_filelike
+ cdef _ExceptionContext _exc_context
+ cdef _ErrorLog error_log
+ def __cinit__(self, filelike, exc_context=None, compression=None, close=False):
+ if compression is not None and compression > 0:
+ filelike = GzipFile(
+ fileobj=filelike, mode='wb', compresslevel=compression)
+ self._close_filelike = filelike.close
+ elif close:
+ self._close_filelike = filelike.close
+ self._filelike = filelike
+ if exc_context is None:
+ self._exc_context = _ExceptionContext()
+ else:
+ self._exc_context = exc_context
+ self.error_log = _ErrorLog()
+
+ cdef tree.xmlOutputBuffer* _createOutputBuffer(
+ self, tree.xmlCharEncodingHandler* enchandler) except NULL:
+ cdef tree.xmlOutputBuffer* c_buffer
+ c_buffer = tree.xmlOutputBufferCreateIO(
+ <tree.xmlOutputWriteCallback>_writeFilelikeWriter, _closeFilelikeWriter,
+ <python.PyObject*>self, enchandler)
+ if c_buffer is NULL:
+ raise IOError, u"Could not create I/O writer context."
+ return c_buffer
+
+ cdef int write(self, char* c_buffer, int size):
+ try:
+ if self._filelike is None:
+ raise IOError, u"File is already closed"
+ py_buffer = <bytes>c_buffer[:size]
+ self._filelike.write(py_buffer)
+ except:
+ size = -1
+ self._exc_context._store_raised()
+ finally:
+ return size # and swallow any further exceptions
+
+ cdef int close(self):
+ retval = 0
+ try:
+ if self._close_filelike is not None:
+ self._close_filelike()
+ # we should not close the file here as we didn't open it
+ self._filelike = None
+ except:
+ retval = -1
+ self._exc_context._store_raised()
+ finally:
+ return retval # and swallow any further exceptions
+
+cdef int _writeFilelikeWriter(void* ctxt, char* c_buffer, int length):
+ return (<_FilelikeWriter>ctxt).write(c_buffer, length)
+
+cdef int _closeFilelikeWriter(void* ctxt):
+ return (<_FilelikeWriter>ctxt).close()
+
+cdef _tofilelike(f, _Element element, encoding, doctype, method,
+ bint write_xml_declaration, bint write_doctype,
+ bint pretty_print, bint with_tail, int standalone,
+ int compression):
+ cdef _FilelikeWriter writer = None
+ cdef tree.xmlOutputBuffer* c_buffer
+ cdef tree.xmlCharEncodingHandler* enchandler
+ cdef const_char* c_enc
+ cdef const_xmlChar* c_doctype
+ cdef int error_result
+
+ c_method = _findOutputMethod(method)
+ if c_method == OUTPUT_METHOD_TEXT:
+ data = _textToString(element._c_node, encoding, with_tail)
+ if compression:
+ bytes_out = BytesIO()
+ with GzipFile(fileobj=bytes_out, mode='wb', compresslevel=compression) as gzip_file:
+ gzip_file.write(data)
+ data = bytes_out.getvalue()
+ if _isString(f):
+ filename8 = _encodeFilename(f)
+ with open(filename8, 'wb') as f:
+ f.write(data)
+ else:
+ f.write(data)
+ return
+
+ if encoding is None:
+ c_enc = NULL
+ else:
+ encoding = _utf8(encoding)
+ c_enc = _cstr(encoding)
+ if doctype is None:
+ c_doctype = NULL
+ else:
+ doctype = _utf8(doctype)
+ c_doctype = _xcstr(doctype)
+
+ writer = _create_output_buffer(f, c_enc, compression, &c_buffer, close=False)
+ if writer is None:
+ with nogil:
+ error_result = _serialise_node(
+ c_buffer, c_doctype, c_enc, element._c_node, c_method,
+ write_xml_declaration, write_doctype, pretty_print, with_tail, standalone)
+ else:
+ error_result = _serialise_node(
+ c_buffer, c_doctype, c_enc, element._c_node, c_method,
+ write_xml_declaration, write_doctype, pretty_print, with_tail, standalone)
+
+ if writer is not None:
+ writer._exc_context._raise_if_stored()
+ if error_result != xmlerror.XML_ERR_OK:
+ _raiseSerialisationError(error_result)
+
+
+cdef int _serialise_node(tree.xmlOutputBuffer* c_buffer, const_xmlChar* c_doctype,
+ const_char* c_enc, xmlNode* c_node, int c_method,
+ bint write_xml_declaration, bint write_doctype, bint pretty_print,
+ bint with_tail, int standalone) nogil:
+ _writeNodeToBuffer(
+ c_buffer, c_node, c_enc, c_doctype, c_method,
+ write_xml_declaration, write_doctype, pretty_print, with_tail, standalone)
+ error_result = c_buffer.error
+ if error_result == xmlerror.XML_ERR_OK:
+ error_result = tree.xmlOutputBufferClose(c_buffer)
+ if error_result != -1:
+ error_result = xmlerror.XML_ERR_OK
+ else:
+ tree.xmlOutputBufferClose(c_buffer)
+ return error_result
+
+
+cdef _FilelikeWriter _create_output_buffer(
+ f, const_char* c_enc, int c_compression,
+ tree.xmlOutputBuffer** c_buffer_ret, bint close):
+ cdef tree.xmlOutputBuffer* c_buffer
+ cdef _FilelikeWriter writer
+ cdef bytes filename8
+ enchandler = tree.xmlFindCharEncodingHandler(c_enc)
+ if enchandler is NULL:
+ raise LookupError(
+ f"unknown encoding: '{c_enc.decode('UTF-8') if c_enc is not NULL else u''}'")
+ try:
+ if _isString(f):
+ filename8 = _encodeFilename(f)
+ if b'%' in filename8 and (
+ # Exclude absolute Windows paths and file:// URLs.
+ _isFilePath(<const xmlChar*>filename8) not in (NO_FILE_PATH, ABS_WIN_FILE_PATH)
+ or filename8[:7].lower() == b'file://'):
+ # A file path (not a URL) containing the '%' URL escape character.
+ # libxml2 uses URL-unescaping on these, so escape the path before passing it in.
+ filename8 = filename8.replace(b'%', b'%25')
+ c_buffer = tree.xmlOutputBufferCreateFilename(
+ _cstr(filename8), enchandler, c_compression)
+ if c_buffer is NULL:
+ python.PyErr_SetFromErrno(IOError) # raises IOError
+ writer = None
+ elif hasattr(f, 'write'):
+ writer = _FilelikeWriter(f, compression=c_compression, close=close)
+ c_buffer = writer._createOutputBuffer(enchandler)
+ else:
+ raise TypeError(
+ f"File or filename expected, got '{python._fqtypename(f).decode('UTF-8')}'")
+ except:
+ tree.xmlCharEncCloseFunc(enchandler)
+ raise
+ c_buffer_ret[0] = c_buffer
+ return writer
+
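+# Converts an iterable of namespace prefixes into a NULL-terminated C array
+# of pointers into the given xmlDict; prefixes unknown to the dict are
+# skipped, and the caller must free the array with python.lxml_free().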
+cdef xmlChar **_convert_ns_prefixes(tree.xmlDict* c_dict, ns_prefixes) except NULL:
+ cdef size_t i, num_ns_prefixes = len(ns_prefixes)
+    # Allocate one extra entry for the terminating NULL pointer.
+ c_ns_prefixes = <xmlChar **>python.lxml_malloc(num_ns_prefixes + 1, sizeof(xmlChar*))
+ if not c_ns_prefixes:
+ raise MemoryError()
+ i = 0
+ try:
+ for prefix in ns_prefixes:
+ prefix_utf = _utf8(prefix)
+ c_prefix = tree.xmlDictExists(c_dict, _xcstr(prefix_utf), len(prefix_utf))
+ if c_prefix:
+ # unknown prefixes do not need to get serialised
+ c_ns_prefixes[i] = <xmlChar*>c_prefix
+ i += 1
+ except:
+ python.lxml_free(c_ns_prefixes)
+ raise
+
+ c_ns_prefixes[i] = NULL # append end marker
+ return c_ns_prefixes
+
+cdef _tofilelikeC14N(f, _Element element, bint exclusive, bint with_comments,
+ int compression, inclusive_ns_prefixes):
+ cdef _FilelikeWriter writer = None
+ cdef tree.xmlOutputBuffer* c_buffer
+ cdef xmlChar **c_inclusive_ns_prefixes = NULL
+ cdef char* c_filename
+ cdef xmlDoc* c_base_doc
+ cdef xmlDoc* c_doc
+ cdef int bytes_count, error = 0
+
+ c_base_doc = element._c_node.doc
+ c_doc = _fakeRootDoc(c_base_doc, element._c_node)
+ try:
+ c_inclusive_ns_prefixes = (
+ _convert_ns_prefixes(c_doc.dict, inclusive_ns_prefixes)
+ if inclusive_ns_prefixes else NULL)
+
+ if _isString(f):
+ filename8 = _encodeFilename(f)
+ c_filename = _cstr(filename8)
+ with nogil:
+ error = c14n.xmlC14NDocSave(
+ c_doc, NULL, exclusive, c_inclusive_ns_prefixes,
+ with_comments, c_filename, compression)
+ elif hasattr(f, 'write'):
+ writer = _FilelikeWriter(f, compression=compression)
+ c_buffer = writer._createOutputBuffer(NULL)
+ with writer.error_log:
+ bytes_count = c14n.xmlC14NDocSaveTo(
+ c_doc, NULL, exclusive, c_inclusive_ns_prefixes,
+ with_comments, c_buffer)
+ error = tree.xmlOutputBufferClose(c_buffer)
+ if bytes_count < 0:
+ error = bytes_count
+ elif error != -1:
+ error = xmlerror.XML_ERR_OK
+ else:
+ raise TypeError(f"File or filename expected, got '{python._fqtypename(f).decode('UTF-8')}'")
+ finally:
+ _destroyFakeDoc(c_base_doc, c_doc)
+ if c_inclusive_ns_prefixes is not NULL:
+ python.lxml_free(c_inclusive_ns_prefixes)
+
+ if writer is not None:
+ writer._exc_context._raise_if_stored()
+
+ if error < 0:
+ message = u"C14N failed"
+ if writer is not None:
+ errors = writer.error_log
+ if len(errors):
+ message = errors[0].message
+ raise C14NError(message)
+
+
+# C14N 2.0
+
+def canonicalize(xml_data=None, *, out=None, from_file=None, **options):
+ """Convert XML to its C14N 2.0 serialised form.
+
+ If *out* is provided, it must be a file or file-like object that receives
+ the serialised canonical XML output (text, not bytes) through its ``.write()``
+ method. To write to a file, open it in text mode with encoding "utf-8".
+    If *out* is not provided, this function returns the output as a text string.
+
+    Either *xml_data* (an XML string, tree or Element) or *from_file*
+    (a file path or file-like object) must be provided as input.
+
+ The configuration options are the same as for the ``C14NWriterTarget``.
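+
+    Usage sketch (the XML snippet and option choice are only illustrative)::
+
+        xml = '<doc xmlns:a="http://a"> <a:x>text</a:x> <!-- note --> </doc>'
+        result = canonicalize(xml, strip_text=True, with_comments=True)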
+ """
+ if xml_data is None and from_file is None:
+ raise ValueError("Either 'xml_data' or 'from_file' must be provided as input")
+
+ sio = None
+ if out is None:
+ sio = out = StringIO()
+
+ target = C14NWriterTarget(out.write, **options)
+
+ if xml_data is not None and not isinstance(xml_data, basestring):
+ _tree_to_target(xml_data, target)
+ return sio.getvalue() if sio is not None else None
+
+ cdef _FeedParser parser = XMLParser(
+ target=target,
+ attribute_defaults=True,
+ collect_ids=False,
+ )
+
+ if xml_data is not None:
+ parser.feed(xml_data)
+ parser.close()
+ elif from_file is not None:
+ try:
+ _parseDocument(from_file, parser, base_url=None)
+ except _TargetParserResult:
+ pass
+
+ return sio.getvalue() if sio is not None else None
+
+
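+# Replays an existing tree as parse events ('start', 'end', 'start-ns',
+# 'comment', 'pi' and character data) into a C14N writer target.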
+cdef _tree_to_target(element, target):
+ for event, elem in iterwalk(element, events=('start', 'end', 'start-ns', 'comment', 'pi')):
+ text = None
+ if event == 'start':
+ target.start(elem.tag, elem.attrib)
+ text = elem.text
+ elif event == 'end':
+ target.end(elem.tag)
+ text = elem.tail
+ elif event == 'start-ns':
+ target.start_ns(*elem)
+ continue
+ elif event == 'comment':
+ target.comment(elem.text)
+ text = elem.tail
+ elif event == 'pi':
+ target.pi(elem.target, elem.text)
+ text = elem.tail
+ if text:
+ target.data(text)
+ return target.close()
+
+
+cdef object _looks_like_prefix_name = re.compile(r'^\w+:\w+$', re.UNICODE).match
+
+
+cdef class C14NWriterTarget:
+ """
+ Canonicalization writer target for the XMLParser.
+
+ Serialises parse events to XML C14N 2.0.
+
+ Configuration options:
+
+ - *with_comments*: set to true to include comments
+ - *strip_text*: set to true to strip whitespace before and after text content
+ - *rewrite_prefixes*: set to true to replace namespace prefixes by "n{number}"
+ - *qname_aware_tags*: a set of qname aware tag names in which prefixes
+ should be replaced in text content
+ - *qname_aware_attrs*: a set of qname aware attribute names in which prefixes
+ should be replaced in text content
+ - *exclude_attrs*: a set of attribute names that should not be serialised
+ - *exclude_tags*: a set of tag names that should not be serialised
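+
+    A minimal usage sketch, wiring the target up through an ``XMLParser``
+    much as ``canonicalize()`` does::
+
+        chunks = []
+        target = C14NWriterTarget(chunks.append, strip_text=True)
+        parser = XMLParser(target=target)
+        parser.feed('<doc><item> data </item></doc>')
+        parser.close()
+        c14n_text = ''.join(chunks)  # '<doc><item>data</item></doc>'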
+ """
+ cdef object _write
+ cdef list _data
+ cdef set _qname_aware_tags
+ cdef object _find_qname_aware_attrs
+ cdef list _declared_ns_stack
+ cdef list _ns_stack
+ cdef dict _prefix_map
+ cdef list _preserve_space
+ cdef tuple _pending_start
+ cdef set _exclude_tags
+ cdef set _exclude_attrs
+ cdef Py_ssize_t _ignored_depth
+ cdef bint _with_comments
+ cdef bint _strip_text
+ cdef bint _rewrite_prefixes
+ cdef bint _root_seen
+ cdef bint _root_done
+
+ def __init__(self, write, *,
+ with_comments=False, strip_text=False, rewrite_prefixes=False,
+ qname_aware_tags=None, qname_aware_attrs=None,
+ exclude_attrs=None, exclude_tags=None):
+ self._write = write
+ self._data = []
+ self._with_comments = with_comments
+ self._strip_text = strip_text
+ self._exclude_attrs = set(exclude_attrs) if exclude_attrs else None
+ self._exclude_tags = set(exclude_tags) if exclude_tags else None
+
+ self._rewrite_prefixes = rewrite_prefixes
+ if qname_aware_tags:
+ self._qname_aware_tags = set(qname_aware_tags)
+ else:
+ self._qname_aware_tags = None
+ if qname_aware_attrs:
+ self._find_qname_aware_attrs = set(qname_aware_attrs).intersection
+ else:
+ self._find_qname_aware_attrs = None
+
+ # Stack with globally and newly declared namespaces as (uri, prefix) pairs.
+ self._declared_ns_stack = [[
+ ("http://www.w3.org/XML/1998/namespace", "xml"),
+ ]]
+ # Stack with user declared namespace prefixes as (uri, prefix) pairs.
+ self._ns_stack = []
+ if not rewrite_prefixes:
+ self._ns_stack.append(_DEFAULT_NAMESPACE_PREFIXES.items())
+ self._ns_stack.append([])
+ self._prefix_map = {}
+ self._preserve_space = [False]
+ self._pending_start = None
+ self._ignored_depth = 0
+ self._root_seen = False
+ self._root_done = False
+
+ def _iter_namespaces(self, ns_stack):
+ for namespaces in reversed(ns_stack):
+ if namespaces: # almost no element declares new namespaces
+ yield from namespaces
+
+ cdef _resolve_prefix_name(self, prefixed_name):
+ prefix, name = prefixed_name.split(':', 1)
+ for uri, p in self._iter_namespaces(self._ns_stack):
+ if p == prefix:
+ return f'{{{uri}}}{name}'
+ raise ValueError(f'Prefix {prefix} of QName "{prefixed_name}" is not declared in scope')
+
+ cdef _qname(self, qname, uri=None):
+ if uri is None:
+ uri, tag = qname[1:].rsplit('}', 1) if qname[:1] == '{' else ('', qname)
+ else:
+ tag = qname
+
+ prefixes_seen = set()
+ for u, prefix in self._iter_namespaces(self._declared_ns_stack):
+ if u == uri and prefix not in prefixes_seen:
+ return f'{prefix}:{tag}' if prefix else tag, tag, uri
+ prefixes_seen.add(prefix)
+
+ # Not declared yet => add new declaration.
+ if self._rewrite_prefixes:
+ if uri in self._prefix_map:
+ prefix = self._prefix_map[uri]
+ else:
+ prefix = self._prefix_map[uri] = f'n{len(self._prefix_map)}'
+ self._declared_ns_stack[-1].append((uri, prefix))
+ return f'{prefix}:{tag}', tag, uri
+
+ if not uri and '' not in prefixes_seen:
+ # No default namespace declared => no prefix needed.
+ return tag, tag, uri
+
+ for u, prefix in self._iter_namespaces(self._ns_stack):
+ if u == uri:
+ self._declared_ns_stack[-1].append((uri, prefix))
+ return f'{prefix}:{tag}' if prefix else tag, tag, uri
+
+ if not uri:
+ # As soon as a default namespace is defined,
+ # anything that has no namespace (and thus, no prefix) goes there.
+ return tag, tag, uri
+
+ raise ValueError(f'Namespace "{uri}" of name "{tag}" is not declared in scope')
+
+ def data(self, data):
+ if not self._ignored_depth:
+ self._data.append(data)
+
+ cdef _flush(self):
+ data = u''.join(self._data)
+ del self._data[:]
+ if self._strip_text and not self._preserve_space[-1]:
+ data = data.strip()
+ if self._pending_start is not None:
+ (tag, attrs, new_namespaces), self._pending_start = self._pending_start, None
+ qname_text = data if u':' in data and _looks_like_prefix_name(data) else None
+ self._start(tag, attrs, new_namespaces, qname_text)
+ if qname_text is not None:
+ return
+ if data and self._root_seen:
+ self._write(_escape_cdata_c14n(data))
+
+ def start_ns(self, prefix, uri):
+ if self._ignored_depth:
+ return
+ # we may have to resolve qnames in text content
+ if self._data:
+ self._flush()
+ self._ns_stack[-1].append((uri, prefix))
+
+ def start(self, tag, attrs):
+ if self._exclude_tags is not None and (
+ self._ignored_depth or tag in self._exclude_tags):
+ self._ignored_depth += 1
+ return
+ if self._data:
+ self._flush()
+
+ new_namespaces = []
+ self._declared_ns_stack.append(new_namespaces)
+
+ if self._qname_aware_tags is not None and tag in self._qname_aware_tags:
+ # Need to parse text first to see if it requires a prefix declaration.
+ self._pending_start = (tag, attrs, new_namespaces)
+ return
+ self._start(tag, attrs, new_namespaces)
+
+ cdef _start(self, tag, attrs, new_namespaces, qname_text=None):
+ if self._exclude_attrs is not None and attrs:
+ attrs = {k: v for k, v in attrs.items() if k not in self._exclude_attrs}
+
+ qnames = {tag, *attrs}
+ resolved_names = {}
+
+ # Resolve prefixes in attribute and tag text.
+ if qname_text is not None:
+ qname = resolved_names[qname_text] = self._resolve_prefix_name(qname_text)
+ qnames.add(qname)
+ if self._find_qname_aware_attrs is not None and attrs:
+ qattrs = self._find_qname_aware_attrs(attrs)
+ if qattrs:
+ for attr_name in qattrs:
+ value = attrs[attr_name]
+ if _looks_like_prefix_name(value):
+ qname = resolved_names[value] = self._resolve_prefix_name(value)
+ qnames.add(qname)
+ else:
+ qattrs = None
+ else:
+ qattrs = None
+
+ # Assign prefixes in lexicographical order of used URIs.
+ parsed_qnames = {n: self._qname(n) for n in sorted(
+ qnames, key=lambda n: n.split('}', 1))}
+
+ # Write namespace declarations in prefix order ...
+ if new_namespaces:
+ attr_list = [
+ (u'xmlns:' + prefix if prefix else u'xmlns', uri)
+ for uri, prefix in new_namespaces
+ ]
+ attr_list.sort()
+ else:
+ # almost always empty
+ attr_list = []
+
+ # ... followed by attributes in URI+name order
+ if attrs:
+ for k, v in sorted(attrs.items()):
+ if qattrs is not None and k in qattrs and v in resolved_names:
+ v = parsed_qnames[resolved_names[v]][0]
+ attr_qname, attr_name, uri = parsed_qnames[k]
+ # No prefix for attributes in default ('') namespace.
+ attr_list.append((attr_qname if uri else attr_name, v))
+
+ # Honour xml:space attributes.
+ space_behaviour = attrs.get('{http://www.w3.org/XML/1998/namespace}space')
+ self._preserve_space.append(
+ space_behaviour == 'preserve' if space_behaviour
+ else self._preserve_space[-1])
+
+ # Write the tag.
+ write = self._write
+ write(u'<' + parsed_qnames[tag][0])
+ if attr_list:
+ write(u''.join([f' {k}="{_escape_attrib_c14n(v)}"' for k, v in attr_list]))
+ write(u'>')
+
+ # Write the resolved qname text content.
+ if qname_text is not None:
+ write(_escape_cdata_c14n(parsed_qnames[resolved_names[qname_text]][0]))
+
+ self._root_seen = True
+ self._ns_stack.append([])
+
+ def end(self, tag):
+ if self._ignored_depth:
+ self._ignored_depth -= 1
+ return
+ if self._data:
+ self._flush()
+ self._write(f'</{self._qname(tag)[0]}>')
+ self._preserve_space.pop()
+ self._root_done = len(self._preserve_space) == 1
+ self._declared_ns_stack.pop()
+ self._ns_stack.pop()
+
+ def comment(self, text):
+ if not self._with_comments:
+ return
+ if self._ignored_depth:
+ return
+ if self._root_done:
+ self._write(u'\n')
+ elif self._root_seen and self._data:
+ self._flush()
+ self._write(f'<!--{_escape_cdata_c14n(text)}-->')
+ if not self._root_seen:
+ self._write(u'\n')
+
+ def pi(self, target, data):
+ if self._ignored_depth:
+ return
+ if self._root_done:
+ self._write(u'\n')
+ elif self._root_seen and self._data:
+ self._flush()
+ self._write(
+ f'<?{target} {_escape_cdata_c14n(data)}?>' if data else f'<?{target}?>')
+ if not self._root_seen:
+ self._write(u'\n')
+
+ def close(self):
+ return None
+
+
+cdef _raise_serialization_error(text):
+ raise TypeError("cannot serialize %r (type %s)" % (text, type(text).__name__))
+
+
+cdef unicode _escape_cdata_c14n(stext):
+ # escape character data
+ cdef unicode text
+ try:
+        # It's worth avoiding do-nothing calls for strings that are
+        # shorter than 500 characters or so. Assume that's, by far,
+        # the most common case in most applications.
+ text = unicode(stext)
+ if u'&' in text:
+ text = text.replace(u'&', u'&amp;')
+ if u'<' in text:
+ text = text.replace(u'<', u'&lt;')
+ if u'>' in text:
+ text = text.replace(u'>', u'&gt;')
+ if u'\r' in text:
+ text = text.replace(u'\r', u'&#xD;')
+ return text
+ except (TypeError, AttributeError):
+ _raise_serialization_error(stext)
+
+
+cdef unicode _escape_attrib_c14n(stext):
+ # escape attribute value
+ cdef unicode text
+ try:
+ text = unicode(stext)
+ if u'&' in text:
+ text = text.replace(u'&', u'&amp;')
+ if u'<' in text:
+ text = text.replace(u'<', u'&lt;')
+ if u'"' in text:
+ text = text.replace(u'"', u'&quot;')
+ if u'\t' in text:
+ text = text.replace(u'\t', u'&#x9;')
+ if u'\n' in text:
+ text = text.replace(u'\n', u'&#xA;')
+ if u'\r' in text:
+ text = text.replace(u'\r', u'&#xD;')
+ return text
+ except (TypeError, AttributeError):
+ _raise_serialization_error(stext)
+
+
+# incremental serialisation
+
+cdef class xmlfile:
+ """xmlfile(self, output_file, encoding=None, compression=None, close=False, buffered=True)
+
+ A simple mechanism for incremental XML serialisation.
+
+ Usage example::
+
+ with xmlfile("somefile.xml", encoding='utf-8') as xf:
+ xf.write_declaration(standalone=True)
+ xf.write_doctype('<!DOCTYPE root SYSTEM "some.dtd">')
+
+ # generate an element (the root element)
+ with xf.element('root'):
+ # write a complete Element into the open root element
+ xf.write(etree.Element('test'))
+
+ # generate and write more Elements, e.g. through iterparse
+ for element in generate_some_elements():
+ # serialise generated elements into the XML file
+ xf.write(element)
+
+ # or write multiple Elements or strings at once
+ xf.write(etree.Element('start'), "text", etree.Element('end'))
+
+ If 'output_file' is a file(-like) object, passing ``close=True`` will
+ close it when exiting the context manager. By default, it is left
+ to the owner to do that. When a file path is used, lxml will take care
+ of opening and closing the file itself. Also, when a compression level
+ is set, lxml will deliberately close the file to make sure all data gets
+ compressed and written.
+
+ Setting ``buffered=False`` will flush the output after each operation,
+ such as opening or closing an ``xf.element()`` block or calling
+ ``xf.write()``. Alternatively, calling ``xf.flush()`` can be used to
+ explicitly flush any pending output when buffering is enabled.
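+
+    The same object can also be used as an asynchronous context manager when
+    the output is a file-like object whose ``write()`` (and, if ``close=True``,
+    ``close()``) method is a coroutine, e.g. (a sketch, assuming such an
+    object ``async_file``)::
+
+        async with xmlfile(async_file, encoding='utf-8') as xf:
+            async with xf.element('root'):
+                await xf.write(etree.Element('test'))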
+ """
+ cdef object output_file
+ cdef bytes encoding
+ cdef _IncrementalFileWriter writer
+ cdef _AsyncIncrementalFileWriter async_writer
+ cdef int compresslevel
+ cdef bint close
+ cdef bint buffered
+ cdef int method
+
+ def __init__(self, output_file not None, encoding=None, compression=None,
+ close=False, buffered=True):
+ self.output_file = output_file
+ self.encoding = _utf8orNone(encoding)
+ self.compresslevel = compression or 0
+ self.close = close
+ self.buffered = buffered
+ self.method = OUTPUT_METHOD_XML
+
+ def __enter__(self):
+ assert self.output_file is not None
+ self.writer = _IncrementalFileWriter(
+ self.output_file, self.encoding, self.compresslevel,
+ self.close, self.buffered, self.method)
+ return self.writer
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ if self.writer is not None:
+ old_writer, self.writer = self.writer, None
+ raise_on_error = exc_type is None
+ old_writer._close(raise_on_error)
+ if self.close:
+ self.output_file = None
+
+ async def __aenter__(self):
+ assert self.output_file is not None
+ if isinstance(self.output_file, basestring):
+ raise TypeError("Cannot asynchronously write to a plain file")
+ if not hasattr(self.output_file, 'write'):
+ raise TypeError("Output file needs an async .write() method")
+ self.async_writer = _AsyncIncrementalFileWriter(
+ self.output_file, self.encoding, self.compresslevel,
+ self.close, self.buffered, self.method)
+ return self.async_writer
+
+ async def __aexit__(self, exc_type, exc_val, exc_tb):
+ if self.async_writer is not None:
+ old_writer, self.async_writer = self.async_writer, None
+ raise_on_error = exc_type is None
+ await old_writer._close(raise_on_error)
+ if self.close:
+ self.output_file = None
+
+
+cdef class htmlfile(xmlfile):
+ """htmlfile(self, output_file, encoding=None, compression=None, close=False, buffered=True)
+
+ A simple mechanism for incremental HTML serialisation. Works the same as
+ xmlfile.
+ """
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.method = OUTPUT_METHOD_HTML
+
+
+cdef enum _IncrementalFileWriterStatus:
+ WRITER_STARTING = 0
+ WRITER_DECL_WRITTEN = 1
+ WRITER_DTD_WRITTEN = 2
+ WRITER_IN_ELEMENT = 3
+ WRITER_FINISHED = 4
+
+
+@cython.final
+@cython.internal
+cdef class _IncrementalFileWriter:
+ cdef tree.xmlOutputBuffer* _c_out
+ cdef bytes _encoding
+ cdef const_char* _c_encoding
+ cdef _FilelikeWriter _target
+ cdef list _element_stack
+ cdef int _status
+ cdef int _method
+ cdef bint _buffered
+
+ def __cinit__(self, outfile, bytes encoding, int compresslevel, bint close,
+ bint buffered, int method):
+ self._status = WRITER_STARTING
+ self._element_stack = []
+ if encoding is None:
+ encoding = b'ASCII'
+ self._encoding = encoding
+ self._c_encoding = _cstr(encoding) if encoding is not None else NULL
+ self._buffered = buffered
+ self._target = _create_output_buffer(
+ outfile, self._c_encoding, compresslevel, &self._c_out, close)
+ self._method = method
+
+ def __dealloc__(self):
+ if self._c_out is not NULL:
+ tree.xmlOutputBufferClose(self._c_out)
+
+ def write_declaration(self, version=None, standalone=None, doctype=None):
+ """write_declaration(self, version=None, standalone=None, doctype=None)
+
+ Write an XML declaration and (optionally) a doctype into the file.
+ """
+ assert self._c_out is not NULL
+ cdef const_xmlChar* c_version
+ cdef int c_standalone
+ if self._method != OUTPUT_METHOD_XML:
+ raise LxmlSyntaxError("only XML documents have declarations")
+ if self._status >= WRITER_DECL_WRITTEN:
+ raise LxmlSyntaxError("XML declaration already written")
+ version = _utf8orNone(version)
+ c_version = _xcstr(version) if version is not None else NULL
+ doctype = _utf8orNone(doctype)
+ if standalone is None:
+ c_standalone = -1
+ else:
+ c_standalone = 1 if standalone else 0
+ _writeDeclarationToBuffer(self._c_out, c_version, self._c_encoding, c_standalone)
+ if doctype is not None:
+ _writeDoctype(self._c_out, _xcstr(doctype))
+ self._status = WRITER_DTD_WRITTEN
+ else:
+ self._status = WRITER_DECL_WRITTEN
+ if not self._buffered:
+ tree.xmlOutputBufferFlush(self._c_out)
+ self._handle_error(self._c_out.error)
+
+ def write_doctype(self, doctype):
+ """write_doctype(self, doctype)
+
+        Writes the given doctype declaration verbatim into the file.
+ """
+ assert self._c_out is not NULL
+ if doctype is None:
+ return
+ if self._status >= WRITER_DTD_WRITTEN:
+ raise LxmlSyntaxError("DOCTYPE already written or cannot write it here")
+ doctype = _utf8(doctype)
+ _writeDoctype(self._c_out, _xcstr(doctype))
+ self._status = WRITER_DTD_WRITTEN
+ if not self._buffered:
+ tree.xmlOutputBufferFlush(self._c_out)
+ self._handle_error(self._c_out.error)
+
+ def method(self, method):
+ """method(self, method)
+
+ Returns a context manager that overrides and restores the output method.
+ method is one of (None, 'xml', 'html') where None means 'xml'.
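+
+        Usage sketch (``fragment`` being any Element to serialise)::
+
+            with xf.method('html'):
+                xf.write(fragment)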
+ """
+ assert self._c_out is not NULL
+ c_method = self._method if method is None else _findOutputMethod(method)
+ return _MethodChanger(self, c_method)
+
+ def element(self, tag, attrib=None, nsmap=None, method=None, **_extra):
+ """element(self, tag, attrib=None, nsmap=None, method, **_extra)
+
+ Returns a context manager that writes an opening and closing tag.
+ method is one of (None, 'xml', 'html') where None means 'xml'.
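+
+        Usage sketch (the namespace URI is only illustrative)::
+
+            with xf.element('{http://example.com/ns}doc',
+                            nsmap={None: 'http://example.com/ns'}):
+                xf.write('some text')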
+ """
+ assert self._c_out is not NULL
+ attributes = []
+ if attrib is not None:
+ for name, value in _iter_attrib(attrib):
+ if name not in _extra:
+ ns, name = _getNsTag(name)
+ attributes.append((ns, name, _utf8(value)))
+ if _extra:
+ for name, value in _extra.iteritems():
+ ns, name = _getNsTag(name)
+ attributes.append((ns, name, _utf8(value)))
+ reversed_nsmap = {}
+ if nsmap:
+ for prefix, ns in nsmap.items():
+ if prefix is not None:
+ prefix = _utf8(prefix)
+ _prefixValidOrRaise(prefix)
+ reversed_nsmap[_utf8(ns)] = prefix
+ ns, name = _getNsTag(tag)
+
+ c_method = self._method if method is None else _findOutputMethod(method)
+
+ return _FileWriterElement(self, (ns, name, attributes, reversed_nsmap), c_method)
+
+ cdef _write_qname(self, bytes name, bytes prefix):
+ if prefix: # empty bytes for no prefix (not None to allow sorting)
+ tree.xmlOutputBufferWrite(self._c_out, len(prefix), _cstr(prefix))
+ tree.xmlOutputBufferWrite(self._c_out, 1, ':')
+ tree.xmlOutputBufferWrite(self._c_out, len(name), _cstr(name))
+
+ cdef _write_start_element(self, element_config):
+ if self._status > WRITER_IN_ELEMENT:
+ raise LxmlSyntaxError("cannot append trailing element to complete XML document")
+ ns, name, attributes, nsmap = element_config
+ flat_namespace_map, new_namespaces = self._collect_namespaces(nsmap)
+ prefix = self._find_prefix(ns, flat_namespace_map, new_namespaces)
+ tree.xmlOutputBufferWrite(self._c_out, 1, '<')
+ self._write_qname(name, prefix)
+
+ self._write_attributes_and_namespaces(
+ attributes, flat_namespace_map, new_namespaces)
+
+ tree.xmlOutputBufferWrite(self._c_out, 1, '>')
+ if not self._buffered:
+ tree.xmlOutputBufferFlush(self._c_out)
+ self._handle_error(self._c_out.error)
+
+ self._element_stack.append((ns, name, prefix, flat_namespace_map))
+ self._status = WRITER_IN_ELEMENT
+
+ cdef _write_attributes_and_namespaces(self, list attributes,
+ dict flat_namespace_map,
+ list new_namespaces):
+ if attributes:
+ # _find_prefix() may append to new_namespaces => build them first
+ attributes = [
+ (self._find_prefix(ns, flat_namespace_map, new_namespaces), name, value)
+ for ns, name, value in attributes ]
+ if new_namespaces:
+ new_namespaces.sort()
+ self._write_attributes_list(new_namespaces)
+ if attributes:
+ self._write_attributes_list(attributes)
+
+ cdef _write_attributes_list(self, list attributes):
+ for prefix, name, value in attributes:
+ tree.xmlOutputBufferWrite(self._c_out, 1, ' ')
+ self._write_qname(name, prefix)
+ tree.xmlOutputBufferWrite(self._c_out, 2, '="')
+ _write_attr_string(self._c_out, _cstr(value))
+
+ tree.xmlOutputBufferWrite(self._c_out, 1, '"')
+
+ cdef _write_end_element(self, element_config):
+ if self._status != WRITER_IN_ELEMENT:
+ raise LxmlSyntaxError("not in an element")
+ if not self._element_stack or self._element_stack[-1][:2] != element_config[:2]:
+ raise LxmlSyntaxError("inconsistent exit action in context manager")
+
+ # If previous write operations failed, the context manager exit might still call us.
+ # That is ok, but we stop writing closing tags and handling errors in that case.
+ # For all non-I/O errors, we continue writing closing tags if we can.
+ ok_to_write = self._c_out.error == xmlerror.XML_ERR_OK
+
+ name, prefix = self._element_stack.pop()[1:3]
+ if ok_to_write:
+ tree.xmlOutputBufferWrite(self._c_out, 2, '</')
+ self._write_qname(name, prefix)
+ tree.xmlOutputBufferWrite(self._c_out, 1, '>')
+
+ if not self._element_stack:
+ self._status = WRITER_FINISHED
+ if ok_to_write:
+ if not self._buffered:
+ tree.xmlOutputBufferFlush(self._c_out)
+ self._handle_error(self._c_out.error)
+
+ cdef _find_prefix(self, bytes href, dict flat_namespaces_map, list new_namespaces):
+ if href is None:
+ return None
+ if href in flat_namespaces_map:
+ return flat_namespaces_map[href]
+ # need to create a new prefix
+ prefixes = flat_namespaces_map.values()
+ i = 0
+ while True:
+ prefix = _utf8('ns%d' % i)
+ if prefix not in prefixes:
+ new_namespaces.append((b'xmlns', prefix, href))
+ flat_namespaces_map[href] = prefix
+ return prefix
+ i += 1
+
+ cdef _collect_namespaces(self, dict nsmap):
+ new_namespaces = []
+ flat_namespaces_map = {}
+ for ns, prefix in nsmap.iteritems():
+ flat_namespaces_map[ns] = prefix
+ if prefix is None:
+ # use empty bytes rather than None to allow sorting
+ new_namespaces.append((b'', b'xmlns', ns))
+ else:
+ new_namespaces.append((b'xmlns', prefix, ns))
+ # merge in flat namespace map of parent
+ if self._element_stack:
+ for ns, prefix in (<dict>self._element_stack[-1][-1]).iteritems():
+ if flat_namespaces_map.get(ns) is None:
+ # unknown or empty prefix => prefer a 'real' prefix
+ flat_namespaces_map[ns] = prefix
+ return flat_namespaces_map, new_namespaces
+
+ def write(self, *args, bint with_tail=True, bint pretty_print=False, method=None):
+ """write(self, *args, with_tail=True, pretty_print=False, method=None)
+
+ Write subtrees or strings into the file.
+
+ If method is not None, it should be one of ('html', 'xml', 'text')
+ to temporarily override the output method.
+ """
+ assert self._c_out is not NULL
+ c_method = self._method if method is None else _findOutputMethod(method)
+
+ for content in args:
+ if _isString(content):
+ if self._status != WRITER_IN_ELEMENT:
+ if self._status > WRITER_IN_ELEMENT or content.strip():
+ raise LxmlSyntaxError("not in an element")
+ bstring = _utf8(content)
+ if not bstring:
+ continue
+
+ ns, name, _, _ = self._element_stack[-1]
+ if (c_method == OUTPUT_METHOD_HTML and
+ ns in (None, b'http://www.w3.org/1999/xhtml') and
+ name in (b'script', b'style')):
+ tree.xmlOutputBufferWrite(self._c_out, len(bstring), _cstr(bstring))
+
+ else:
+ tree.xmlOutputBufferWriteEscape(self._c_out, _xcstr(bstring), NULL)
+
+ elif iselement(content):
+ if self._status > WRITER_IN_ELEMENT:
+ raise LxmlSyntaxError("cannot append trailing element to complete XML document")
+ _writeNodeToBuffer(self._c_out, (<_Element>content)._c_node,
+ self._c_encoding, NULL, c_method,
+ False, False, pretty_print, with_tail, False)
+ if (<_Element>content)._c_node.type == tree.XML_ELEMENT_NODE:
+ if not self._element_stack:
+ self._status = WRITER_FINISHED
+
+ elif content is not None:
+ raise TypeError(
+ f"got invalid input value of type {type(content)}, expected string or Element")
+ self._handle_error(self._c_out.error)
+ if not self._buffered:
+ tree.xmlOutputBufferFlush(self._c_out)
+ self._handle_error(self._c_out.error)
+
+ def flush(self):
+ """flush(self)
+
+ Write any pending content of the current output buffer to the stream.
+ """
+ assert self._c_out is not NULL
+ tree.xmlOutputBufferFlush(self._c_out)
+ self._handle_error(self._c_out.error)
+
+ cdef _close(self, bint raise_on_error):
+ if raise_on_error:
+ if self._status < WRITER_IN_ELEMENT:
+ raise LxmlSyntaxError("no content written")
+ if self._element_stack:
+ raise LxmlSyntaxError("pending open tags on close")
+ error_result = self._c_out.error
+ if error_result == xmlerror.XML_ERR_OK:
+ error_result = tree.xmlOutputBufferClose(self._c_out)
+ if error_result != -1:
+ error_result = xmlerror.XML_ERR_OK
+ else:
+ tree.xmlOutputBufferClose(self._c_out)
+ self._status = WRITER_FINISHED
+ self._c_out = NULL
+ del self._element_stack[:]
+ if raise_on_error:
+ self._handle_error(error_result)
+
+ cdef _handle_error(self, int error_result):
+ if error_result != xmlerror.XML_ERR_OK:
+ if self._target is not None:
+ self._target._exc_context._raise_if_stored()
+ _raiseSerialisationError(error_result)
+
+
+@cython.final
+@cython.internal
+cdef class _AsyncDataWriter:
+ cdef list _data
+ def __cinit__(self):
+ self._data = []
+
+ cdef bytes collect(self):
+ data = b''.join(self._data)
+ del self._data[:]
+ return data
+
+ def write(self, data):
+ self._data.append(data)
+
+ def close(self):
+ pass
+
+
+@cython.final
+@cython.internal
+cdef class _AsyncIncrementalFileWriter:
+ cdef _IncrementalFileWriter _writer
+ cdef _AsyncDataWriter _buffer
+ cdef object _async_outfile
+ cdef int _flush_after_writes
+ cdef bint _should_close
+ cdef bint _buffered
+
+ def __cinit__(self, async_outfile, bytes encoding, int compresslevel, bint close,
+ bint buffered, int method):
+ self._flush_after_writes = 20
+ self._async_outfile = async_outfile
+ self._should_close = close
+ self._buffered = buffered
+ self._buffer = _AsyncDataWriter()
+ self._writer = _IncrementalFileWriter(
+ self._buffer, encoding, compresslevel, close=True, buffered=False, method=method)
+
+ cdef bytes _flush(self):
+ if not self._buffered or len(self._buffer._data) > self._flush_after_writes:
+ return self._buffer.collect()
+ return None
+
+ async def flush(self):
+ self._writer.flush()
+ data = self._buffer.collect()
+ if data:
+ await self._async_outfile.write(data)
+
+ async def write_declaration(self, version=None, standalone=None, doctype=None):
+ self._writer.write_declaration(version, standalone, doctype)
+ data = self._flush()
+ if data:
+ await self._async_outfile.write(data)
+
+ async def write_doctype(self, doctype):
+ self._writer.write_doctype(doctype)
+ data = self._flush()
+ if data:
+ await self._async_outfile.write(data)
+
+ async def write(self, *args, with_tail=True, pretty_print=False, method=None):
+ self._writer.write(*args, with_tail=with_tail, pretty_print=pretty_print, method=method)
+ data = self._flush()
+ if data:
+ await self._async_outfile.write(data)
+
+ def method(self, method):
+ return self._writer.method(method)
+
+ def element(self, tag, attrib=None, nsmap=None, method=None, **_extra):
+ element_writer = self._writer.element(tag, attrib, nsmap, method, **_extra)
+ return _AsyncFileWriterElement(element_writer, self)
+
+ async def _close(self, bint raise_on_error):
+ self._writer._close(raise_on_error)
+ data = self._buffer.collect()
+ if data:
+ await self._async_outfile.write(data)
+ if self._should_close:
+ await self._async_outfile.close()
+
+
+@cython.final
+@cython.internal
+cdef class _AsyncFileWriterElement:
+ cdef _FileWriterElement _element_writer
+ cdef _AsyncIncrementalFileWriter _writer
+
+ def __cinit__(self, _FileWriterElement element_writer not None,
+ _AsyncIncrementalFileWriter writer not None):
+ self._element_writer = element_writer
+ self._writer = writer
+
+ async def __aenter__(self):
+ self._element_writer.__enter__()
+ data = self._writer._flush()
+ if data:
+ await self._writer._async_outfile.write(data)
+
+ async def __aexit__(self, *args):
+ self._element_writer.__exit__(*args)
+ data = self._writer._flush()
+ if data:
+ await self._writer._async_outfile.write(data)
+
+
+@cython.final
+@cython.internal
+@cython.freelist(8)
+cdef class _FileWriterElement:
+ cdef _IncrementalFileWriter _writer
+ cdef object _element
+ cdef int _new_method
+ cdef int _old_method
+
+ def __cinit__(self, _IncrementalFileWriter writer not None, element_config, int method):
+ self._writer = writer
+ self._element = element_config
+ self._new_method = method
+ self._old_method = writer._method
+
+ def __enter__(self):
+ self._writer._method = self._new_method
+ self._writer._write_start_element(self._element)
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ self._writer._write_end_element(self._element)
+ self._writer._method = self._old_method
+
+
+@cython.final
+@cython.internal
+@cython.freelist(8)
+cdef class _MethodChanger:
+ cdef _IncrementalFileWriter _writer
+ cdef int _new_method
+ cdef int _old_method
+ cdef bint _entered
+ cdef bint _exited
+
+ def __cinit__(self, _IncrementalFileWriter writer not None, int method):
+ self._writer = writer
+ self._new_method = method
+ self._old_method = writer._method
+ self._entered = False
+ self._exited = False
+
+ def __enter__(self):
+ if self._entered:
+ raise LxmlSyntaxError("Inconsistent enter action in context manager")
+ self._writer._method = self._new_method
+ self._entered = True
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ if self._exited:
+ raise LxmlSyntaxError("Inconsistent exit action in context manager")
+ if self._writer._method != self._new_method:
+ raise LxmlSyntaxError("Method changed outside of context manager")
+ self._writer._method = self._old_method
+ self._exited = True
+
+ async def __aenter__(self):
+ # for your async convenience
+ return self.__enter__()
+
+ async def __aexit__(self, *args):
+ # for your async convenience
+ return self.__exit__(*args)
diff --git a/src/lxml/tests/__init__.py b/src/lxml/tests/__init__.py
new file mode 100644
index 0000000..85dfe70
--- /dev/null
+++ b/src/lxml/tests/__init__.py
@@ -0,0 +1,4 @@
+"""
+The lxml test suite for lxml, ElementTree and cElementTree.
+"""
+
diff --git a/src/lxml/tests/c14n-20/c14nComment.xml b/src/lxml/tests/c14n-20/c14nComment.xml
new file mode 100644
index 0000000..e95aa30
--- /dev/null
+++ b/src/lxml/tests/c14n-20/c14nComment.xml
@@ -0,0 +1,4 @@
+<dsig:CanonicalizationMethod xmlns:dsig="http://www.w3.org/2000/09/xmldsig#" xmlns:c14n2="http://www.w3.org/2010/xml-c14n2" Algorithm="http://www.w3.org/2010/xml-c14n2">
+ <c14n2:IgnoreComments>true</c14n2:IgnoreComments>
+</dsig:CanonicalizationMethod>
+
diff --git a/src/lxml/tests/c14n-20/c14nDefault.xml b/src/lxml/tests/c14n-20/c14nDefault.xml
new file mode 100644
index 0000000..c136414
--- /dev/null
+++ b/src/lxml/tests/c14n-20/c14nDefault.xml
@@ -0,0 +1,3 @@
+<dsig:CanonicalizationMethod xmlns:dsig="http://www.w3.org/2000/09/xmldsig#" Algorithm="http://www.w3.org/2010/xml-c14n2">
+</dsig:CanonicalizationMethod>
+
diff --git a/src/lxml/tests/c14n-20/c14nPrefix.xml b/src/lxml/tests/c14n-20/c14nPrefix.xml
new file mode 100644
index 0000000..fb233b4
--- /dev/null
+++ b/src/lxml/tests/c14n-20/c14nPrefix.xml
@@ -0,0 +1,4 @@
+<dsig:CanonicalizationMethod xmlns:dsig="http://www.w3.org/2000/09/xmldsig#" xmlns:c14n2="http://www.w3.org/2010/xml-c14n2" Algorithm="http://www.w3.org/2010/xml-c14n2">
+ <c14n2:PrefixRewrite>sequential</c14n2:PrefixRewrite>
+</dsig:CanonicalizationMethod>
+
diff --git a/src/lxml/tests/c14n-20/c14nPrefixQname.xml b/src/lxml/tests/c14n-20/c14nPrefixQname.xml
new file mode 100644
index 0000000..23188ee
--- /dev/null
+++ b/src/lxml/tests/c14n-20/c14nPrefixQname.xml
@@ -0,0 +1,7 @@
+<dsig:CanonicalizationMethod xmlns:dsig="http://www.w3.org/2000/09/xmldsig#" xmlns:c14n2="http://www.w3.org/2010/xml-c14n2" Algorithm="http://www.w3.org/2010/xml-c14n2">
+ <c14n2:PrefixRewrite>sequential</c14n2:PrefixRewrite>
+ <c14n2:QNameAware>
+ <c14n2:QualifiedAttr Name="type" NS="http://www.w3.org/2001/XMLSchema-instance"/>
+ </c14n2:QNameAware>
+</dsig:CanonicalizationMethod>
+
diff --git a/src/lxml/tests/c14n-20/c14nPrefixQnameXpathElem.xml b/src/lxml/tests/c14n-20/c14nPrefixQnameXpathElem.xml
new file mode 100644
index 0000000..626fc48
--- /dev/null
+++ b/src/lxml/tests/c14n-20/c14nPrefixQnameXpathElem.xml
@@ -0,0 +1,8 @@
+<dsig:CanonicalizationMethod xmlns:dsig="http://www.w3.org/2000/09/xmldsig#" xmlns:c14n2="http://www.w3.org/2010/xml-c14n2" Algorithm="http://www.w3.org/2010/xml-c14n2">
+ <c14n2:PrefixRewrite>sequential</c14n2:PrefixRewrite>
+ <c14n2:QNameAware>
+ <c14n2:Element Name="bar" NS="http://a"/>
+ <c14n2:XPathElement Name="IncludedXPath" NS="http://www.w3.org/2010/xmldsig2#"/>
+ </c14n2:QNameAware>
+</dsig:CanonicalizationMethod>
+
diff --git a/src/lxml/tests/c14n-20/c14nQname.xml b/src/lxml/tests/c14n-20/c14nQname.xml
new file mode 100644
index 0000000..919e590
--- /dev/null
+++ b/src/lxml/tests/c14n-20/c14nQname.xml
@@ -0,0 +1,6 @@
+<dsig:CanonicalizationMethod xmlns:dsig="http://www.w3.org/2000/09/xmldsig#" xmlns:c14n2="http://www.w3.org/2010/xml-c14n2" Algorithm="http://www.w3.org/2010/xml-c14n2">
+ <c14n2:QNameAware>
+ <c14n2:QualifiedAttr Name="type" NS="http://www.w3.org/2001/XMLSchema-instance"/>
+ </c14n2:QNameAware>
+</dsig:CanonicalizationMethod>
+
diff --git a/src/lxml/tests/c14n-20/c14nQnameElem.xml b/src/lxml/tests/c14n-20/c14nQnameElem.xml
new file mode 100644
index 0000000..0321f80
--- /dev/null
+++ b/src/lxml/tests/c14n-20/c14nQnameElem.xml
@@ -0,0 +1,6 @@
+<dsig:CanonicalizationMethod xmlns:dsig="http://www.w3.org/2000/09/xmldsig#" xmlns:c14n2="http://www.w3.org/2010/xml-c14n2" Algorithm="http://www.w3.org/2010/xml-c14n2">
+ <c14n2:QNameAware>
+ <c14n2:Element Name="bar" NS="http://a"/>
+ </c14n2:QNameAware>
+</dsig:CanonicalizationMethod>
+
diff --git a/src/lxml/tests/c14n-20/c14nQnameXpathElem.xml b/src/lxml/tests/c14n-20/c14nQnameXpathElem.xml
new file mode 100644
index 0000000..c4890bc
--- /dev/null
+++ b/src/lxml/tests/c14n-20/c14nQnameXpathElem.xml
@@ -0,0 +1,7 @@
+<dsig:CanonicalizationMethod xmlns:dsig="http://www.w3.org/2000/09/xmldsig#" xmlns:c14n2="http://www.w3.org/2010/xml-c14n2" Algorithm="http://www.w3.org/2010/xml-c14n2">
+ <c14n2:QNameAware>
+ <c14n2:Element Name="bar" NS="http://a"/>
+ <c14n2:XPathElement Name="IncludedXPath" NS="http://www.w3.org/2010/xmldsig2#"/>
+ </c14n2:QNameAware>
+</dsig:CanonicalizationMethod>
+
diff --git a/src/lxml/tests/c14n-20/c14nTrim.xml b/src/lxml/tests/c14n-20/c14nTrim.xml
new file mode 100644
index 0000000..ccb9cf6
--- /dev/null
+++ b/src/lxml/tests/c14n-20/c14nTrim.xml
@@ -0,0 +1,4 @@
+<dsig:CanonicalizationMethod xmlns:dsig="http://www.w3.org/2000/09/xmldsig#" xmlns:c14n2="http://www.w3.org/2010/xml-c14n2" Algorithm="http://www.w3.org/2010/xml-c14n2">
+ <c14n2:TrimTextNodes>true</c14n2:TrimTextNodes>
+</dsig:CanonicalizationMethod>
+
diff --git a/src/lxml/tests/c14n-20/doc.dtd b/src/lxml/tests/c14n-20/doc.dtd
new file mode 100644
index 0000000..5c5d544
--- /dev/null
+++ b/src/lxml/tests/c14n-20/doc.dtd
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!ELEMENT doc (#PCDATA)>
+
+
+
diff --git a/src/lxml/tests/c14n-20/doc.xsl b/src/lxml/tests/c14n-20/doc.xsl
new file mode 100644
index 0000000..a3f2348
--- /dev/null
+++ b/src/lxml/tests/c14n-20/doc.xsl
@@ -0,0 +1,5 @@
+<?xml version="1.0"?>
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ >
+</xsl:stylesheet>
diff --git a/src/lxml/tests/c14n-20/inC14N1.xml b/src/lxml/tests/c14n-20/inC14N1.xml
new file mode 100644
index 0000000..ed450c7
--- /dev/null
+++ b/src/lxml/tests/c14n-20/inC14N1.xml
@@ -0,0 +1,14 @@
+<?xml version="1.0"?>
+
+<?xml-stylesheet href="doc.xsl"
+ type="text/xsl" ?>
+
+<!DOCTYPE doc SYSTEM "doc.dtd">
+
+<doc>Hello, world!<!-- Comment 1 --></doc>
+
+<?pi-without-data ?>
+
+<!-- Comment 2 -->
+
+<!-- Comment 3 -->
diff --git a/src/lxml/tests/c14n-20/inC14N2.xml b/src/lxml/tests/c14n-20/inC14N2.xml
new file mode 100644
index 0000000..74eeea1
--- /dev/null
+++ b/src/lxml/tests/c14n-20/inC14N2.xml
@@ -0,0 +1,11 @@
+<doc>
+ <clean> </clean>
+ <dirty> A B </dirty>
+ <mixed>
+ A
+ <clean> </clean>
+ B
+ <dirty> A B </dirty>
+ C
+ </mixed>
+</doc>
diff --git a/src/lxml/tests/c14n-20/inC14N3.xml b/src/lxml/tests/c14n-20/inC14N3.xml
new file mode 100644
index 0000000..fea7821
--- /dev/null
+++ b/src/lxml/tests/c14n-20/inC14N3.xml
@@ -0,0 +1,18 @@
+<!DOCTYPE doc [<!ATTLIST e9 attr CDATA "default">]>
+<doc>
+ <e1 />
+ <e2 ></e2>
+ <e3 name = "elem3" id="elem3" />
+ <e4 name="elem4" id="elem4" ></e4>
+ <e5 a:attr="out" b:attr="sorted" attr2="all" attr="I'm"
+ xmlns:b="http://www.ietf.org"
+ xmlns:a="http://www.w3.org"
+ xmlns="http://example.org"/>
+ <e6 xmlns="" xmlns:a="http://www.w3.org">
+ <e7 xmlns="http://www.ietf.org">
+ <e8 xmlns="" xmlns:a="http://www.w3.org">
+ <e9 xmlns="" xmlns:a="http://www.ietf.org"/>
+ </e8>
+ </e7>
+ </e6>
+</doc>
diff --git a/src/lxml/tests/c14n-20/inC14N4.xml b/src/lxml/tests/c14n-20/inC14N4.xml
new file mode 100644
index 0000000..909a847
--- /dev/null
+++ b/src/lxml/tests/c14n-20/inC14N4.xml
@@ -0,0 +1,13 @@
+<!DOCTYPE doc [
+<!ATTLIST normId id ID #IMPLIED>
+<!ATTLIST normNames attr NMTOKENS #IMPLIED>
+]>
+<doc>
+ <text>First line&#x0d;&#10;Second line</text>
+ <value>&#x32;</value>
+ <compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>
+ <compute expr='value>"0" &amp;&amp; value&lt;"10" ?"valid":"error"'>valid</compute>
+ <norm attr=' &apos; &#x20;&#13;&#xa;&#9; &apos; '/>
+ <normNames attr=' A &#x20;&#13;&#xa;&#9; B '/>
+ <normId id=' &apos;&#x20;&#13;&#xa;&#9; &apos; '/>
+</doc>
diff --git a/src/lxml/tests/c14n-20/inC14N5.xml b/src/lxml/tests/c14n-20/inC14N5.xml
new file mode 100644
index 0000000..501161b
--- /dev/null
+++ b/src/lxml/tests/c14n-20/inC14N5.xml
@@ -0,0 +1,12 @@
+<!DOCTYPE doc [
+<!ATTLIST doc attrExtEnt CDATA #IMPLIED>
+<!ENTITY ent1 "Hello">
+<!ENTITY ent2 SYSTEM "world.txt">
+<!ENTITY entExt SYSTEM "earth.gif" NDATA gif>
+<!NOTATION gif SYSTEM "viewgif.exe">
+]>
+<doc attrExtEnt="entExt">
+ &ent1;, &ent2;!
+</doc>
+
+<!-- Let world.txt contain "world" (excluding the quotes) -->
diff --git a/src/lxml/tests/c14n-20/inC14N6.xml b/src/lxml/tests/c14n-20/inC14N6.xml
new file mode 100644
index 0000000..31e2071
--- /dev/null
+++ b/src/lxml/tests/c14n-20/inC14N6.xml
@@ -0,0 +1,2 @@
+<?xml version="1.0" encoding="ISO-8859-1"?>
+<doc>&#169;</doc>
diff --git a/src/lxml/tests/c14n-20/inNsContent.xml b/src/lxml/tests/c14n-20/inNsContent.xml
new file mode 100644
index 0000000..b992466
--- /dev/null
+++ b/src/lxml/tests/c14n-20/inNsContent.xml
@@ -0,0 +1,4 @@
+<a:foo xmlns:a="http://a" xmlns:b="http://b" xmlns:child="http://c" xmlns:soap-env="http://schemas.xmlsoap.org/wsdl/soap/" xmlns:xsd="http://www.w3.org/2001/XMLSchema">
+ <a:bar>xsd:string</a:bar>
+ <dsig2:IncludedXPath xmlns:dsig2="http://www.w3.org/2010/xmldsig2#">/soap-env:body/child::b:foo[@att1 != "c:val" and @att2 != 'xsd:string']</dsig2:IncludedXPath>
+</a:foo>
diff --git a/src/lxml/tests/c14n-20/inNsDefault.xml b/src/lxml/tests/c14n-20/inNsDefault.xml
new file mode 100644
index 0000000..3e0d323
--- /dev/null
+++ b/src/lxml/tests/c14n-20/inNsDefault.xml
@@ -0,0 +1,3 @@
+<foo xmlns:a="http://a" xmlns:b="http://b">
+ <b:bar b:att1="val" att2="val"/>
+</foo>
diff --git a/src/lxml/tests/c14n-20/inNsPushdown.xml b/src/lxml/tests/c14n-20/inNsPushdown.xml
new file mode 100644
index 0000000..daa67d8
--- /dev/null
+++ b/src/lxml/tests/c14n-20/inNsPushdown.xml
@@ -0,0 +1,6 @@
+<a:foo xmlns:a="http://a" xmlns:b="http://b" xmlns:c="http://c">
+ <b:bar/>
+ <b:bar/>
+ <b:bar/>
+ <a:bar b:att1="val"/>
+</a:foo>
diff --git a/src/lxml/tests/c14n-20/inNsRedecl.xml b/src/lxml/tests/c14n-20/inNsRedecl.xml
new file mode 100644
index 0000000..10bd97b
--- /dev/null
+++ b/src/lxml/tests/c14n-20/inNsRedecl.xml
@@ -0,0 +1,3 @@
+<foo xmlns:a="http://z3" xmlns:b="http://z2" a:att1="val1" b:att2="val2">
+ <bar xmlns="http://z0" xmlns:a="http://z2" a:att1="val1" b:att2="val2" xmlns:b="http://z3" />
+</foo>
diff --git a/src/lxml/tests/c14n-20/inNsSort.xml b/src/lxml/tests/c14n-20/inNsSort.xml
new file mode 100644
index 0000000..8e9fc01
--- /dev/null
+++ b/src/lxml/tests/c14n-20/inNsSort.xml
@@ -0,0 +1,4 @@
+<a:foo xmlns:a="http://z3" xmlns:b="http://z2" b:att1="val1" c:att3="val3" b:att2="val2" xmlns:c="http://z1" xmlns:d="http://z0">
+ <c:bar/>
+ <c:bar d:att3="val3"/>
+</a:foo>
diff --git a/src/lxml/tests/c14n-20/inNsSuperfluous.xml b/src/lxml/tests/c14n-20/inNsSuperfluous.xml
new file mode 100644
index 0000000..f77720f
--- /dev/null
+++ b/src/lxml/tests/c14n-20/inNsSuperfluous.xml
@@ -0,0 +1,4 @@
+<foo xmlns:a="http://z0" xmlns:b="http://z0" a:att1="val1" b:att2="val2" xmlns="http://z0">
+ <c:bar xmlns:a="http://z0" xmlns:c="http://z0" c:att3="val3"/>
+ <d:bar xmlns:d="http://z0"/>
+</foo>
diff --git a/src/lxml/tests/c14n-20/inNsXml.xml b/src/lxml/tests/c14n-20/inNsXml.xml
new file mode 100644
index 0000000..7520cf3
--- /dev/null
+++ b/src/lxml/tests/c14n-20/inNsXml.xml
@@ -0,0 +1,3 @@
+<foo xmlns="http://z0" xml:id="23">
+ <bar xsi:type="xsd:string" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xsd="http://www.w3.org/2001/XMLSchema">data</bar>
+</foo>
diff --git a/src/lxml/tests/c14n-20/out_inC14N1_c14nComment.xml b/src/lxml/tests/c14n-20/out_inC14N1_c14nComment.xml
new file mode 100644
index 0000000..d98d168
--- /dev/null
+++ b/src/lxml/tests/c14n-20/out_inC14N1_c14nComment.xml
@@ -0,0 +1,6 @@
+<?xml-stylesheet href="doc.xsl"
+ type="text/xsl" ?>
+<doc>Hello, world!<!-- Comment 1 --></doc>
+<?pi-without-data?>
+<!-- Comment 2 -->
+<!-- Comment 3 --> \ No newline at end of file
diff --git a/src/lxml/tests/c14n-20/out_inC14N1_c14nDefault.xml b/src/lxml/tests/c14n-20/out_inC14N1_c14nDefault.xml
new file mode 100644
index 0000000..af9a977
--- /dev/null
+++ b/src/lxml/tests/c14n-20/out_inC14N1_c14nDefault.xml
@@ -0,0 +1,4 @@
+<?xml-stylesheet href="doc.xsl"
+ type="text/xsl" ?>
+<doc>Hello, world!</doc>
+<?pi-without-data?> \ No newline at end of file
diff --git a/src/lxml/tests/c14n-20/out_inC14N2_c14nDefault.xml b/src/lxml/tests/c14n-20/out_inC14N2_c14nDefault.xml
new file mode 100644
index 0000000..2afa15c
--- /dev/null
+++ b/src/lxml/tests/c14n-20/out_inC14N2_c14nDefault.xml
@@ -0,0 +1,11 @@
+<doc>
+ <clean> </clean>
+ <dirty> A B </dirty>
+ <mixed>
+ A
+ <clean> </clean>
+ B
+ <dirty> A B </dirty>
+ C
+ </mixed>
+</doc> \ No newline at end of file
diff --git a/src/lxml/tests/c14n-20/out_inC14N2_c14nTrim.xml b/src/lxml/tests/c14n-20/out_inC14N2_c14nTrim.xml
new file mode 100644
index 0000000..7a1dc32
--- /dev/null
+++ b/src/lxml/tests/c14n-20/out_inC14N2_c14nTrim.xml
@@ -0,0 +1 @@
+<doc><clean></clean><dirty>A B</dirty><mixed>A<clean></clean>B<dirty>A B</dirty>C</mixed></doc> \ No newline at end of file
diff --git a/src/lxml/tests/c14n-20/out_inC14N3_c14nDefault.xml b/src/lxml/tests/c14n-20/out_inC14N3_c14nDefault.xml
new file mode 100644
index 0000000..662e108
--- /dev/null
+++ b/src/lxml/tests/c14n-20/out_inC14N3_c14nDefault.xml
@@ -0,0 +1,14 @@
+<doc>
+ <e1></e1>
+ <e2></e2>
+ <e3 id="elem3" name="elem3"></e3>
+ <e4 id="elem4" name="elem4"></e4>
+ <e5 xmlns="http://example.org" xmlns:a="http://www.w3.org" xmlns:b="http://www.ietf.org" attr="I'm" attr2="all" b:attr="sorted" a:attr="out"></e5>
+ <e6>
+ <e7 xmlns="http://www.ietf.org">
+ <e8 xmlns="">
+ <e9 attr="default"></e9>
+ </e8>
+ </e7>
+ </e6>
+</doc> \ No newline at end of file
diff --git a/src/lxml/tests/c14n-20/out_inC14N3_c14nPrefix.xml b/src/lxml/tests/c14n-20/out_inC14N3_c14nPrefix.xml
new file mode 100644
index 0000000..041e1ec
--- /dev/null
+++ b/src/lxml/tests/c14n-20/out_inC14N3_c14nPrefix.xml
@@ -0,0 +1,14 @@
+<n0:doc xmlns:n0="">
+ <n0:e1></n0:e1>
+ <n0:e2></n0:e2>
+ <n0:e3 id="elem3" name="elem3"></n0:e3>
+ <n0:e4 id="elem4" name="elem4"></n0:e4>
+ <n1:e5 xmlns:n1="http://example.org" xmlns:n2="http://www.ietf.org" xmlns:n3="http://www.w3.org" attr="I'm" attr2="all" n2:attr="sorted" n3:attr="out"></n1:e5>
+ <n0:e6>
+ <n2:e7 xmlns:n2="http://www.ietf.org">
+ <n0:e8>
+ <n0:e9 attr="default"></n0:e9>
+ </n0:e8>
+ </n2:e7>
+ </n0:e6>
+</n0:doc> \ No newline at end of file
diff --git a/src/lxml/tests/c14n-20/out_inC14N3_c14nTrim.xml b/src/lxml/tests/c14n-20/out_inC14N3_c14nTrim.xml
new file mode 100644
index 0000000..4f35ad9
--- /dev/null
+++ b/src/lxml/tests/c14n-20/out_inC14N3_c14nTrim.xml
@@ -0,0 +1 @@
+<doc><e1></e1><e2></e2><e3 id="elem3" name="elem3"></e3><e4 id="elem4" name="elem4"></e4><e5 xmlns="http://example.org" xmlns:a="http://www.w3.org" xmlns:b="http://www.ietf.org" attr="I'm" attr2="all" b:attr="sorted" a:attr="out"></e5><e6><e7 xmlns="http://www.ietf.org"><e8 xmlns=""><e9 attr="default"></e9></e8></e7></e6></doc> \ No newline at end of file
diff --git a/src/lxml/tests/c14n-20/out_inC14N4_c14nDefault.xml b/src/lxml/tests/c14n-20/out_inC14N4_c14nDefault.xml
new file mode 100644
index 0000000..243d0e6
--- /dev/null
+++ b/src/lxml/tests/c14n-20/out_inC14N4_c14nDefault.xml
@@ -0,0 +1,10 @@
+<doc>
+ <text>First line&#xD;
+Second line</text>
+ <value>2</value>
+ <compute>value&gt;"0" &amp;&amp; value&lt;"10" ?"valid":"error"</compute>
+ <compute expr="value>&quot;0&quot; &amp;&amp; value&lt;&quot;10&quot; ?&quot;valid&quot;:&quot;error&quot;">valid</compute>
+ <norm attr=" ' &#xD;&#xA;&#x9; ' "></norm>
+ <normNames attr="A &#xD;&#xA;&#x9; B"></normNames>
+ <normId id="' &#xD;&#xA;&#x9; '"></normId>
+</doc> \ No newline at end of file
diff --git a/src/lxml/tests/c14n-20/out_inC14N4_c14nTrim.xml b/src/lxml/tests/c14n-20/out_inC14N4_c14nTrim.xml
new file mode 100644
index 0000000..24d83ba
--- /dev/null
+++ b/src/lxml/tests/c14n-20/out_inC14N4_c14nTrim.xml
@@ -0,0 +1,2 @@
+<doc><text>First line&#xD;
+Second line</text><value>2</value><compute>value&gt;"0" &amp;&amp; value&lt;"10" ?"valid":"error"</compute><compute expr="value>&quot;0&quot; &amp;&amp; value&lt;&quot;10&quot; ?&quot;valid&quot;:&quot;error&quot;">valid</compute><norm attr=" ' &#xD;&#xA;&#x9; ' "></norm><normNames attr="A &#xD;&#xA;&#x9; B"></normNames><normId id="' &#xD;&#xA;&#x9; '"></normId></doc> \ No newline at end of file
diff --git a/src/lxml/tests/c14n-20/out_inC14N5_c14nDefault.xml b/src/lxml/tests/c14n-20/out_inC14N5_c14nDefault.xml
new file mode 100644
index 0000000..c232e74
--- /dev/null
+++ b/src/lxml/tests/c14n-20/out_inC14N5_c14nDefault.xml
@@ -0,0 +1,3 @@
+<doc attrExtEnt="entExt">
+ Hello, world!
+</doc> \ No newline at end of file
diff --git a/src/lxml/tests/c14n-20/out_inC14N5_c14nTrim.xml b/src/lxml/tests/c14n-20/out_inC14N5_c14nTrim.xml
new file mode 100644
index 0000000..3fa84b1
--- /dev/null
+++ b/src/lxml/tests/c14n-20/out_inC14N5_c14nTrim.xml
@@ -0,0 +1 @@
+<doc attrExtEnt="entExt">Hello, world!</doc> \ No newline at end of file
diff --git a/src/lxml/tests/c14n-20/out_inC14N6_c14nDefault.xml b/src/lxml/tests/c14n-20/out_inC14N6_c14nDefault.xml
new file mode 100644
index 0000000..0be38f9
--- /dev/null
+++ b/src/lxml/tests/c14n-20/out_inC14N6_c14nDefault.xml
@@ -0,0 +1 @@
+<doc>©</doc> \ No newline at end of file
diff --git a/src/lxml/tests/c14n-20/out_inNsContent_c14nDefault.xml b/src/lxml/tests/c14n-20/out_inNsContent_c14nDefault.xml
new file mode 100644
index 0000000..62d7e00
--- /dev/null
+++ b/src/lxml/tests/c14n-20/out_inNsContent_c14nDefault.xml
@@ -0,0 +1,4 @@
+<a:foo xmlns:a="http://a">
+ <a:bar>xsd:string</a:bar>
+ <dsig2:IncludedXPath xmlns:dsig2="http://www.w3.org/2010/xmldsig2#">/soap-env:body/child::b:foo[@att1 != "c:val" and @att2 != 'xsd:string']</dsig2:IncludedXPath>
+</a:foo> \ No newline at end of file
diff --git a/src/lxml/tests/c14n-20/out_inNsContent_c14nPrefixQnameXpathElem.xml b/src/lxml/tests/c14n-20/out_inNsContent_c14nPrefixQnameXpathElem.xml
new file mode 100644
index 0000000..20e1c2e
--- /dev/null
+++ b/src/lxml/tests/c14n-20/out_inNsContent_c14nPrefixQnameXpathElem.xml
@@ -0,0 +1,4 @@
+<n0:foo xmlns:n0="http://a">
+ <n0:bar xmlns:n1="http://www.w3.org/2001/XMLSchema">n1:string</n0:bar>
+ <n4:IncludedXPath xmlns:n2="http://b" xmlns:n3="http://schemas.xmlsoap.org/wsdl/soap/" xmlns:n4="http://www.w3.org/2010/xmldsig2#">/n3:body/child::n2:foo[@att1 != "c:val" and @att2 != 'xsd:string']</n4:IncludedXPath>
+</n0:foo> \ No newline at end of file
diff --git a/src/lxml/tests/c14n-20/out_inNsContent_c14nQnameElem.xml b/src/lxml/tests/c14n-20/out_inNsContent_c14nQnameElem.xml
new file mode 100644
index 0000000..db8680d
--- /dev/null
+++ b/src/lxml/tests/c14n-20/out_inNsContent_c14nQnameElem.xml
@@ -0,0 +1,4 @@
+<a:foo xmlns:a="http://a">
+ <a:bar xmlns:xsd="http://www.w3.org/2001/XMLSchema">xsd:string</a:bar>
+ <dsig2:IncludedXPath xmlns:dsig2="http://www.w3.org/2010/xmldsig2#">/soap-env:body/child::b:foo[@att1 != "c:val" and @att2 != 'xsd:string']</dsig2:IncludedXPath>
+</a:foo> \ No newline at end of file
diff --git a/src/lxml/tests/c14n-20/out_inNsContent_c14nQnameXpathElem.xml b/src/lxml/tests/c14n-20/out_inNsContent_c14nQnameXpathElem.xml
new file mode 100644
index 0000000..df3b215
--- /dev/null
+++ b/src/lxml/tests/c14n-20/out_inNsContent_c14nQnameXpathElem.xml
@@ -0,0 +1,4 @@
+<a:foo xmlns:a="http://a">
+ <a:bar xmlns:xsd="http://www.w3.org/2001/XMLSchema">xsd:string</a:bar>
+ <dsig2:IncludedXPath xmlns:b="http://b" xmlns:dsig2="http://www.w3.org/2010/xmldsig2#" xmlns:soap-env="http://schemas.xmlsoap.org/wsdl/soap/">/soap-env:body/child::b:foo[@att1 != "c:val" and @att2 != 'xsd:string']</dsig2:IncludedXPath>
+</a:foo> \ No newline at end of file
diff --git a/src/lxml/tests/c14n-20/out_inNsDefault_c14nDefault.xml b/src/lxml/tests/c14n-20/out_inNsDefault_c14nDefault.xml
new file mode 100644
index 0000000..674b076
--- /dev/null
+++ b/src/lxml/tests/c14n-20/out_inNsDefault_c14nDefault.xml
@@ -0,0 +1,3 @@
+<foo>
+ <b:bar xmlns:b="http://b" att2="val" b:att1="val"></b:bar>
+</foo> \ No newline at end of file
diff --git a/src/lxml/tests/c14n-20/out_inNsDefault_c14nPrefix.xml b/src/lxml/tests/c14n-20/out_inNsDefault_c14nPrefix.xml
new file mode 100644
index 0000000..83edaae
--- /dev/null
+++ b/src/lxml/tests/c14n-20/out_inNsDefault_c14nPrefix.xml
@@ -0,0 +1,3 @@
+<n0:foo xmlns:n0="">
+ <n1:bar xmlns:n1="http://b" att2="val" n1:att1="val"></n1:bar>
+</n0:foo> \ No newline at end of file
diff --git a/src/lxml/tests/c14n-20/out_inNsPushdown_c14nDefault.xml b/src/lxml/tests/c14n-20/out_inNsPushdown_c14nDefault.xml
new file mode 100644
index 0000000..fa4f21b
--- /dev/null
+++ b/src/lxml/tests/c14n-20/out_inNsPushdown_c14nDefault.xml
@@ -0,0 +1,6 @@
+<a:foo xmlns:a="http://a">
+ <b:bar xmlns:b="http://b"></b:bar>
+ <b:bar xmlns:b="http://b"></b:bar>
+ <b:bar xmlns:b="http://b"></b:bar>
+ <a:bar xmlns:b="http://b" b:att1="val"></a:bar>
+</a:foo> \ No newline at end of file
diff --git a/src/lxml/tests/c14n-20/out_inNsPushdown_c14nPrefix.xml b/src/lxml/tests/c14n-20/out_inNsPushdown_c14nPrefix.xml
new file mode 100644
index 0000000..6d57920
--- /dev/null
+++ b/src/lxml/tests/c14n-20/out_inNsPushdown_c14nPrefix.xml
@@ -0,0 +1,6 @@
+<n0:foo xmlns:n0="http://a">
+ <n1:bar xmlns:n1="http://b"></n1:bar>
+ <n1:bar xmlns:n1="http://b"></n1:bar>
+ <n1:bar xmlns:n1="http://b"></n1:bar>
+ <n0:bar xmlns:n1="http://b" n1:att1="val"></n0:bar>
+</n0:foo> \ No newline at end of file
diff --git a/src/lxml/tests/c14n-20/out_inNsRedecl_c14nDefault.xml b/src/lxml/tests/c14n-20/out_inNsRedecl_c14nDefault.xml
new file mode 100644
index 0000000..ba37f92
--- /dev/null
+++ b/src/lxml/tests/c14n-20/out_inNsRedecl_c14nDefault.xml
@@ -0,0 +1,3 @@
+<foo xmlns:a="http://z3" xmlns:b="http://z2" b:att2="val2" a:att1="val1">
+ <bar xmlns="http://z0" xmlns:a="http://z2" xmlns:b="http://z3" a:att1="val1" b:att2="val2"></bar>
+</foo> \ No newline at end of file
diff --git a/src/lxml/tests/c14n-20/out_inNsRedecl_c14nPrefix.xml b/src/lxml/tests/c14n-20/out_inNsRedecl_c14nPrefix.xml
new file mode 100644
index 0000000..af3bb2d
--- /dev/null
+++ b/src/lxml/tests/c14n-20/out_inNsRedecl_c14nPrefix.xml
@@ -0,0 +1,3 @@
+<n0:foo xmlns:n0="" xmlns:n1="http://z2" xmlns:n2="http://z3" n1:att2="val2" n2:att1="val1">
+ <n3:bar xmlns:n3="http://z0" n1:att1="val1" n2:att2="val2"></n3:bar>
+</n0:foo> \ No newline at end of file
diff --git a/src/lxml/tests/c14n-20/out_inNsSort_c14nDefault.xml b/src/lxml/tests/c14n-20/out_inNsSort_c14nDefault.xml
new file mode 100644
index 0000000..8a92c5c
--- /dev/null
+++ b/src/lxml/tests/c14n-20/out_inNsSort_c14nDefault.xml
@@ -0,0 +1,4 @@
+<a:foo xmlns:a="http://z3" xmlns:b="http://z2" xmlns:c="http://z1" c:att3="val3" b:att1="val1" b:att2="val2">
+ <c:bar></c:bar>
+ <c:bar xmlns:d="http://z0" d:att3="val3"></c:bar>
+</a:foo> \ No newline at end of file
diff --git a/src/lxml/tests/c14n-20/out_inNsSort_c14nPrefix.xml b/src/lxml/tests/c14n-20/out_inNsSort_c14nPrefix.xml
new file mode 100644
index 0000000..8d44c84
--- /dev/null
+++ b/src/lxml/tests/c14n-20/out_inNsSort_c14nPrefix.xml
@@ -0,0 +1,4 @@
+<n2:foo xmlns:n0="http://z1" xmlns:n1="http://z2" xmlns:n2="http://z3" n0:att3="val3" n1:att1="val1" n1:att2="val2">
+ <n0:bar></n0:bar>
+ <n0:bar xmlns:n3="http://z0" n3:att3="val3"></n0:bar>
+</n2:foo> \ No newline at end of file
diff --git a/src/lxml/tests/c14n-20/out_inNsSuperfluous_c14nDefault.xml b/src/lxml/tests/c14n-20/out_inNsSuperfluous_c14nDefault.xml
new file mode 100644
index 0000000..6bb862d
--- /dev/null
+++ b/src/lxml/tests/c14n-20/out_inNsSuperfluous_c14nDefault.xml
@@ -0,0 +1,4 @@
+<foo xmlns="http://z0" xmlns:a="http://z0" xmlns:b="http://z0" a:att1="val1" b:att2="val2">
+ <c:bar xmlns:c="http://z0" c:att3="val3"></c:bar>
+ <d:bar xmlns:d="http://z0"></d:bar>
+</foo> \ No newline at end of file
diff --git a/src/lxml/tests/c14n-20/out_inNsSuperfluous_c14nPrefix.xml b/src/lxml/tests/c14n-20/out_inNsSuperfluous_c14nPrefix.xml
new file mode 100644
index 0000000..700a16d
--- /dev/null
+++ b/src/lxml/tests/c14n-20/out_inNsSuperfluous_c14nPrefix.xml
@@ -0,0 +1,4 @@
+<n0:foo xmlns:n0="http://z0" n0:att1="val1" n0:att2="val2">
+ <n0:bar n0:att3="val3"></n0:bar>
+ <n0:bar></n0:bar>
+</n0:foo> \ No newline at end of file
diff --git a/src/lxml/tests/c14n-20/out_inNsXml_c14nDefault.xml b/src/lxml/tests/c14n-20/out_inNsXml_c14nDefault.xml
new file mode 100644
index 0000000..1689f3b
--- /dev/null
+++ b/src/lxml/tests/c14n-20/out_inNsXml_c14nDefault.xml
@@ -0,0 +1,3 @@
+<foo xmlns="http://z0" xml:id="23">
+ <bar xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="xsd:string">data</bar>
+</foo> \ No newline at end of file
diff --git a/src/lxml/tests/c14n-20/out_inNsXml_c14nPrefix.xml b/src/lxml/tests/c14n-20/out_inNsXml_c14nPrefix.xml
new file mode 100644
index 0000000..38508a4
--- /dev/null
+++ b/src/lxml/tests/c14n-20/out_inNsXml_c14nPrefix.xml
@@ -0,0 +1,3 @@
+<n0:foo xmlns:n0="http://z0" xml:id="23">
+ <n0:bar xmlns:n1="http://www.w3.org/2001/XMLSchema-instance" n1:type="xsd:string">data</n0:bar>
+</n0:foo> \ No newline at end of file
diff --git a/src/lxml/tests/c14n-20/out_inNsXml_c14nPrefixQname.xml b/src/lxml/tests/c14n-20/out_inNsXml_c14nPrefixQname.xml
new file mode 100644
index 0000000..867980f
--- /dev/null
+++ b/src/lxml/tests/c14n-20/out_inNsXml_c14nPrefixQname.xml
@@ -0,0 +1,3 @@
+<n0:foo xmlns:n0="http://z0" xml:id="23">
+ <n0:bar xmlns:n1="http://www.w3.org/2001/XMLSchema" xmlns:n2="http://www.w3.org/2001/XMLSchema-instance" n2:type="n1:string">data</n0:bar>
+</n0:foo> \ No newline at end of file
diff --git a/src/lxml/tests/c14n-20/out_inNsXml_c14nQname.xml b/src/lxml/tests/c14n-20/out_inNsXml_c14nQname.xml
new file mode 100644
index 0000000..0300f9d
--- /dev/null
+++ b/src/lxml/tests/c14n-20/out_inNsXml_c14nQname.xml
@@ -0,0 +1,3 @@
+<foo xmlns="http://z0" xml:id="23">
+ <bar xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="xsd:string">data</bar>
+</foo> \ No newline at end of file
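
The *Qname* outputs above additionally treat selected attribute values and element text as namespace-qualified names, so their prefixes are kept declared (or rewritten) together with the element prefixes. A hedged sketch of the corresponding qname-aware options of etree.canonicalize; the input file name and the attribute selection below are illustrative assumptions.

    from lxml import etree

    # QName-aware canonicalisation: the value of xsi:type ("xsd:string") is treated
    # as a qualified name, so the xsd prefix declaration it needs is emitted.
    print(etree.canonicalize(
        from_file='inNsXml.xml',                                          # hypothetical input file
        qname_aware_attrs=['{http://www.w3.org/2001/XMLSchema-instance}type'],
    ))

    # Adding rewrite_prefixes=True on top should give the *PrefixQname* variant,
    # where the same value is rewritten to the generated n1:string form.
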
diff --git a/src/lxml/tests/c14n-20/world.txt b/src/lxml/tests/c14n-20/world.txt
new file mode 100644
index 0000000..04fea06
--- /dev/null
+++ b/src/lxml/tests/c14n-20/world.txt
@@ -0,0 +1 @@
+world \ No newline at end of file
diff --git a/src/lxml/tests/common_imports.py b/src/lxml/tests/common_imports.py
new file mode 100644
index 0000000..0a6cbbf
--- /dev/null
+++ b/src/lxml/tests/common_imports.py
@@ -0,0 +1,284 @@
+"""
+Common helpers and adaptations for Py2/3.
+To be used in tests.
+"""
+
+# Slows down test runs by factors. Enable to debug proxy handling issues.
+DEBUG_PROXY_ISSUES = False # True
+
+import gc
+import os
+import os.path
+import re
+import sys
+import tempfile
+import unittest
+from contextlib import contextmanager
+
+try:
+ import urlparse
+except ImportError:
+ import urllib.parse as urlparse
+
+try:
+ from urllib import pathname2url
+except:
+ from urllib.request import pathname2url
+
+from lxml import etree, html
+
+def make_version_tuple(version_string):
+ return tuple(
+ int(part) if part.isdigit() else part
+ for part in re.findall('([0-9]+|[^0-9.]+)', version_string)
+ )
+
+IS_PYPY = (getattr(sys, 'implementation', None) == 'pypy' or
+ getattr(sys, 'pypy_version_info', None) is not None)
+
+IS_PYTHON3 = sys.version_info[0] >= 3
+IS_PYTHON2 = sys.version_info[0] < 3
+
+from xml.etree import ElementTree
+
+if hasattr(ElementTree, 'VERSION'):
+ ET_VERSION = make_version_tuple(ElementTree.VERSION)
+else:
+ ET_VERSION = (0,0,0)
+
+if IS_PYTHON2:
+ from xml.etree import cElementTree
+
+ if hasattr(cElementTree, 'VERSION'):
+ CET_VERSION = make_version_tuple(cElementTree.VERSION)
+ else:
+ CET_VERSION = (0,0,0)
+else:
+ CET_VERSION = (0, 0, 0)
+ cElementTree = None
+
+
+def filter_by_version(test_class, version_dict, current_version):
+ """Remove test methods that do not work with the current lib version.
+ """
+ find_required_version = version_dict.get
+ def dummy_test_method(self):
+ pass
+ for name in dir(test_class):
+ expected_version = find_required_version(name, (0,0,0))
+ if expected_version > current_version:
+ setattr(test_class, name, dummy_test_method)
+
+import doctest
+
+try:
+ import pytest
+except ImportError:
+ class skipif(object):
+ "Using a class because a function would bind into a method when used in classes"
+ def __init__(self, *args): pass
+ def __call__(self, func, *args): return func
+else:
+ skipif = pytest.mark.skipif
+
+def _get_caller_relative_path(filename, frame_depth=2):
+ module = sys.modules[sys._getframe(frame_depth).f_globals['__name__']]
+ return os.path.normpath(os.path.join(
+ os.path.dirname(getattr(module, '__file__', '')), filename))
+
+from io import StringIO
+
+unichr_escape = re.compile(r'\\u[0-9a-fA-F]{4}|\\U[0-9a-fA-F]{8}')
+
+if sys.version_info[0] >= 3:
+ # Python 3
+ from builtins import str as unicode
+ from codecs import unicode_escape_decode
+ _chr = chr
+ def _str(s, encoding="UTF-8"):
+ return unichr_escape.sub(lambda x: unicode_escape_decode(x.group(0))[0], s)
+ def _bytes(s, encoding="UTF-8"):
+ return s.encode(encoding)
+ from io import BytesIO as _BytesIO
+ def BytesIO(*args):
+ if args and isinstance(args[0], str):
+ args = (args[0].encode("UTF-8"),)
+ return _BytesIO(*args)
+
+ doctest_parser = doctest.DocTestParser()
+ _fix_unicode = re.compile(r'(\s+)u(["\'])').sub
+ _fix_exceptions = re.compile(r'(.*except [^(]*),\s*(.*:)').sub
+ def make_doctest(filename):
+ filename = _get_caller_relative_path(filename)
+ doctests = read_file(filename)
+ doctests = _fix_unicode(r'\1\2', doctests)
+ doctests = _fix_exceptions(r'\1 as \2', doctests)
+ return doctest.DocTestCase(
+ doctest_parser.get_doctest(
+ doctests, {}, os.path.basename(filename), filename, 0))
+else:
+ # Python 2
+ from __builtin__ import unicode
+ _chr = unichr
+ def _str(s, encoding="UTF-8"):
+ s = unicode(s, encoding=encoding)
+ return unichr_escape.sub(lambda x:
+ x.group(0).decode('unicode-escape'),
+ s)
+ def _bytes(s, encoding="UTF-8"):
+ return s
+ from io import BytesIO
+
+ doctest_parser = doctest.DocTestParser()
+ _fix_traceback = re.compile(r'^(\s*)(?:\w+\.)+(\w*(?:Error|Exception|Invalid):)', re.M).sub
+ _fix_exceptions = re.compile(r'(.*except [^(]*)\s+as\s+(.*:)').sub
+ _fix_bytes = re.compile(r'(\s+)b(["\'])').sub
+ def make_doctest(filename):
+ filename = _get_caller_relative_path(filename)
+ doctests = read_file(filename)
+ doctests = _fix_traceback(r'\1\2', doctests)
+ doctests = _fix_exceptions(r'\1, \2', doctests)
+ doctests = _fix_bytes(r'\1\2', doctests)
+ return doctest.DocTestCase(
+ doctest_parser.get_doctest(
+ doctests, {}, os.path.basename(filename), filename, 0))
+
+try:
+ skipIf = unittest.skipIf
+except AttributeError:
+ def skipIf(condition, why):
+ def _skip(thing):
+ import types
+ if isinstance(thing, (type, types.ClassType)):
+ return type(thing.__name__, (object,), {})
+ else:
+ return None
+ if condition:
+ return _skip
+ return lambda thing: thing
+
+
+class HelperTestCase(unittest.TestCase):
+ def tearDown(self):
+ if DEBUG_PROXY_ISSUES:
+ gc.collect()
+
+ def parse(self, text, parser=None):
+ f = BytesIO(text) if isinstance(text, bytes) else StringIO(text)
+ return etree.parse(f, parser=parser)
+
+ def _rootstring(self, tree):
+ return etree.tostring(tree.getroot()).replace(
+ _bytes(' '), _bytes('')).replace(_bytes('\n'), _bytes(''))
+
+
+class SillyFileLike:
+ def __init__(self, xml_data=_bytes('<foo><bar/></foo>')):
+ self.xml_data = xml_data
+
+ def read(self, amount=None):
+ if self.xml_data:
+ if amount:
+ data = self.xml_data[:amount]
+ self.xml_data = self.xml_data[amount:]
+ else:
+ data = self.xml_data
+ self.xml_data = _bytes('')
+ return data
+ return _bytes('')
+
+class LargeFileLike:
+ def __init__(self, charlen=100, depth=4, children=5):
+ self.data = BytesIO()
+ self.chars = _bytes('a') * charlen
+ self.children = range(children)
+ self.more = self.iterelements(depth)
+
+ def iterelements(self, depth):
+ yield _bytes('<root>')
+ depth -= 1
+ if depth > 0:
+ for child in self.children:
+ for element in self.iterelements(depth):
+ yield element
+ yield self.chars
+ else:
+ yield self.chars
+ yield _bytes('</root>')
+
+ def read(self, amount=None):
+ data = self.data
+ append = data.write
+ if amount:
+ for element in self.more:
+ append(element)
+ if data.tell() >= amount:
+ break
+ else:
+ for element in self.more:
+ append(element)
+ result = data.getvalue()
+ data.seek(0)
+ data.truncate()
+ if amount:
+ append(result[amount:])
+ result = result[:amount]
+ return result
+
+class LargeFileLikeUnicode(LargeFileLike):
+ def __init__(self, charlen=100, depth=4, children=5):
+ LargeFileLike.__init__(self, charlen, depth, children)
+ self.data = StringIO()
+ self.chars = _str('a') * charlen
+ self.more = self.iterelements(depth)
+
+ def iterelements(self, depth):
+ yield _str('<root>')
+ depth -= 1
+ if depth > 0:
+ for child in self.children:
+ for element in self.iterelements(depth):
+ yield element
+ yield self.chars
+ else:
+ yield self.chars
+ yield _str('</root>')
+
+def fileInTestDir(name):
+ _testdir = os.path.dirname(__file__)
+ return os.path.join(_testdir, name)
+
+def path2url(path):
+ return urlparse.urljoin(
+ 'file:', pathname2url(path))
+
+def fileUrlInTestDir(name):
+ return path2url(fileInTestDir(name))
+
+def read_file(name, mode='r'):
+ with open(name, mode) as f:
+ data = f.read()
+ return data
+
+def write_to_file(name, data, mode='w'):
+ with open(name, mode) as f:
+ f.write(data)
+
+def readFileInTestDir(name, mode='r'):
+ return read_file(fileInTestDir(name), mode)
+
+def canonicalize(xml):
+ tree = etree.parse(BytesIO(xml) if isinstance(xml, bytes) else StringIO(xml))
+ f = BytesIO()
+ tree.write_c14n(f)
+ return f.getvalue()
+
+
+@contextmanager
+def tmpfile(**kwargs):
+ handle, filename = tempfile.mkstemp(**kwargs)
+ try:
+ yield filename
+ finally:
+ os.close(handle)
+ os.remove(filename)
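
common_imports.py above centralises the Py2/Py3 shims and unittest helpers that the rest of the test suite builds on. A small sketch of how a test module would typically use them; the class, test names and the exact import path are hypothetical and only illustrate the intended usage.

    import unittest

    from lxml import etree
    from lxml.tests.common_imports import HelperTestCase, _bytes, tmpfile

    class ExampleTestCase(HelperTestCase):
        def test_parse_and_roundtrip(self):
            # HelperTestCase.parse() wraps bytes/text input in the matching IO class.
            tree = self.parse(_bytes('<root><a/> <b/></root>'))
            self.assertEqual(self._rootstring(tree), _bytes('<root><a/><b/></root>'))

        def test_write_to_tempfile(self):
            # tmpfile() yields a temporary file name and deletes the file afterwards.
            with tmpfile(suffix='.xml') as filename:
                etree.ElementTree(etree.XML('<doc/>')).write(filename)
                with open(filename, 'rb') as f:
                    self.assertTrue(f.read().startswith(_bytes('<doc')))

    if __name__ == '__main__':
        unittest.main()
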
diff --git a/src/lxml/tests/dummy_http_server.py b/src/lxml/tests/dummy_http_server.py
new file mode 100644
index 0000000..70ef8d6
--- /dev/null
+++ b/src/lxml/tests/dummy_http_server.py
@@ -0,0 +1,84 @@
+"""
+Simple HTTP request dumper for tests.
+"""
+
+import sys
+from contextlib import contextmanager
+
+try:
+ import urlparse
+except ImportError:
+ # Python 3
+ import urllib.parse as urlparse
+
+
+@contextmanager
+def webserver(app, port=0, host=None):
+ """Context manager entry point for the 'with' statement.
+
+ Pass 0 as port number to dynamically allocate a free port.
+
+ Usage:
+
+ with webserver(wsgi_app_function, 8080) as host_url:
+ do_ws_calls(host_url)
+ """
+ server = build_web_server(app, port, host or '127.0.0.1')
+ host, port = server.socket.getsockname()
+
+ import threading
+ thread = threading.Thread(target=server.serve_forever,
+ kwargs={'poll_interval': 0.5})
+ thread.setDaemon(True)
+ thread.start()
+ try:
+ yield 'http://%s:%s/' % (host, port) # yield control to 'with' body
+ finally:
+ server.shutdown()
+ server.server_close()
+ thread.join(timeout=1)
+
+
+try:
+ from SocketServer import ThreadingMixIn
+except ImportError:
+ # Python 3
+ from socketserver import ThreadingMixIn
+
+import wsgiref.simple_server as wsgiserver
+class WebServer(wsgiserver.WSGIServer, ThreadingMixIn):
+ """A web server that starts a new thread for each request.
+ """
+
+
+class _RequestHandler(wsgiserver.WSGIRequestHandler):
+ def get_stderr(self):
+ # don't write to stderr
+ return sys.stdout
+
+ def log_message(self, format, *args):
+ # message = "wsmock(%s) %s" % (self.address_string(), format % args)
+ pass # don't log messages
+
+
+def build_web_server(app, port, host=None):
+ server = wsgiserver.make_server(
+ host or '', port, app,
+ server_class=WebServer,
+ handler_class=_RequestHandler)
+ return server
+
+
+class HTTPRequestCollector(object):
+ def __init__(self, response_data, response_code=200, headers=()):
+ self.requests = []
+ self.response_code = response_code
+ self.response_data = response_data
+ self.headers = list(headers or ())
+
+ def __call__(self, environ, start_response):
+ self.requests.append((
+ environ.get('PATH_INFO'),
+ urlparse.parse_qsl(environ.get('QUERY_STRING'))))
+ start_response('%s OK' % self.response_code, self.headers)
+ return [self.response_data]
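
dummy_http_server.py gives the tests a throwaway in-process WSGI server. A sketch of the usual pattern, serving a canned XML response, letting the parser fetch it over HTTP and then asserting on what was requested; the test function itself is hypothetical and not part of this diff.

    from lxml import etree
    from lxml.tests.dummy_http_server import webserver, HTTPRequestCollector

    def test_parse_over_http():
        collector = HTTPRequestCollector(
            b'<root><child/></root>',
            headers=[('Content-Type', 'text/xml')])
        with webserver(collector) as host_url:   # port 0 -> a free port is picked
            tree = etree.parse(host_url + 'doc.xml')
        # The collector records every (path, query) pair it served.
        assert collector.requests == [('/doc.xml', [])]
        assert tree.getroot().tag == 'root'
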
diff --git a/src/lxml/tests/include/test_xinclude.xml b/src/lxml/tests/include/test_xinclude.xml
new file mode 100644
index 0000000..1cc05cf
--- /dev/null
+++ b/src/lxml/tests/include/test_xinclude.xml
@@ -0,0 +1,4 @@
+<doc xmlns:xi="http://www.w3.org/2001/XInclude">
+<foo/>
+<xi:include href="../test.xml" />
+</doc> \ No newline at end of file
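
The fixture above pulls in ../test.xml relative to the include/ directory. A short sketch of how XInclude processing is driven from lxml (run from the tests directory; the script is illustrative, not part of this diff).

    from lxml import etree

    tree = etree.parse('include/test_xinclude.xml')
    tree.xinclude()   # resolves the <xi:include href="../test.xml"/> reference in place
    print(etree.tostring(tree, pretty_print=True).decode('utf-8'))
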
diff --git a/src/lxml/tests/selftest.py b/src/lxml/tests/selftest.py
new file mode 100644
index 0000000..6ee0ff6
--- /dev/null
+++ b/src/lxml/tests/selftest.py
@@ -0,0 +1,1253 @@
+# $Id: selftest.py 3276 2007-09-12 06:52:30Z fredrik $
+# -*- coding: iso-8859-1 -*-
+# elementtree selftest program
+
+# this test script uses Python's "doctest" module to check that the
+# *test script* works as expected.
+
+# TODO: add more elementtree method tests
+# TODO: add xml/html parsing tests
+# TODO: etc
+
+import re, sys
+
+def stdout():
+ if sys.version_info[0] < 3:
+ return sys.stdout
+ class bytes_stdout(object):
+ def write(self, data):
+ if isinstance(data, bytes):
+ data = data.decode('ISO8859-1')
+ sys.stdout.write(data)
+ return bytes_stdout()
+
+try:
+ from StringIO import StringIO as BytesIO
+except ImportError:
+ from io import BytesIO
+
+from lxml import etree as ElementTree
+from lxml import _elementpath as ElementPath
+from lxml import ElementInclude
+ET = ElementTree
+
+#from elementtree import ElementTree
+#from elementtree import ElementPath
+#from elementtree import ElementInclude
+#from elementtree import HTMLTreeBuilder
+#from elementtree import SimpleXMLWriter
+
+def fix_compatibility(xml_data):
+ xml_data = re.sub(r'\s*xmlns:[a-z0-9]+="http://www.w3.org/2001/XInclude"', '', xml_data)
+ xml_data = xml_data.replace(' />', '/>')
+ if xml_data[-1:] == '\n':
+ xml_data = xml_data[:-1]
+ return xml_data
+
+def serialize(elem, **options):
+ file = BytesIO()
+ tree = ElementTree.ElementTree(elem)
+ tree.write(file, **options)
+ if sys.version_info[0] < 3:
+ try:
+ encoding = options["encoding"]
+ except KeyError:
+ encoding = "utf-8"
+ else:
+ encoding = 'ISO8859-1'
+ result = fix_compatibility(file.getvalue().decode(encoding))
+ if sys.version_info[0] < 3:
+ result = result.encode(encoding)
+ return result
+
+def summarize(elem):
+ return elem.tag
+
+def summarize_list(seq):
+ return list(map(summarize, seq))
+
+def normalize_crlf(tree):
+ for elem in tree.getiterator():
+ if elem.text: elem.text = elem.text.replace("\r\n", "\n")
+ if elem.tail: elem.tail = elem.tail.replace("\r\n", "\n")
+
+SAMPLE_XML = ElementTree.XML("""
+<body>
+ <tag class='a'>text</tag>
+ <tag class='b' />
+ <section>
+ <tag class='b' id='inner'>subtext</tag>
+ </section>
+</body>
+""")
+
+#
+# interface tests
+
+def check_string(string):
+ len(string)
+ for char in string:
+ if len(char) != 1:
+ print("expected one-character string, got %r" % char)
+ new_string = string + ""
+ new_string = string + " "
+ string[:0]
+
+def check_string_or_none(value):
+ if value is None:
+ return
+ return check_string(value)
+
+def check_mapping(mapping):
+ len(mapping)
+ keys = mapping.keys()
+ items = mapping.items()
+ for key in keys:
+ item = mapping[key]
+ mapping["key"] = "value"
+ if mapping["key"] != "value":
+ print("expected value string, got %r" % mapping["key"])
+
+def check_element(element):
+ if not hasattr(element, "tag"):
+ print("no tag member")
+ if not hasattr(element, "attrib"):
+ print("no attrib member")
+ if not hasattr(element, "text"):
+ print("no text member")
+ if not hasattr(element, "tail"):
+ print("no tail member")
+ check_string(element.tag)
+ check_mapping(element.attrib)
+ check_string_or_none(element.text)
+ check_string_or_none(element.tail)
+ for elem in element:
+ check_element(elem)
+
+def check_element_tree(tree):
+ check_element(tree.getroot())
+
+# --------------------------------------------------------------------
+# element tree tests
+
+def sanity():
+ """
+ >>> from elementtree.ElementTree import *
+ >>> from elementtree.ElementInclude import *
+ >>> from elementtree.ElementPath import *
+ >>> from elementtree.HTMLTreeBuilder import *
+ >>> from elementtree.SimpleXMLWriter import *
+ >>> from elementtree.TidyTools import *
+ """
+
+# doesn't work with lxml.etree
+del sanity
+
+def version():
+ """
+ >>> ElementTree.VERSION
+ '1.3a2'
+ """
+
+# doesn't work with lxml.etree
+del version
+
+def interface():
+ """
+ Test element tree interface.
+
+ >>> element = ElementTree.Element("tag")
+ >>> check_element(element)
+ >>> tree = ElementTree.ElementTree(element)
+ >>> check_element_tree(tree)
+ """
+
+def simpleops():
+ """
+ >>> elem = ElementTree.XML("<body><tag/></body>")
+ >>> serialize(elem)
+ '<body><tag/></body>'
+ >>> e = ElementTree.Element("tag2")
+ >>> elem.append(e)
+ >>> serialize(elem)
+ '<body><tag/><tag2/></body>'
+ >>> elem.remove(e)
+ >>> serialize(elem)
+ '<body><tag/></body>'
+ >>> elem.insert(0, e)
+ >>> serialize(elem)
+ '<body><tag2/><tag/></body>'
+ >>> elem.remove(e)
+ >>> elem.extend([e])
+ >>> serialize(elem)
+ '<body><tag/><tag2/></body>'
+ >>> elem.remove(e)
+ """
+
+def simplefind():
+ """
+ Test find methods using the elementpath fallback.
+
+ >>> CurrentElementPath = ElementTree.ElementPath
+ >>> ElementTree.ElementPath = ElementTree._SimpleElementPath()
+ >>> elem = SAMPLE_XML
+ >>> elem.find("tag").tag
+ 'tag'
+ >>> ElementTree.ElementTree(elem).find("tag").tag
+ 'tag'
+ >>> elem.findtext("tag")
+ 'text'
+ >>> elem.findtext("tog")
+ >>> elem.findtext("tog", "default")
+ 'default'
+ >>> ElementTree.ElementTree(elem).findtext("tag")
+ 'text'
+ >>> summarize_list(elem.findall("tag"))
+ ['tag', 'tag']
+ >>> summarize_list(elem.findall(".//tag"))
+ ['tag', 'tag', 'tag']
+
+ Path syntax doesn't work in this case.
+
+ >>> elem.find("section/tag")
+ >>> elem.findtext("section/tag")
+ >>> elem.findall("section/tag")
+ []
+
+ >>> ElementTree.ElementPath = CurrentElementPath
+ """
+
+# doesn't work with lxml.etree
+del simplefind
+
+def find():
+ """
+ Test find methods (including xpath syntax).
+
+ >>> elem = SAMPLE_XML
+ >>> elem.find("tag").tag
+ 'tag'
+ >>> ElementTree.ElementTree(elem).find("tag").tag
+ 'tag'
+ >>> elem.find("section/tag").tag
+ 'tag'
+ >>> ElementTree.ElementTree(elem).find("section/tag").tag
+ 'tag'
+ >>> elem.findtext("tag")
+ 'text'
+ >>> elem.findtext("tog")
+ >>> elem.findtext("tog", "default")
+ 'default'
+ >>> ElementTree.ElementTree(elem).findtext("tag")
+ 'text'
+ >>> elem.findtext("section/tag")
+ 'subtext'
+ >>> ElementTree.ElementTree(elem).findtext("section/tag")
+ 'subtext'
+ >>> summarize_list(elem.findall("tag"))
+ ['tag', 'tag']
+ >>> summarize_list(elem.findall("*"))
+ ['tag', 'tag', 'section']
+ >>> summarize_list(elem.findall(".//tag"))
+ ['tag', 'tag', 'tag']
+ >>> summarize_list(elem.findall("section/tag"))
+ ['tag']
+ >>> summarize_list(elem.findall("section//tag"))
+ ['tag']
+ >>> summarize_list(elem.findall("section/*"))
+ ['tag']
+ >>> summarize_list(elem.findall("section//*"))
+ ['tag']
+ >>> summarize_list(elem.findall("section/.//*"))
+ ['tag']
+ >>> summarize_list(elem.findall("*/*"))
+ ['tag']
+ >>> summarize_list(elem.findall("*//*"))
+ ['tag']
+ >>> summarize_list(elem.findall("*/tag"))
+ ['tag']
+ >>> summarize_list(elem.findall("*/./tag"))
+ ['tag']
+ >>> summarize_list(elem.findall("./tag"))
+ ['tag', 'tag']
+ >>> summarize_list(elem.findall(".//tag"))
+ ['tag', 'tag', 'tag']
+ >>> summarize_list(elem.findall("././tag"))
+ ['tag', 'tag']
+ >>> summarize_list(elem.findall(".//tag[@class]"))
+ ['tag', 'tag', 'tag']
+ >>> summarize_list(elem.findall(".//tag[@class='a']"))
+ ['tag']
+ >>> summarize_list(elem.findall(".//tag[@class='b']"))
+ ['tag', 'tag']
+ >>> summarize_list(elem.findall(".//tag[@id]"))
+ ['tag']
+ >>> summarize_list(elem.findall(".//section[tag]"))
+ ['section']
+ >>> summarize_list(elem.findall(".//section[element]"))
+ []
+ >>> summarize_list(elem.findall("../tag"))
+ []
+ >>> summarize_list(elem.findall("section/../tag"))
+ ['tag', 'tag']
+ >>> summarize_list(ElementTree.ElementTree(elem).findall("./tag"))
+ ['tag', 'tag']
+
+ FIXME: ET's Path module handles this case incorrectly; this gives
+ a warning in 1.3, and the behaviour will be modified in 1.4.
+
+ >>> summarize_list(ElementTree.ElementTree(elem).findall("/tag"))
+ ['tag', 'tag']
+ """
+
+def bad_find():
+ """
+ Check bad or unsupported path expressions.
+
+ >>> elem = SAMPLE_XML
+ >>> elem.findall("/tag")
+ Traceback (most recent call last):
+ SyntaxError: cannot use absolute path on element
+
+ # this is supported in ET 1.3:
+ #>>> elem.findall("section//")
+ #Traceback (most recent call last):
+ #SyntaxError: invalid path
+ """
+
+def parsefile():
+ """
+ Test parsing from file.
+
+ >>> tree = ElementTree.parse("samples/simple.xml")
+ >>> normalize_crlf(tree)
+ >>> tree.write(stdout())
+ <root>
+ <element key="value">text</element>
+ <element>text</element>tail
+ <empty-element/>
+ </root>
+ >>> tree = ElementTree.parse("samples/simple-ns.xml")
+ >>> normalize_crlf(tree)
+ >>> tree.write(stdout())
+ <root xmlns="http://namespace/">
+ <element key="value">text</element>
+ <element>text</element>tail
+ <empty-element/>
+ </root>
+
+## <ns0:root xmlns:ns0="http://namespace/">
+## <ns0:element key="value">text</ns0:element>
+## <ns0:element>text</ns0:element>tail
+## <ns0:empty-element/>
+## </ns0:root>
+ """
+
+def parsehtml():
+ """
+ Test HTML parsing.
+
+ >>> # p = HTMLTreeBuilder.TreeBuilder()
+ >>> p = ElementTree.HTMLParser()
+ >>> p.feed("<p><p>spam<b>egg</b></p>")
+ >>> serialize(p.close())
+ '<p>spam<b>egg</b></p>'
+ """
+
+# doesn't work with lxml.etree
+del parsehtml
+
+def parseliteral():
+ r"""
+ >>> element = ElementTree.XML("<html><body>text</body></html>")
+ >>> ElementTree.ElementTree(element).write(stdout())
+ <html><body>text</body></html>
+ >>> element = ElementTree.fromstring("<html><body>text</body></html>")
+ >>> ElementTree.ElementTree(element).write(stdout())
+ <html><body>text</body></html>
+
+## >>> sequence = ["<html><body>", "text</bo", "dy></html>"]
+## >>> element = ElementTree.fromstringlist(sequence)
+## >>> ElementTree.ElementTree(element).write(stdout())
+## <html><body>text</body></html>
+
+ >>> print(repr(ElementTree.tostring(element)).lstrip('b'))
+ '<html><body>text</body></html>'
+
+# looks different in lxml
+# >>> print(ElementTree.tostring(element, "ascii"))
+# <?xml version='1.0' encoding='ascii'?>
+# <html><body>text</body></html>
+
+ >>> _, ids = ElementTree.XMLID("<html><body>text</body></html>")
+ >>> len(ids)
+ 0
+ >>> _, ids = ElementTree.XMLID("<html><body id='body'>text</body></html>")
+ >>> len(ids)
+ 1
+ >>> ids["body"].tag
+ 'body'
+ """
+
+def simpleparsefile():
+ """
+ Test the xmllib-based parser.
+
+ >>> from elementtree import SimpleXMLTreeBuilder
+ >>> parser = SimpleXMLTreeBuilder.TreeBuilder()
+ >>> tree = ElementTree.parse("samples/simple.xml", parser)
+ >>> normalize_crlf(tree)
+ >>> tree.write(sys.stdout)
+ <root>
+ <element key="value">text</element>
+ <element>text</element>tail
+ <empty-element />
+ </root>
+ """
+
+# doesn't work with lxml.etree
+del simpleparsefile
+
+def iterparse():
+ """
+ Test iterparse interface.
+
+ >>> iterparse = ElementTree.iterparse
+
+ >>> context = iterparse("samples/simple.xml")
+ >>> for action, elem in context:
+ ... print("%s %s" % (action, elem.tag))
+ end element
+ end element
+ end empty-element
+ end root
+ >>> context.root.tag
+ 'root'
+
+ >>> context = iterparse("samples/simple-ns.xml")
+ >>> for action, elem in context:
+ ... print("%s %s" % (action, elem.tag))
+ end {http://namespace/}element
+ end {http://namespace/}element
+ end {http://namespace/}empty-element
+ end {http://namespace/}root
+
+ >>> events = ()
+ >>> context = iterparse("samples/simple.xml", events)
+ >>> for action, elem in context:
+ ... print("%s %s" % (action, elem.tag))
+
+ >>> events = ()
+ >>> context = iterparse("samples/simple.xml", events=events)
+ >>> for action, elem in context:
+ ... print("%s %s" % (action, elem.tag))
+
+ >>> events = ("start", "end")
+ >>> context = iterparse("samples/simple.xml", events)
+ >>> for action, elem in context:
+ ... print("%s %s" % (action, elem.tag))
+ start root
+ start element
+ end element
+ start element
+ end element
+ start empty-element
+ end empty-element
+ end root
+
+ >>> events = ("start", "end", "start-ns", "end-ns")
+ >>> context = iterparse("samples/simple-ns.xml", events)
+ >>> for action, elem in context:
+ ... if action in ("start", "end"):
+ ... print("%s %s" % (action, elem.tag))
+ ... else:
+ ... print("%s %s" % (action, elem))
+ start-ns ('', 'http://namespace/')
+ start {http://namespace/}root
+ start {http://namespace/}element
+ end {http://namespace/}element
+ start {http://namespace/}element
+ end {http://namespace/}element
+ start {http://namespace/}empty-element
+ end {http://namespace/}empty-element
+ end {http://namespace/}root
+ end-ns None
+
+ """
+
+def fancyparsefile():
+ """
+ Test the "fancy" parser.
+
+ Sanity check.
+ >>> from elementtree import XMLTreeBuilder
+ >>> parser = XMLTreeBuilder.FancyTreeBuilder()
+ >>> tree = ElementTree.parse("samples/simple.xml", parser)
+ >>> normalize_crlf(tree)
+ >>> tree.write(sys.stdout)
+ <root>
+ <element key="value">text</element>
+ <element>text</element>tail
+ <empty-element />
+ </root>
+
+ Callback check.
+ >>> class MyFancyParser(XMLTreeBuilder.FancyTreeBuilder):
+ ... def start(self, elem):
+ ... print("START %s" % elem.tag)
+ ... def end(self, elem):
+ ... print("END %s" % elem.tag)
+ >>> parser = MyFancyParser()
+ >>> tree = ElementTree.parse("samples/simple.xml", parser)
+ START root
+ START element
+ END element
+ START element
+ END element
+ START empty-element
+ END empty-element
+ END root
+ """
+
+# doesn't work with lxml.etree
+del fancyparsefile
+
+def writefile():
+ """
+ >>> elem = ElementTree.Element("tag")
+ >>> elem.text = "text"
+ >>> serialize(elem)
+ '<tag>text</tag>'
+ >>> ElementTree.SubElement(elem, "subtag").text = "subtext"
+ >>> serialize(elem)
+ '<tag>text<subtag>subtext</subtag></tag>'
+
+## Test tag suppression
+## >>> elem.tag = None
+## >>> serialize(elem)
+## 'text<subtag>subtext</subtag>'
+ """
+
+def writestring():
+ """
+ >>> elem = ElementTree.XML("<html><body>text</body></html>")
+ >>> print(repr(ElementTree.tostring(elem)).lstrip('b'))
+ '<html><body>text</body></html>'
+ >>> elem = ElementTree.fromstring("<html><body>text</body></html>")
+ >>> print(repr(ElementTree.tostring(elem)).lstrip('b'))
+ '<html><body>text</body></html>'
+ """
+
+def encoding():
+ r"""
+ Test encoding issues.
+
+ >>> elem = ElementTree.Element("tag")
+ >>> elem.text = u'abc'
+ >>> serialize(elem)
+ '<tag>abc</tag>'
+ >>> serialize(elem, encoding="utf-8")
+ '<tag>abc</tag>'
+ >>> serialize(elem, encoding="us-ascii")
+ '<tag>abc</tag>'
+ >>> serialize(elem, encoding="iso-8859-1").lower()
+ "<?xml version='1.0' encoding='iso-8859-1'?>\n<tag>abc</tag>"
+
+ >>> elem.text = "<&\"\'>"
+ >>> serialize(elem)
+ '<tag>&lt;&amp;"\'&gt;</tag>'
+ >>> serialize(elem, encoding="utf-8")
+ '<tag>&lt;&amp;"\'&gt;</tag>'
+ >>> serialize(elem, encoding="us-ascii") # cdata characters
+ '<tag>&lt;&amp;"\'&gt;</tag>'
+ >>> serialize(elem, encoding="iso-8859-1").lower()
+ '<?xml version=\'1.0\' encoding=\'iso-8859-1\'?>\n<tag>&lt;&amp;"\'&gt;</tag>'
+
+ >>> elem.attrib["key"] = "<&\"\'>"
+ >>> elem.text = None
+ >>> serialize(elem)
+ '<tag key="&lt;&amp;&quot;\'&gt;"/>'
+ >>> serialize(elem, encoding="utf-8")
+ '<tag key="&lt;&amp;&quot;\'&gt;"/>'
+ >>> serialize(elem, encoding="us-ascii")
+ '<tag key="&lt;&amp;&quot;\'&gt;"/>'
+ >>> serialize(elem, encoding="iso-8859-1").lower()
+ '<?xml version=\'1.0\' encoding=\'iso-8859-1\'?>\n<tag key="&lt;&amp;&quot;\'&gt;"/>'
+
+ >>> elem.text = u'\xe5\xf6\xf6<>'
+ >>> elem.attrib.clear()
+ >>> serialize(elem)
+ '<tag>&#229;&#246;&#246;&lt;&gt;</tag>'
+ >>> serialize(elem, encoding="utf-8")
+ '<tag>\xc3\xa5\xc3\xb6\xc3\xb6&lt;&gt;</tag>'
+ >>> serialize(elem, encoding="us-ascii")
+ '<tag>&#229;&#246;&#246;&lt;&gt;</tag>'
+ >>> serialize(elem, encoding="iso-8859-1").lower()
+ "<?xml version='1.0' encoding='iso-8859-1'?>\n<tag>\xe5\xf6\xf6&lt;&gt;</tag>"
+
+ >>> elem.attrib["key"] = u'\xe5\xf6\xf6<>'
+ >>> elem.text = None
+ >>> serialize(elem)
+ '<tag key="&#229;&#246;&#246;&lt;&gt;"/>'
+ >>> serialize(elem, encoding="utf-8")
+ '<tag key="\xc3\xa5\xc3\xb6\xc3\xb6&lt;&gt;"/>'
+ >>> serialize(elem, encoding="us-ascii")
+ '<tag key="&#229;&#246;&#246;&lt;&gt;"/>'
+ >>> serialize(elem, encoding="iso-8859-1").lower()
+ '<?xml version=\'1.0\' encoding=\'iso-8859-1\'?>\n<tag key="\xe5\xf6\xf6&lt;&gt;"/>'
+ """
+
+if sys.version_info[0] >= 3:
+ encoding.__doc__ = encoding.__doc__.replace("u'", "'")
+
+def methods():
+ r"""
+ Test serialization methods.
+
+ >>> e = ET.XML("<html><link/><script>1 &lt; 2</script></html>")
+ >>> e.tail = "\n"
+ >>> serialize(e)
+ '<html><link /><script>1 &lt; 2</script></html>\n'
+ >>> serialize(e, method=None)
+ '<html><link /><script>1 &lt; 2</script></html>\n'
+ >>> serialize(e, method="xml")
+ '<html><link /><script>1 &lt; 2</script></html>\n'
+ >>> serialize(e, method="html")
+ '<html><link><script>1 < 2</script></html>\n'
+ >>> serialize(e, method="text")
+ '1 < 2\n'
+
+ """
+
+# doesn't work with lxml.etree
+del methods
+
+def iterators():
+ """
+ Test iterators.
+
+ >>> e = ET.XML("<html><body>this is a <i>paragraph</i>.</body>..</html>")
+ >>> summarize_list(e.iter())
+ ['html', 'body', 'i']
+ >>> summarize_list(e.find("body").iter())
+ ['body', 'i']
+ >>> "".join(e.itertext())
+ 'this is a paragraph...'
+ >>> "".join(e.find("body").itertext())
+ 'this is a paragraph.'
+ """
+
+ENTITY_XML = """\
+<!DOCTYPE points [
+<!ENTITY % user-entities SYSTEM 'user-entities.xml'>
+%user-entities;
+]>
+<document>&entity;</document>
+"""
+
+def entity():
+ """
+ Test entity handling.
+
+ 1) bad entities
+
+ >>> ElementTree.XML("<document>&entity;</document>")
+ Traceback (most recent call last):
+ ExpatError: undefined entity: line 1, column 10
+
+ >>> ElementTree.XML(ENTITY_XML)
+ Traceback (most recent call last):
+ ExpatError: undefined entity &entity;: line 5, column 10
+
+ (add more tests here)
+
+ """
+
+# doesn't work with lxml.etree
+del entity
+
+def error(xml):
+ """
+ Test error handling.
+
+ >>> error("foo").position
+ (1, 0)
+ >>> error("<tag>&foo;</tag>").position
+ (1, 5)
+ >>> error("foobar<").position
+ (1, 6)
+
+ """
+ try:
+ ET.XML(xml)
+ except ET.ParseError:
+ return sys.exc_value
+
+# doesn't work with lxml.etree -> different positions
+del error
+
+def namespace():
+ """
+ Test namespace issues.
+
+ 1) xml namespace
+
+ >>> elem = ElementTree.XML("<tag xml:lang='en' />")
+ >>> serialize(elem) # 1.1
+ '<tag xml:lang="en"/>'
+
+ 2) other "well-known" namespaces
+
+ >>> elem = ElementTree.XML("<rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#' />")
+ >>> serialize(elem) # 2.1
+ '<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"/>'
+
+ >>> elem = ElementTree.XML("<html:html xmlns:html='http://www.w3.org/1999/xhtml' />")
+ >>> serialize(elem) # 2.2
+ '<html:html xmlns:html="http://www.w3.org/1999/xhtml"/>'
+
+ >>> elem = ElementTree.XML("<soap:Envelope xmlns:soap='http://schemas.xmlsoap.org/soap/envelope' />")
+ >>> serialize(elem) # 2.3
+ '<soap:Envelope xmlns:soap="http://schemas.xmlsoap.org/soap/envelope"/>'
+
+ 3) unknown namespaces
+
+ """
+
+def qname():
+ """
+ Test QName handling.
+
+ 1) decorated tags
+
+ >>> elem = ElementTree.Element("{uri}tag")
+ >>> serialize(elem) # 1.1
+ '<ns0:tag xmlns:ns0="uri"/>'
+ >>> elem = ElementTree.Element(ElementTree.QName("{uri}tag"))
+ >>> serialize(elem) # 1.2
+ '<ns0:tag xmlns:ns0="uri"/>'
+ >>> elem = ElementTree.Element(ElementTree.QName("uri", "tag"))
+ >>> serialize(elem) # 1.3
+ '<ns0:tag xmlns:ns0="uri"/>'
+
+# ns/attribute order ...
+
+## 2) decorated attributes
+
+## >>> elem.clear()
+## >>> elem.attrib["{uri}key"] = "value"
+## >>> serialize(elem) # 2.1
+## '<ns0:tag ns0:key="value" xmlns:ns0="uri"/>'
+
+## >>> elem.clear()
+## >>> elem.attrib[ElementTree.QName("{uri}key")] = "value"
+## >>> serialize(elem) # 2.2
+## '<ns0:tag ns0:key="value" xmlns:ns0="uri"/>'
+
+## 3) decorated values are not converted by default, but the
+## QName wrapper can be used for values
+
+## >>> elem.clear()
+## >>> elem.attrib["{uri}key"] = "{uri}value"
+## >>> serialize(elem) # 3.1
+## '<ns0:tag ns0:key="{uri}value" xmlns:ns0="uri"/>'
+
+## >>> elem.clear()
+## >>> elem.attrib["{uri}key"] = ElementTree.QName("{uri}value")
+## >>> serialize(elem) # 3.2
+## '<ns0:tag ns0:key="ns0:value" xmlns:ns0="uri"/>'
+
+## >>> elem.clear()
+## >>> subelem = ElementTree.Element("tag")
+## >>> subelem.attrib["{uri1}key"] = ElementTree.QName("{uri2}value")
+## >>> elem.append(subelem)
+## >>> elem.append(subelem)
+## >>> serialize(elem) # 3.3
+## '<ns0:tag xmlns:ns0="uri"><tag ns1:key="ns2:value" xmlns:ns1="uri1" xmlns:ns2="uri2"/><tag ns1:key="ns2:value" xmlns:ns1="uri1" xmlns:ns2="uri2"/></ns0:tag>'
+
+ """
+
+def xpath_tokenizer(p):
+ """
+ Test the XPath tokenizer.
+
+ >>> # tests from the xml specification
+ >>> xpath_tokenizer("*")
+ ['*']
+ >>> xpath_tokenizer("text()")
+ ['text', '()']
+ >>> xpath_tokenizer("@name")
+ ['@', 'name']
+ >>> xpath_tokenizer("@*")
+ ['@', '*']
+ >>> xpath_tokenizer("para[1]")
+ ['para', '[', '1', ']']
+ >>> xpath_tokenizer("para[last()]")
+ ['para', '[', 'last', '()', ']']
+ >>> xpath_tokenizer("*/para")
+ ['*', '/', 'para']
+ >>> xpath_tokenizer("/doc/chapter[5]/section[2]")
+ ['/', 'doc', '/', 'chapter', '[', '5', ']', '/', 'section', '[', '2', ']']
+ >>> xpath_tokenizer("chapter//para")
+ ['chapter', '//', 'para']
+ >>> xpath_tokenizer("//para")
+ ['//', 'para']
+ >>> xpath_tokenizer("//olist/item")
+ ['//', 'olist', '/', 'item']
+ >>> xpath_tokenizer(".")
+ ['.']
+ >>> xpath_tokenizer(".//para")
+ ['.', '//', 'para']
+ >>> xpath_tokenizer("..")
+ ['..']
+ >>> xpath_tokenizer("../@lang")
+ ['..', '/', '@', 'lang']
+ >>> xpath_tokenizer("chapter[title]")
+ ['chapter', '[', 'title', ']']
+ >>> xpath_tokenizer("employee[@secretary and @assistant]")
+ ['employee', '[', '@', 'secretary', '', 'and', '', '@', 'assistant', ']']
+
+ >>> # additional tests
+ >>> xpath_tokenizer("{http://spam}egg")
+ ['{http://spam}egg']
+ >>> xpath_tokenizer("./spam.egg")
+ ['.', '/', 'spam.egg']
+ >>> xpath_tokenizer(".//{http://spam}egg")
+ ['.', '//', '{http://spam}egg']
+ """
+ out = []
+ for op, tag in ElementPath.xpath_tokenizer(p):
+ out.append(op or tag)
+ return out
+
+#
+# xinclude tests (samples from appendix C of the xinclude specification)
+
+XINCLUDE = {
+ "C1.xml": """\
+<?xml version='1.0'?>
+<document xmlns:xi="http://www.w3.org/2001/XInclude">
+ <p>120 Mz is adequate for an average home user.</p>
+ <xi:include href="disclaimer.xml"/>
+</document>
+""", "disclaimer.xml": """\
+<?xml version='1.0'?>
+<disclaimer>
+ <p>The opinions represented herein represent those of the individual
+ and should not be interpreted as official policy endorsed by this
+ organization.</p>
+</disclaimer>
+""",
+ "C2.xml": """\
+<?xml version='1.0'?>
+<document xmlns:xi="http://www.w3.org/2001/XInclude">
+ <p>This document has been accessed
+ <xi:include href="count.txt" parse="text"/> times.</p>
+</document>
+""", "count.txt": "324387", "C3.xml": """\
+<?xml version='1.0'?>
+<document xmlns:xi="http://www.w3.org/2001/XInclude">
+ <p>The following is the source of the "data.xml" resource:</p>
+ <example><xi:include href="data.xml" parse="text"/></example>
+</document>
+""", "data.xml": """\
+<?xml version='1.0'?>
+<data>
+ <item><![CDATA[Brooks & Shields]]></item>
+</data>
+""",
+ "C5.xml": """\
+<?xml version='1.0'?>
+<div xmlns:xi="http://www.w3.org/2001/XInclude">
+ <xi:include href="example.txt" parse="text">
+ <xi:fallback>
+ <xi:include href="fallback-example.txt" parse="text">
+ <xi:fallback><a href="mailto:bob@example.org">Report error</a></xi:fallback>
+ </xi:include>
+ </xi:fallback>
+ </xi:include>
+</div>
+""",
+ "default.xml": """\
+<?xml version='1.0'?>
+<document xmlns:xi="http://www.w3.org/2001/XInclude">
+ <p>Example.</p>
+ <xi:include href="samples/simple.xml"/>
+</document>
+"""}
+
+
+def xinclude_loader(href, parse="xml", encoding=None):
+ try:
+ data = XINCLUDE[href]
+ except KeyError:
+ raise IOError("resource not found")
+ if parse == "xml":
+ return ElementTree.XML(data)
+ return data
+
+def xinclude():
+ r"""
+ Basic inclusion example (XInclude C.1)
+
+ >>> document = xinclude_loader("C1.xml")
+ >>> ElementInclude.include(document, xinclude_loader)
+ >>> print(serialize(document)) # C1
+ <document>
+ <p>120 Mz is adequate for an average home user.</p>
+ <disclaimer>
+ <p>The opinions represented herein represent those of the individual
+ and should not be interpreted as official policy endorsed by this
+ organization.</p>
+ </disclaimer>
+ </document>
+
+ Textual inclusion example (XInclude C.2)
+
+ >>> document = xinclude_loader("C2.xml")
+ >>> ElementInclude.include(document, xinclude_loader)
+ >>> print(serialize(document)) # C2
+ <document>
+ <p>This document has been accessed
+ 324387 times.</p>
+ </document>
+
+ Textual inclusion of XML example (XInclude C.3)
+
+ >>> document = xinclude_loader("C3.xml")
+ >>> ElementInclude.include(document, xinclude_loader)
+ >>> print(serialize(document)) # C3
+ <document>
+ <p>The following is the source of the "data.xml" resource:</p>
+ <example>&lt;?xml version='1.0'?&gt;
+ &lt;data&gt;
+ &lt;item&gt;&lt;![CDATA[Brooks &amp; Shields]]&gt;&lt;/item&gt;
+ &lt;/data&gt;
+ </example>
+ </document>
+
+## Fallback example (XInclude C.5)
+## Note! Fallback support is not yet implemented
+
+## >>> document = xinclude_loader("C5.xml")
+## >>> ElementInclude.include(document, xinclude_loader)
+## Traceback (most recent call last):
+## IOError: resource not found
+## >>> # print(serialize(document)) # C5
+
+ """
+
+def xinclude_default():
+ """
+ >>> document = xinclude_loader("default.xml")
+ >>> ElementInclude.include(document)
+ >>> print(serialize(document)) # default
+ <document>
+ <p>Example.</p>
+ <root>
+ <element key="value">text</element>
+ <element>text</element>tail
+ <empty-element/>
+ </root>
+ </document>
+ """
+
+#
+# xmlwriter
+
+def xmlwriter():
+ r"""
+ >>> file = BytesIO()
+ >>> w = SimpleXMLWriter.XMLWriter(file)
+ >>> html = w.start("html")
+ >>> x = w.start("head")
+ >>> w.element("title", "my document")
+ >>> w.data("\n")
+ >>> w.element("meta", name="hello", value="goodbye")
+ >>> w.data("\n")
+ >>> w.end()
+ >>> x = w.start("body")
+ >>> w.element("h1", "this is a heading")
+ >>> w.data("\n")
+ >>> w.element("p", u"this is a paragraph")
+ >>> w.data("\n")
+ >>> w.element("p", u"reserved characters: <&>")
+ >>> w.data("\n")
+ >>> w.element("p", u"detta är också ett stycke")
+ >>> w.data("\n")
+ >>> w.close(html)
+ >>> print(file.getvalue())
+ <html><head><title>my document</title>
+ <meta name="hello" value="goodbye" />
+ </head><body><h1>this is a heading</h1>
+ <p>this is a paragraph</p>
+ <p>reserved characters: &lt;&amp;&gt;</p>
+ <p>detta &#228;r ocks&#229; ett stycke</p>
+ </body></html>
+ """
+
+# doesn't work with lxml.etree
+del xmlwriter
+
+# --------------------------------------------------------------------
+# reported bugs
+
+def bug_xmltoolkit21():
+ """
+ marshaller gives obscure errors for non-string values
+
+ >>> elem = ElementTree.Element(123)
+ >>> serialize(elem) # tag
+ Traceback (most recent call last):
+ TypeError: cannot serialize 123 (type int)
+ >>> elem = ElementTree.Element("elem")
+ >>> elem.text = 123
+ >>> serialize(elem) # text
+ Traceback (most recent call last):
+ TypeError: cannot serialize 123 (type int)
+ >>> elem = ElementTree.Element("elem")
+ >>> elem.tail = 123
+ >>> serialize(elem) # tail
+ Traceback (most recent call last):
+ TypeError: cannot serialize 123 (type int)
+ >>> elem = ElementTree.Element("elem")
+ >>> elem.set(123, "123")
+ >>> serialize(elem) # attribute key
+ Traceback (most recent call last):
+ TypeError: cannot serialize 123 (type int)
+ >>> elem = ElementTree.Element("elem")
+ >>> elem.set("123", 123)
+ >>> serialize(elem) # attribute value
+ Traceback (most recent call last):
+ TypeError: cannot serialize 123 (type int)
+
+ """
+
+# doesn't work with lxml.etree
+del bug_xmltoolkit21
+
+def bug_xmltoolkit25():
+ """
+ typo in ElementTree.findtext
+
+ >>> tree = ElementTree.ElementTree(SAMPLE_XML)
+ >>> tree.findtext("tag")
+ 'text'
+ >>> tree.findtext("section/tag")
+ 'subtext'
+ """
+
+def bug_xmltoolkit28():
+ """
+ .//tag causes exceptions
+
+ >>> tree = ElementTree.XML("<doc><table><tbody/></table></doc>")
+ >>> summarize_list(tree.findall(".//thead"))
+ []
+ >>> summarize_list(tree.findall(".//tbody"))
+ ['tbody']
+ """
+
+def bug_xmltoolkitX1():
+ """
+ dump() doesn't flush the output buffer
+
+ >>> tree = ElementTree.XML("<doc><table><tbody/></table></doc>")
+ >>> ElementTree.dump(tree); sys.stdout.write("tail")
+ <doc><table><tbody /></table></doc>
+ tail
+ """
+
+# doesn't work with lxml.etree
+del bug_xmltoolkitX1
+
+def bug_xmltoolkit39():
+ """
+    non-ascii element and attribute names don't work
+
+ >>> tree = ElementTree.XML("<?xml version='1.0' encoding='iso-8859-1'?><täg />")
+ >>> ElementTree.tostring(tree, "utf-8")
+ '<t\\xc3\\xa4g />'
+
+ >>> tree = ElementTree.XML("<?xml version='1.0' encoding='iso-8859-1'?><tag ättr='v&#228;lue' />")
+ >>> tree.attrib
+ {u'\\xe4ttr': u'v\\xe4lue'}
+ >>> ElementTree.tostring(tree, "utf-8")
+ '<tag \\xc3\\xa4ttr="v\\xc3\\xa4lue" />'
+
+ >>> tree = ElementTree.XML("<?xml version='1.0' encoding='iso-8859-1'?><täg>text</täg>")
+ >>> ElementTree.tostring(tree, "utf-8")
+ '<t\\xc3\\xa4g>text</t\\xc3\\xa4g>'
+
+ >>> tree = ElementTree.Element(u"täg")
+ >>> ElementTree.tostring(tree, "utf-8")
+ '<t\\xc3\\xa4g />'
+
+ >>> tree = ElementTree.Element("tag")
+ >>> tree.set(u"ättr", u"välue")
+ >>> ElementTree.tostring(tree, "utf-8")
+ '<tag \\xc3\\xa4ttr="v\\xc3\\xa4lue" />'
+
+ """
+
+# doesn't work with lxml.etree
+del bug_xmltoolkit39
+
+def bug_xmltoolkit45():
+ """
+ problems parsing mixed unicode/non-ascii html documents
+
+ latin-1 text
+ >>> p = HTMLTreeBuilder.TreeBuilder()
+ >>> p.feed("<p>välue</p>")
+ >>> serialize(p.close())
+ '<p>v&#228;lue</p>'
+
+ utf-8 text
+ >>> p = HTMLTreeBuilder.TreeBuilder(encoding="utf-8")
+ >>> p.feed("<p>v\xc3\xa4lue</p>")
+ >>> serialize(p.close())
+ '<p>v&#228;lue</p>'
+
+ utf-8 text using meta tag
+ >>> p = HTMLTreeBuilder.TreeBuilder()
+ >>> p.feed("<html><meta http-equiv='Content-Type' content='text/html; charset=utf-8'><p>v\xc3\xa4lue</p></html>")
+ >>> serialize(p.close().find("p"))
+ '<p>v&#228;lue</p>'
+
+ latin-1 character references
+ >>> p = HTMLTreeBuilder.TreeBuilder()
+ >>> p.feed("<p>v&#228;lue</p>")
+ >>> serialize(p.close())
+ '<p>v&#228;lue</p>'
+
+ latin-1 character entities
+ >>> p = HTMLTreeBuilder.TreeBuilder()
+ >>> p.feed("<p>v&auml;lue</p>")
+ >>> serialize(p.close())
+ '<p>v&#228;lue</p>'
+
+ mixed latin-1 text and unicode entities
+ >>> p = HTMLTreeBuilder.TreeBuilder()
+ >>> p.feed("<p>&#8221;välue&#8221;</p>")
+ >>> serialize(p.close())
+ '<p>&#8221;v&#228;lue&#8221;</p>'
+
+ mixed unicode and latin-1 entities
+ >>> p = HTMLTreeBuilder.TreeBuilder()
+ >>> p.feed("<p>&#8221;v&auml;lue&#8221;</p>")
+ >>> serialize(p.close())
+ '<p>&#8221;v&#228;lue&#8221;</p>'
+
+ """
+
+# doesn't work with lxml.etree
+del bug_xmltoolkit45
+
+def bug_xmltoolkit46():
+ """
+ problems parsing open BR tags
+
+ >>> p = HTMLTreeBuilder.TreeBuilder()
+ >>> p.feed("<p>key<br>value</p>")
+ >>> serialize(p.close())
+ '<p>key<br />value</p>'
+
+ """
+
+# doesn't work with lxml.etree
+del bug_xmltoolkit46
+
+def bug_xmltoolkit54():
+ """
+ problems handling internally defined entities
+
+ >>> e = ElementTree.XML("<!DOCTYPE doc [<!ENTITY ldots '&#x8230;'>]><doc>&ldots;</doc>")
+ >>> serialize(e)
+ '<doc>&#33328;</doc>'
+ """
+
+# doesn't work with lxml.etree
+del bug_xmltoolkit54
+
+def bug_xmltoolkit55():
+ """
+ make sure we're reporting the first error, not the last
+
+ >>> e = ElementTree.XML("<!DOCTYPE doc SYSTEM 'doc.dtd'><doc>&ldots;&ndots;&rdots;</doc>")
+ Traceback (most recent call last):
+ ParseError: undefined entity &ldots;: line 1, column 36
+ """
+
+# doesn't work with lxml.etree
+del bug_xmltoolkit55
+
+def bug_200708_version():
+ """
+ >>> parser = ET.XMLParser()
+ >>> parser.version
+ 'Expat 2.0.0'
+ >>> parser.feed(open("samples/simple.xml").read())
+ >>> print(serialize(parser.close()))
+ <root>
+ <element key="value">text</element>
+ <element>text</element>tail
+ <empty-element />
+ </root>
+ """
+
+# doesn't work with lxml.etree
+del bug_200708_version
+
+def bug_200708_newline():
+ r"""
+
+ Preserve newlines in attributes.
+
+ >>> e = ET.Element('SomeTag', text="def _f():\n return 3\n")
+ >>> ET.tostring(e)
+ '<SomeTag text="def _f():&#10; return 3&#10;" />'
+ >>> ET.XML(ET.tostring(e)).get("text")
+ 'def _f():\n return 3\n'
+ >>> ET.tostring(ET.XML(ET.tostring(e)))
+ '<SomeTag text="def _f():&#10; return 3&#10;" />'
+ """
+
+# doesn't work with lxml.etree
+del bug_200708_newline
+
+def bug_200709_default_namespace():
+ """
+
+ >>> e = ET.Element("{default}elem")
+ >>> s = ET.SubElement(e, "{default}elem")
+ >>> serialize(e, default_namespace="default") # 1
+ '<elem xmlns="default"><elem /></elem>'
+
+ >>> e = ET.Element("{default}elem")
+ >>> s = ET.SubElement(e, "{default}elem")
+ >>> s = ET.SubElement(e, "{not-default}elem")
+ >>> serialize(e, default_namespace="default") # 2
+ '<elem xmlns="default" xmlns:ns1="not-default"><elem /><ns1:elem /></elem>'
+
+ >>> e = ET.Element("{default}elem")
+ >>> s = ET.SubElement(e, "{default}elem")
+ >>> s = ET.SubElement(e, "elem") # unprefixed name
+ >>> serialize(e, default_namespace="default") # 3
+ Traceback (most recent call last):
+ ValueError: cannot use non-qualified names with default_namespace option
+
+ """
+
+# doesn't work with lxml.etree
+del bug_200709_default_namespace
+
+# --------------------------------------------------------------------
+
+if __name__ == "__main__":
+ import doctest, selftest
+ failed, tested = doctest.testmod(selftest)
+ print("%d tests ok." % (tested - failed))
+ if failed > 0:
+ print("%d tests failed. Exiting with non-zero return code." % failed)
+ sys.exit(1)
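
selftest.py is essentially a doctest container: the module-level functions exist for their docstrings, and the __main__ block above runs them through doctest.testmod. A hedged sketch of pulling the same doctests into a unittest run; the wrapper module below is an assumption, not something this diff adds.

    import doctest
    import unittest

    from lxml.tests import selftest

    def load_tests(loader, tests, ignore):
        # Collect every docstring in selftest.py as a unittest-compatible suite.
        tests.addTests(doctest.DocTestSuite(selftest))
        return tests

    if __name__ == '__main__':
        unittest.main()
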
diff --git a/src/lxml/tests/selftest2.py b/src/lxml/tests/selftest2.py
new file mode 100644
index 0000000..80477af
--- /dev/null
+++ b/src/lxml/tests/selftest2.py
@@ -0,0 +1,452 @@
+# $Id: selftest.py 2213 2005-01-11 18:49:47Z fredrik $
+# elementtree selftest program
+
+# this test script uses Python's "doctest" module to check that the
+# *test script* works as expected.
+
+import sys
+
+try:
+ from StringIO import StringIO
+ BytesIO = StringIO
+except ImportError:
+ from io import BytesIO, StringIO
+
+from lxml import etree as ElementTree
+
+def stdout():
+ if sys.version_info[0] < 3:
+ return sys.stdout
+ class bytes_stdout(object):
+ def write(self, data):
+ if isinstance(data, bytes):
+ data = data.decode('ISO8859-1')
+ sys.stdout.write(data)
+ return bytes_stdout()
+
+def unserialize(text):
+ file = StringIO(text)
+ tree = ElementTree.parse(file)
+ return tree.getroot()
+
+def serialize(elem, encoding=None):
+ file = BytesIO()
+ tree = ElementTree.ElementTree(elem)
+ if encoding:
+ tree.write(file, encoding=encoding)
+ else:
+ tree.write(file)
+ result = file.getvalue()
+ if sys.version_info[0] >= 3:
+ result = result.decode('ISO8859-1')
+ result = result.replace(' />', '/>')
+ if result[-1:] == '\n':
+ result = result[:-1]
+ return result
+
+def summarize(elem):
+ return elem.tag
+
+def summarize_list(seq):
+ return list(map(summarize, seq))
+
+SAMPLE_XML = unserialize("""
+<body>
+ <tag>text</tag>
+ <tag />
+ <section>
+ <tag>subtext</tag>
+ </section>
+</body>
+""")
+
+SAMPLE_XML_NS = unserialize("""
+<body xmlns="http://effbot.org/ns">
+ <tag>text</tag>
+ <tag />
+ <section>
+ <tag>subtext</tag>
+ </section>
+</body>
+""")
+
+# interface tests
+
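+# The check_* helpers below only exercise the element/attrib interfaces; they
+# print a message on failure instead of raising, so passing doctests stay quiet.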
+def check_string(string):
+ len(string)
+ for char in string:
+ if len(char) != 1:
+ print("expected one-character string, got %r" % char)
+ new_string = string + ""
+ new_string = string + " "
+ string[:0]
+
+def check_mapping(mapping):
+ len(mapping)
+ keys = mapping.keys()
+ items = mapping.items()
+ for key in keys:
+ item = mapping[key]
+ mapping["key"] = "value"
+ if mapping["key"] != "value":
+ print("expected value string, got %r" % mapping["key"])
+
+def check_element(element):
+ if not hasattr(element, "tag"):
+ print("no tag member")
+ if not hasattr(element, "attrib"):
+ print("no attrib member")
+ if not hasattr(element, "text"):
+ print("no text member")
+ if not hasattr(element, "tail"):
+ print("no tail member")
+ check_string(element.tag)
+ check_mapping(element.attrib)
+ if element.text is not None:
+ check_string(element.text)
+ if element.tail is not None:
+ check_string(element.tail)
+
+def check_element_tree(tree):
+ check_element(tree.getroot())
+
+def element():
+ """
+ Test element tree interface.
+
+ >>> element = ElementTree.Element("tag")
+ >>> check_element(element)
+ >>> tree = ElementTree.ElementTree(element)
+ >>> check_element_tree(tree)
+ """
+
+def parsefile():
+ """
+ Test parsing from file. Note that we open the files ourselves
+ here; by default, the 'parse' function opens the file in binary
+ mode, and doctest doesn't filter out carriage returns.
+
+ >>> file = open("samples/simple.xml", "rb")
+ >>> tree = ElementTree.parse(file)
+ >>> file.close()
+ >>> tree.write(stdout())
+ <root>
+ <element key="value">text</element>
+ <element>text</element>tail
+ <empty-element/>
+ </root>
+ >>> file = open("samples/simple-ns.xml", "rb")
+ >>> tree = ElementTree.parse(file)
+ >>> file.close()
+ >>> tree.write(stdout())
+ <root xmlns="http://namespace/">
+ <element key="value">text</element>
+ <element>text</element>tail
+ <empty-element/>
+ </root>
+ """
+
+def writefile():
+ """
+ >>> elem = ElementTree.Element("tag")
+ >>> elem.text = "text"
+ >>> serialize(elem)
+ '<tag>text</tag>'
+ >>> ElementTree.SubElement(elem, "subtag").text = "subtext"
+ >>> serialize(elem)
+ '<tag>text<subtag>subtext</subtag></tag>'
+ """
+
+def encoding():
+ r"""
+ Test encoding issues.
+
+ >>> elem = ElementTree.Element("tag")
+ >>> elem.text = u'abc'
+ >>> serialize(elem)
+ '<tag>abc</tag>'
+ >>> serialize(elem, "utf-8")
+ '<tag>abc</tag>'
+ >>> serialize(elem, "us-ascii")
+ '<tag>abc</tag>'
+ >>> serialize(elem, "iso-8859-1").lower()
+ "<?xml version='1.0' encoding='iso-8859-1'?>\n<tag>abc</tag>"
+
+ >>> elem.text = "<&\"\'>"
+ >>> serialize(elem)
+ '<tag>&lt;&amp;"\'&gt;</tag>'
+ >>> serialize(elem, "utf-8")
+ '<tag>&lt;&amp;"\'&gt;</tag>'
+ >>> serialize(elem, "us-ascii") # cdata characters
+ '<tag>&lt;&amp;"\'&gt;</tag>'
+ >>> serialize(elem, "iso-8859-1").lower()
+ '<?xml version=\'1.0\' encoding=\'iso-8859-1\'?>\n<tag>&lt;&amp;"\'&gt;</tag>'
+
+ >>> elem.attrib["key"] = "<&\"\'>"
+ >>> elem.text = None
+ >>> serialize(elem)
+ '<tag key="&lt;&amp;&quot;\'&gt;"/>'
+ >>> serialize(elem, "utf-8")
+ '<tag key="&lt;&amp;&quot;\'&gt;"/>'
+ >>> serialize(elem, "us-ascii")
+ '<tag key="&lt;&amp;&quot;\'&gt;"/>'
+ >>> serialize(elem, "iso-8859-1").lower()
+ '<?xml version=\'1.0\' encoding=\'iso-8859-1\'?>\n<tag key="&lt;&amp;&quot;\'&gt;"/>'
+
+ >>> elem.text = u'\xe5\xf6\xf6<>'
+ >>> elem.attrib.clear()
+ >>> serialize(elem)
+ '<tag>&#229;&#246;&#246;&lt;&gt;</tag>'
+ >>> serialize(elem, "utf-8")
+ '<tag>\xc3\xa5\xc3\xb6\xc3\xb6&lt;&gt;</tag>'
+ >>> serialize(elem, "us-ascii")
+ '<tag>&#229;&#246;&#246;&lt;&gt;</tag>'
+ >>> serialize(elem, "iso-8859-1").lower()
+ "<?xml version='1.0' encoding='iso-8859-1'?>\n<tag>\xe5\xf6\xf6&lt;&gt;</tag>"
+
+ >>> elem.attrib["key"] = u'\xe5\xf6\xf6<>'
+ >>> elem.text = None
+ >>> serialize(elem)
+ '<tag key="&#229;&#246;&#246;&lt;&gt;"/>'
+ >>> serialize(elem, "utf-8")
+ '<tag key="\xc3\xa5\xc3\xb6\xc3\xb6&lt;&gt;"/>'
+ >>> serialize(elem, "us-ascii")
+ '<tag key="&#229;&#246;&#246;&lt;&gt;"/>'
+ >>> serialize(elem, "iso-8859-1").lower()
+ '<?xml version=\'1.0\' encoding=\'iso-8859-1\'?>\n<tag key="\xe5\xf6\xf6&lt;&gt;"/>'
+
+ """
+
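+# On Python 3, text reprs carry no u'' prefix, so drop it from the expected
+# doctest output above.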
+if sys.version_info[0] >= 3:
+ encoding.__doc__ = encoding.__doc__.replace("u'", "'")
+
+def qname():
+ """
+ Test QName handling.
+
+ 1) decorated tags
+
+ >>> elem = ElementTree.Element("{uri}tag")
+ >>> serialize(elem) # 1.1
+ '<ns0:tag xmlns:ns0="uri"/>'
+
+## 2) decorated attributes
+
+## >>> elem.attrib["{uri}key"] = "value"
+## >>> serialize(elem) # 2.1
+## '<ns0:tag ns0:key="value" xmlns:ns0="uri"/>'
+
+ """
+
+def cdata():
+ """
+ Test CDATA handling (etc).
+
+ >>> serialize(unserialize("<tag>hello</tag>"))
+ '<tag>hello</tag>'
+ >>> serialize(unserialize("<tag>&#104;&#101;&#108;&#108;&#111;</tag>"))
+ '<tag>hello</tag>'
+ >>> serialize(unserialize("<tag><![CDATA[hello]]></tag>"))
+ '<tag>hello</tag>'
+
+ """
+
+def find():
+ """
+ Test find methods (including xpath syntax).
+
+ >>> elem = SAMPLE_XML
+ >>> elem.find("tag").tag
+ 'tag'
+ >>> ElementTree.ElementTree(elem).find("tag").tag
+ 'tag'
+ >>> elem.find("section/tag").tag
+ 'tag'
+ >>> ElementTree.ElementTree(elem).find("section/tag").tag
+ 'tag'
+ >>> elem.findtext("tag")
+ 'text'
+ >>> elem.findtext("tog", "default")
+ 'default'
+ >>> ElementTree.ElementTree(elem).findtext("tag")
+ 'text'
+ >>> elem.findtext("section/tag")
+ 'subtext'
+ >>> ElementTree.ElementTree(elem).findtext("section/tag")
+ 'subtext'
+ >>> summarize_list(elem.findall("tag"))
+ ['tag', 'tag']
+ >>> summarize_list(elem.findall("*"))
+ ['tag', 'tag', 'section']
+ >>> summarize_list(elem.findall(".//tag"))
+ ['tag', 'tag', 'tag']
+ >>> summarize_list(elem.findall("section/tag"))
+ ['tag']
+ >>> summarize_list(elem.findall("section//tag"))
+ ['tag']
+ >>> summarize_list(elem.findall("section/*"))
+ ['tag']
+ >>> summarize_list(elem.findall("section//*"))
+ ['tag']
+ >>> summarize_list(elem.findall("section/.//*"))
+ ['tag']
+ >>> summarize_list(elem.findall("*/*"))
+ ['tag']
+ >>> summarize_list(elem.findall("*//*"))
+ ['tag']
+ >>> summarize_list(elem.findall("*/tag"))
+ ['tag']
+ >>> summarize_list(elem.findall("*/./tag"))
+ ['tag']
+ >>> summarize_list(elem.findall("./tag"))
+ ['tag', 'tag']
+ >>> summarize_list(elem.findall(".//tag"))
+ ['tag', 'tag', 'tag']
+ >>> summarize_list(elem.findall("././tag"))
+ ['tag', 'tag']
+ >>> summarize_list(ElementTree.ElementTree(elem).findall("/tag"))
+ ['tag', 'tag']
+ >>> summarize_list(ElementTree.ElementTree(elem).findall("./tag"))
+ ['tag', 'tag']
+ >>> elem = SAMPLE_XML_NS
+ >>> summarize_list(elem.findall("tag"))
+ []
+ >>> summarize_list(elem.findall("{http://effbot.org/ns}tag"))
+ ['{http://effbot.org/ns}tag', '{http://effbot.org/ns}tag']
+ >>> summarize_list(elem.findall(".//{http://effbot.org/ns}tag"))
+ ['{http://effbot.org/ns}tag', '{http://effbot.org/ns}tag', '{http://effbot.org/ns}tag']
+ """
+
+# XXX only deep copying is supported
+
+def copy():
+ """
+ Test copy handling (etc).
+
+ >>> import copy
+ >>> e1 = unserialize("<tag>hello<foo/></tag>")
+ >>> # e2 = copy.copy(e1)
+ >>> e3 = copy.deepcopy(e1)
+ >>> e1.find("foo").tag = "bar"
+
+ >>> serialize(e1).replace(' ', '')
+ '<tag>hello<bar/></tag>'
+
+## >>> serialize(e2).replace(' ', '')
+## '<tag>hello<bar/></tag>'
+
+ >>> serialize(e3).replace(' ', '')
+ '<tag>hello<foo/></tag>'
+
+ """
+
+def attrib():
+ """
+ Test attribute handling.
+
+ >>> elem = ElementTree.Element("tag")
+ >>> elem.get("key") # 1.1
+ >>> elem.get("key", "default") # 1.2
+ 'default'
+ >>> elem.set("key", "value")
+ >>> elem.get("key") # 1.3
+ 'value'
+
+ >>> elem = ElementTree.Element("tag", key="value")
+ >>> elem.get("key") # 2.1
+ 'value'
+ >>> elem.attrib # 2.2
+ {'key': 'value'}
+
+ >>> elem = ElementTree.Element("tag", {"key": "value"})
+ >>> elem.get("key") # 3.1
+ 'value'
+ >>> elem.attrib # 3.2
+ {'key': 'value'}
+
+ >>> elem = ElementTree.Element("tag", {"key": "other"}, key="value")
+ >>> elem.get("key") # 4.1
+ 'value'
+ >>> elem.attrib # 4.2
+ {'key': 'value'}
+
+ """
+
+def makeelement():
+ """
+ Test makeelement handling.
+
+ >>> elem = ElementTree.Element("tag")
+ >>> subelem = elem.makeelement("subtag", {"key": "value"})
+ >>> elem.append(subelem)
+ >>> serialize(elem)
+ '<tag><subtag key="value"/></tag>'
+
+ >>> elem.clear()
+ >>> serialize(elem)
+ '<tag/>'
+ >>> elem.append(subelem)
+ >>> serialize(elem)
+ '<tag><subtag key="value"/></tag>'
+
+ """
+
+## def observer():
+## """
+## Test observers.
+
+## >>> def observer(action, elem):
+## ... print("%s %s" % (action, elem.tag))
+## >>> builder = ElementTree.TreeBuilder()
+## >>> builder.addobserver(observer)
+## >>> parser = ElementTree.XMLParser(builder)
+## >>> file = open("samples/simple.xml", "rb")
+## >>> parser.feed(file.read())
+## start root
+## start element
+## end element
+## start element
+## end element
+## start empty-element
+## end empty-element
+## end root
+## >>> file.close()
+
+## """
+
+ENTITY_XML = """\
+<!DOCTYPE points [
+<!ENTITY % user-entities SYSTEM 'user-entities.xml'>
+%user-entities;
+]>
+<document>&entity;</document>
+"""
+
+## def entity():
+## """
+## Test entity handling.
+
+## 1) bad entities
+
+## >>> ElementTree.XML("<document>&entity;</document>")
+## Traceback (most recent call last):
+## SyntaxError: undefined entity: line 1, column 10
+
+## 2) custom entity
+
+## >>> parser = ElementTree.XMLParser()
+## >>> parser.entity["entity"] = "text"
+## >>> parser.feed(ENTITY_XML)
+## >>> root = parser.close()
+## >>> serialize(root)
+## '<document>text</document>'
+
+## """
+
+if __name__ == "__main__":
+ import doctest, selftest2
+ failed, tested = doctest.testmod(selftest2)
+ print("%d tests ok." % (tested - failed))
+ if failed > 0:
+ print("%d tests failed. Exiting with non-zero return code." % failed)
+ sys.exit(1)
diff --git a/src/lxml/tests/shakespeare.html b/src/lxml/tests/shakespeare.html
new file mode 100644
index 0000000..821ca6a
--- /dev/null
+++ b/src/lxml/tests/shakespeare.html
@@ -0,0 +1,526 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en" debug="true">
+<head>
+ <meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
+</head>
+
+<body>
+
+ <div id="test">
+ <div class="dialog">
+ <h2>As You Like It</h2>
+ <div id="playwright">
+
+ by William Shakespeare
+
+
+ </div>
+ <div class="dialog scene thirdClass" id="scene1">
+
+ <h3>ACT I, SCENE III. A room in the palace.</h3>
+
+ <div class="dialog">
+ <div class="direction">Enter CELIA and ROSALIND</div>
+
+ </div>
+
+ <div id="speech1" class="character">CELIA</div>
+
+ <div class="dialog">
+ <div id="scene1.3.1">Why, cousin! why, Rosalind! Cupid have mercy! not a word?</div>
+
+ </div>
+
+ <div id="speech2" class="character">ROSALIND</div>
+
+ <div class="dialog">
+ <div id="scene1.3.2">Not one to throw at a dog.</div>
+
+ </div>
+
+ <div id="speech3" class="character">CELIA</div>
+
+ <div class="dialog">
+ <div id="scene1.3.3">No, thy words are too precious to be cast away upon</div>
+
+ <div id="scene1.3.4">curs; throw some of them at me; come, lame me with reasons.</div>
+
+ </div>
+
+ <div id="speech4" class="character">ROSALIND</div>
+
+ <div id="speech5" class="character">CELIA</div>
+
+ <div class="dialog">
+ <div id="scene1.3.8">But is all this for your father?</div>
+
+ </div>
+
+ <div class="dialog">
+ <div id="scene1.3.5">Then there were two cousins laid up; when the one</div>
+ <div id="scene1.3.6">should be lamed with reasons and the other mad</div>
+
+ <div id="scene1.3.7">without any.</div>
+ </div>
+
+ <div id="speech6" class="character">ROSALIND</div>
+
+ <div class="dialog">
+ <div id="scene1.3.9">No, some of it is for my child's father. O, how</div>
+
+ <div id="scene1.3.10">full of briers is this working-day world!</div>
+
+ </div>
+
+ <div id="speech7" class="character">CELIA</div>
+
+ <div class="dialog">
+
+ <div id="scene1.3.11">They are but burs, cousin, thrown upon thee in</div>
+ <div id="scene1.3.12">holiday foolery: if we walk not in the trodden</div>
+
+ <div id="scene1.3.13">paths our very petticoats will catch them.</div>
+
+ </div>
+
+ <div id="speech8" class="character">ROSALIND</div>
+
+ <div class="dialog">
+ <div id="scene1.3.14">I could shake them off my coat: these burs are in my heart.</div>
+ </div>
+
+ <div id="speech9" class="character">CELIA</div>
+
+ <div class="dialog">
+ <div id="scene1.3.15">Hem them away.</div>
+
+ </div>
+
+ <div id="speech10" class="character">ROSALIND</div>
+
+ <div class="dialog">
+
+ <div id="scene1.3.16">I would try, if I could cry 'hem' and have him.</div>
+ </div>
+
+ <div id="speech11" class="character">CELIA</div>
+
+ <div class="dialog">
+ <div id="scene1.3.17">Come, come, wrestle with thy affections.</div>
+
+ </div>
+
+ <div id="speech12" class="character">ROSALIND</div>
+ <div class="dialog">
+ <div id="scene1.3.18">O, they take the part of a better wrestler than myself!</div>
+
+ </div>
+
+ <div id="speech13" class="character">CELIA</div>
+
+ <div class="dialog">
+
+ <div id="scene1.3.19">O, a good wish upon you! you will try in time, in</div>
+ <div id="scene1.3.20">despite of a fall. But, turning these jests out of</div>
+ <div id="scene1.3.21">service, let us talk in good earnest: is it</div>
+
+ <div id="scene1.3.22">possible, on such a sudden, you should fall into so</div>
+
+ <div id="scene1.3.23">strong a liking with old Sir Rowland's youngest son?</div>
+
+ </div>
+
+ <div id="speech14" class="character">ROSALIND</div>
+ <div class="dialog">
+ <div id="scene1.3.24">The duke my father loved his father dearly.</div>
+
+ </div>
+
+ <div id="speech15" class="character">CELIA</div>
+
+ <div class="dialog">
+
+ <div id="scene1.3.25">Doth it therefore ensue that you should love his son</div>
+
+ <div id="scene1.3.26">dearly? By this kind of chase, I should hate him,</div>
+
+ <div id="scene1.3.27">for my father hated his father dearly; yet I hate</div>
+
+ <div id="scene1.3.28">not Orlando.</div>
+
+ </div>
+
+ <div id="speech16" class="character">ROSALIND</div>
+
+ <div title="wtf" class="dialog">
+
+ <div id="scene1.3.29">No, faith, hate him not, for my sake.</div>
+
+ </div>
+
+ <div id="speech17" class="character">CELIA</div>
+
+ <div class="dialog">
+ <div id="scene1.3.30">Why should I not? doth he not deserve well?</div>
+
+ </div>
+
+ <div id="speech18" class="character">ROSALIND</div>
+
+ <div class="dialog">
+ <div id="scene1.3.31">Let me love him for that, and do you love him</div>
+ <div id="scene1.3.32">because I do. Look, here comes the duke.</div>
+ </div>
+
+ <div id="speech19" class="character">CELIA</div>
+
+ <div class="dialog">
+
+ <div id="scene1.3.33">With his eyes full of anger.</div>
+ <div class="direction">Enter DUKE FREDERICK, with Lords</div>
+ </div>
+
+ <div id="speech20" class="character">DUKE FREDERICK</div>
+
+ <div class="dialog">
+
+ <div id="scene1.3.34">Mistress, dispatch you with your safest haste</div>
+
+ <div id="scene1.3.35">And get you from our court.</div>
+ </div>
+
+ <div id="speech21" class="character">ROSALIND</div>
+
+ <div class="dialog">
+
+ <div id="scene1.3.36">Me, uncle?</div>
+
+ </div>
+
+ <div id="speech22" class="character">DUKE FREDERICK</div>
+ <div class="dialog">
+ <div id="scene1.3.37">You, cousin</div>
+
+ <div id="scene1.3.38">Within these ten days if that thou be'st found</div>
+
+ <div id="scene1.3.39">So near our public court as twenty miles,</div>
+
+ <div id="scene1.3.40">Thou diest for it.</div>
+
+ </div>
+
+ <div id="speech23" class="character">ROSALIND</div>
+
+ <div class="dialog">
+
+ <div id="scene1.3.41"> I do beseech your grace,</div>
+
+ <div id="scene1.3.42">Let me the knowledge of my fault bear with me:</div>
+ <div id="scene1.3.43">If with myself I hold intelligence</div>
+
+ <div id="scene1.3.44">Or have acquaintance with mine own desires,</div>
+
+ <div id="scene1.3.45">If that I do not dream or be not frantic,--</div>
+
+ <div id="scene1.3.46">As I do trust I am not--then, dear uncle,</div>
+
+ <div id="scene1.3.47">Never so much as in a thought unborn</div>
+
+ <div id="scene1.3.48">Did I offend your highness.</div>
+
+ </div>
+
+ <div id="speech24" class="character">DUKE FREDERICK</div>
+
+ <div class="dialog">
+ <div id="scene1.3.49">Thus do all traitors:</div>
+
+ <div id="scene1.3.50">If their purgation did consist in words,</div>
+
+ <div id="scene1.3.51">They are as innocent as grace itself:</div>
+
+ <div id="scene1.3.52">Let it suffice thee that I trust thee not.</div>
+
+ </div>
+
+ <div id="speech25" class="character">ROSALIND</div>
+
+ <div class="dialog">
+
+ <div id="scene1.3.53">Yet your mistrust cannot make me a traitor:</div>
+
+ <div id="scene1.3.54">Tell me whereon the likelihood depends.</div>
+
+ </div>
+
+ <div id="speech26" class="character">DUKE FREDERICK</div>
+ <div class="dialog">
+
+ <div id="scene1.3.55">Thou art thy father's daughter; there's enough.</div>
+
+ </div>
+
+ <div id="speech27" class="character">ROSALIND</div>
+
+ <div class="dialog">
+ <div id="scene1.3.56">So was I when your highness took his dukedom;</div>
+ <div id="scene1.3.57">So was I when your highness banish'd him:</div>
+
+ <div id="scene1.3.58">Treason is not inherited, my lord;</div>
+
+ <div id="scene1.3.59">Or, if we did derive it from our friends,</div>
+
+ <div id="scene1.3.60">What's that to me? my father was no traitor:</div>
+
+ <div id="scene1.3.61">Then, good my liege, mistake me not so much</div>
+ <div id="scene1.3.62">To think my poverty is treacherous.</div>
+
+ </div>
+
+ <div id="speech28" class="character">CELIA</div>
+ <div class="dialog">
+
+ <div id="scene1.3.63">Dear sovereign, hear me speak.</div>
+
+ </div>
+
+ <div id="speech29" class="character">DUKE FREDERICK</div>
+
+ <div class="dialog">
+ <div id="scene1.3.64">Ay, Celia; we stay'd her for your sake,</div>
+ <div id="scene1.3.65">Else had she with her father ranged along.</div>
+
+ </div>
+
+ <div id="speech30" class="character">CELIA</div>
+
+ <div class="dialog">
+
+ <div id="scene1.3.66">I did not then entreat to have her stay;</div>
+ <div id="scene1.3.67">It was your pleasure and your own remorse:</div>
+ <div id="scene1.3.68">I was too young that time to value her;</div>
+
+ <div id="scene1.3.69">But now I know her: if she be a traitor,</div>
+
+ <div id="scene1.3.70">Why so am I; we still have slept together,</div>
+
+ <div id="scene1.3.71">Rose at an instant, learn'd, play'd, eat together,</div>
+ <div id="scene1.3.72">And wheresoever we went, like Juno's swans,</div>
+
+ <div id="scene1.3.73">Still we went coupled and inseparable.</div>
+ </div>
+
+ <div id="speech31" class="character">DUKE FREDERICK</div>
+
+ <div class="dialog">
+ <div id="scene1.3.74">She is too subtle for thee; and her smoothness,</div>
+ <div id="scene1.3.75">Her very silence and her patience</div>
+ <div id="scene1.3.76">Speak to the people, and they pity her.</div>
+ <div id="scene1.3.77">Thou art a fool: she robs thee of thy name;</div>
+
+ <div id="scene1.3.78">And thou wilt show more bright and seem more virtuous</div>
+
+ <div id="scene1.3.79">When she is gone. Then open not thy lips:</div>
+ <div id="scene1.3.80">Firm and irrevocable is my doom</div>
+ <div id="scene1.3.81">Which I have pass'd upon her; she is banish'd.</div>
+ </div>
+
+ <div id="speech32" class="character">CELIA</div>
+
+ <div class="dialog">
+ <div id="scene1.3.82">Pronounce that sentence then on me, my liege:</div>
+ <div id="scene1.3.83">I cannot live out of her company.</div>
+ </div>
+
+ <div id="speech33" class="character">DUKE FREDERICK</div>
+
+ <div class="dialog">
+ <div id="scene1.3.84">You are a fool. You, niece, provide yourself:</div>
+
+ <div id="scene1.3.85">If you outstay the time, upon mine honour,</div>
+ <div id="scene1.3.86">And in the greatness of my word, you die.</div>
+ <div class="direction">Exeunt DUKE FREDERICK and Lords</div>
+ </div>
+
+ <div id="speech34" class="character">CELIA</div>
+ <div class="dialog">
+
+ <div id="scene1.3.87">O my poor Rosalind, whither wilt thou go?</div>
+
+ <div id="scene1.3.88">Wilt thou change fathers? I will give thee mine.</div>
+ <div id="scene1.3.89">I charge thee, be not thou more grieved than I am.</div>
+
+ </div>
+
+ <div id="speech35" class="character">ROSALIND</div>
+
+ <div class="dialog">
+
+ <div id="scene1.3.90">I have more cause.</div>
+ </div>
+
+ <div id="speech36" class="character">CELIA</div>
+
+ <div class="dialog">
+ <div id="scene1.3.91"> Thou hast not, cousin;</div>
+
+ <div id="scene1.3.92">Prithee be cheerful: know'st thou not, the duke</div>
+
+ <div id="scene1.3.93">Hath banish'd me, his daughter?</div>
+
+ </div>
+
+ <div id="speech37" class="character">ROSALIND</div>
+ <div class="dialog">
+ <div id="scene1.3.94">That he hath not.</div>
+
+ </div>
+
+ <div id="speech38" class="character">CELIA</div>
+
+ <div class="dialog">
+
+ <div id="scene1.3.95">No, hath not? Rosalind lacks then the love</div>
+
+ <div id="scene1.3.96">Which teacheth thee that thou and I am one:</div>
+ <div id="scene1.3.97">Shall we be sunder'd? shall we part, sweet girl?</div>
+
+ <div id="scene1.3.98">No: let my father seek another heir.</div>
+
+ <div id="scene1.3.99">Therefore devise with me how we may fly,</div>
+
+ <div id="scene1.3.100">Whither to go and what to bear with us;</div>
+ <div id="scene1.3.101">And do not seek to take your change upon you,</div>
+ <div id="scene1.3.102">To bear your griefs yourself and leave me out;</div>
+
+ <div id="scene1.3.103">For, by this heaven, now at our sorrows pale,</div>
+
+ <div id="scene1.3.104">Say what thou canst, I'll go along with thee.</div>
+
+ </div>
+
+ <div id="speech39" class="character">ROSALIND</div>
+ <div class="dialog">
+
+ <div id="scene1.3.105">Why, whither shall we go?</div>
+
+ </div>
+
+ <div id="speech40" class="character">CELIA</div>
+
+ <div class="dialog">
+ <div id="scene1.3.106">To seek my uncle in the forest of Arden.</div>
+ </div>
+
+ <div id="speech41" class="character">ROSALIND</div>
+
+ <div class="dialog">
+
+ <div id="scene1.3.107">Alas, what danger will it be to us,</div>
+
+ <div id="scene1.3.108">Maids as we are, to travel forth so far!</div>
+ <div id="scene1.3.109">Beauty provoketh thieves sooner than gold.</div>
+
+ </div>
+
+ <div id="speech42" class="character">CELIA</div>
+
+ <div class="dialog">
+ <div id="scene1.3.110">I'll put myself in poor and mean attire</div>
+
+ <div id="scene1.3.111">And with a kind of umber smirch my face;</div>
+ <div id="scene1.3.112">The like do you: so shall we pass along</div>
+
+ <div id="scene1.3.113">And never stir assailants.</div>
+
+ </div>
+
+ <div id="speech43" class="character">ROSALIND</div>
+ <div class="dialog">
+
+ <div id="scene1.3.114">Were it not better,</div>
+
+ <div id="scene1.3.115">Because that I am more than common tall,</div>
+
+ <div id="scene1.3.116">That I did suit me all points like a man?</div>
+
+ <div id="scene1.3.117">A gallant curtle-axe upon my thigh,</div>
+
+ <div id="scene1.3.118">A boar-spear in my hand; and--in my heart</div>
+
+ <div id="scene1.3.119">Lie there what hidden woman's fear there will--</div>
+
+ <div id="scene1.3.120">We'll have a swashing and a martial outside,</div>
+
+ <div id="scene1.3.121">As many other mannish cowards have</div>
+
+ <div id="scene1.3.122">That do outface it with their semblances.</div>
+
+ </div>
+
+ <div id="speech44" class="character">CELIA</div>
+
+ <div class="dialog">
+
+ <div id="scene1.3.123">What shall I call thee when thou art a man?</div>
+ </div>
+
+ <div id="speech45" class="character">ROSALIND</div>
+
+ <div class="dialog">
+ <div id="scene1.3.124">I'll have no worse a name than Jove's own page;</div>
+
+ <div id="scene1.3.125">And therefore look you call me Ganymede.</div>
+
+ <div id="scene1.3.126">But what will you be call'd?</div>
+
+ </div>
+
+ <div id="speech46" class="character">CELIA</div>
+
+ <div class="dialog">
+
+ <div id="scene1.3.127">Something that hath a reference to my state</div>
+ <div id="scene1.3.128">No longer Celia, but Aliena.</div>
+
+ </div>
+
+ <div id="speech47" class="character">ROSALIND</div>
+ <div class="dialog">
+
+ <div id="scene1.3.129">But, cousin, what if we assay'd to steal</div>
+
+ <div id="scene1.3.130">The clownish fool out of your father's court?</div>
+
+ <div id="scene1.3.131">Would he not be a comfort to our travel?</div>
+
+ </div>
+
+ <div id="speech48" class="character">CELIA</div>
+
+ <div class="dialog">
+
+ <div id="scene1.3.132">He'll go along o'er the wide world with me;</div>
+
+ <div id="scene1.3.133">Leave me alone to woo him. Let's away,</div>
+ <div id="scene1.3.134">And get our jewels and our wealth together,</div>
+
+ <div id="scene1.3.135">Devise the fittest time and safest way</div>
+
+ <div id="scene1.3.136">To hide us from pursuit that will be made</div>
+
+ <div id="scene1.3.137">After my flight. Now go we in content</div>
+
+ <div id="scene1.3.138">To liberty and not to banishment.</div>
+ <div class="direction">Exeunt</div>
+
+ </div>
+
+ </div>
+ </div>
+</div>
+</body>
+
+</html>
\ No newline at end of file
diff --git a/src/lxml/tests/test-document.xslt b/src/lxml/tests/test-document.xslt
new file mode 100644
index 0000000..5d80c4b
--- /dev/null
+++ b/src/lxml/tests/test-document.xslt
@@ -0,0 +1,10 @@
+<?xml version="1.0"?>
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+
+ <xsl:template match="/">
+ <test>
+ <xsl:copy-of select="document('')"/>
+ </test>
+ </xsl:template>
+</xsl:stylesheet>
diff --git a/src/lxml/tests/test-string.xml b/src/lxml/tests/test-string.xml
new file mode 100644
index 0000000..25ae180
--- /dev/null
+++ b/src/lxml/tests/test-string.xml
@@ -0,0 +1,2 @@
+<?xml version="1.0"?>
+<a>Søk på nettet</a>
diff --git a/src/lxml/tests/test.dtd b/src/lxml/tests/test.dtd
new file mode 100644
index 0000000..bb62030
--- /dev/null
+++ b/src/lxml/tests/test.dtd
@@ -0,0 +1,11 @@
+<!ELEMENT a (b)>
+<!ATTLIST a
+ default (valueA|valueB) "valueA"
+>
+
+<!ELEMENT b EMPTY>
+<!ATTLIST b
+ default (valueA|valueB) "valueB"
+>
+
+<!ENTITY c "&#42;">
diff --git a/src/lxml/tests/test.rnc b/src/lxml/tests/test.rnc
new file mode 100644
index 0000000..7e3db57
--- /dev/null
+++ b/src/lxml/tests/test.rnc
@@ -0,0 +1,8 @@
+element a {
+ element b {
+ "B"
+ },
+ element c {
+ "C"
+ }
+}
diff --git a/src/lxml/tests/test.sch b/src/lxml/tests/test.sch
new file mode 100644
index 0000000..9fc4815
--- /dev/null
+++ b/src/lxml/tests/test.sch
@@ -0,0 +1,8 @@
+<sch:schema xmlns:sch="http://purl.oclc.org/dsdl/schematron">
+ <sch:pattern id="number_of_entries">
+ <sch:title>mandatory number_of_entries tests</sch:title>
+ <sch:rule context="number_of_entries">
+ <sch:assert test="text()=count(../entries/entry)">[ERROR] number_of_entries (<sch:value-of select="."/>) must equal the number of entries/entry elements (<sch:value-of select="count(../entries/entry)"/>)</sch:assert>
+ </sch:rule>
+ </sch:pattern>
+</sch:schema>
diff --git a/src/lxml/tests/test.xml b/src/lxml/tests/test.xml
new file mode 100644
index 0000000..2eccc29
--- /dev/null
+++ b/src/lxml/tests/test.xml
@@ -0,0 +1,2 @@
+<!DOCTYPE a SYSTEM "test.dtd">
+<a><b></b></a>
diff --git a/src/lxml/tests/test.xsd b/src/lxml/tests/test.xsd
new file mode 100644
index 0000000..9a085b3
--- /dev/null
+++ b/src/lxml/tests/test.xsd
@@ -0,0 +1,8 @@
+<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">
+ <xsd:element name="a" type="AType"/>
+ <xsd:complexType name="AType">
+ <xsd:sequence>
+ <xsd:element name="b" type="xsd:string" />
+ </xsd:sequence>
+ </xsd:complexType>
+</xsd:schema>
diff --git a/src/lxml/tests/test1.rng b/src/lxml/tests/test1.rng
new file mode 100644
index 0000000..bef4d00
--- /dev/null
+++ b/src/lxml/tests/test1.rng
@@ -0,0 +1,6 @@
+<grammar ns="http://www.w3.org/1999/xhtml"
+ xmlns="http://relaxng.org/ns/structure/1.0">
+
+<include href="test2.rng"/>
+
+</grammar>
diff --git a/src/lxml/tests/test1.xslt b/src/lxml/tests/test1.xslt
new file mode 100644
index 0000000..f25763e
--- /dev/null
+++ b/src/lxml/tests/test1.xslt
@@ -0,0 +1,9 @@
+<xsl:stylesheet
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
+<xsl:include href="test2.xslt" />
+
+<xsl:template match="/">
+<p>Foo</p>
+</xsl:template>
+
+</xsl:stylesheet>
diff --git a/src/lxml/tests/test2.rng b/src/lxml/tests/test2.rng
new file mode 100644
index 0000000..f6d0389
--- /dev/null
+++ b/src/lxml/tests/test2.rng
@@ -0,0 +1,13 @@
+<grammar xmlns="http://relaxng.org/ns/structure/1.0">
+
+<start>
+ <ref name="a" />
+</start>
+
+<define name="a">
+ <element name="a">
+ <text/>
+ </element>
+</define>
+
+</grammar>
diff --git a/src/lxml/tests/test2.xslt b/src/lxml/tests/test2.xslt
new file mode 100644
index 0000000..5c01e21
--- /dev/null
+++ b/src/lxml/tests/test2.xslt
@@ -0,0 +1,8 @@
+<xsl:stylesheet
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
+
+<xsl:template match="hello">
+<p>hello</p>
+</xsl:template>
+
+</xsl:stylesheet>
diff --git a/src/lxml/tests/test_broken.xml b/src/lxml/tests/test_broken.xml
new file mode 100644
index 0000000..3dd455a
--- /dev/null
+++ b/src/lxml/tests/test_broken.xml
@@ -0,0 +1 @@
+<a><b></c></b></a>
\ No newline at end of file
diff --git a/src/lxml/tests/test_builder.py b/src/lxml/tests/test_builder.py
new file mode 100644
index 0000000..6aa2d12
--- /dev/null
+++ b/src/lxml/tests/test_builder.py
@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests that ElementMaker works properly.
+"""
+
+from __future__ import absolute_import
+
+import unittest
+
+from lxml import etree
+from lxml.builder import E
+
+from .common_imports import HelperTestCase, _bytes
+
+
+class BuilderTestCase(HelperTestCase):
+ etree = etree
+
+ def test_build_from_xpath_result(self):
+ class StringSubclass(str): pass
+ wrapped = E.b(StringSubclass('Hello'))
+ self.assertEqual(_bytes('<b>Hello</b>'), etree.tostring(wrapped))
+
+ def test_unknown_type_raises(self):
+ class UnknownType(object):
+ pass
+ self.assertRaises(TypeError, E.b, UnknownType())
+
+ def test_cdata(self):
+ wrapped = E.b(etree.CDATA('Hello'))
+ self.assertEqual(_bytes('<b><![CDATA[Hello]]></b>'), etree.tostring(wrapped))
+
+ def test_cdata_solo(self):
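+ # CDATA may only be used as the sole content of an element; combining it
+ # with other text is expected to raise a ValueError.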
+ self.assertRaises(ValueError, E.b, 'Hello', etree.CDATA('World'))
+
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([unittest.makeSuite(BuilderTestCase)])
+ return suite
+
+if __name__ == '__main__':
+ print('to test use test.py %s' % __file__)
diff --git a/src/lxml/tests/test_classlookup.py b/src/lxml/tests/test_classlookup.py
new file mode 100644
index 0000000..7c871d5
--- /dev/null
+++ b/src/lxml/tests/test_classlookup.py
@@ -0,0 +1,402 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests for different Element class lookup mechanisms.
+"""
+
+
+from __future__ import absolute_import
+
+import unittest, gc
+
+from .common_imports import etree, HelperTestCase, _bytes, BytesIO
+
+xml_str = _bytes('''\
+<root xmlns="myNS" xmlns:other="otherNS">
+ <c1 a1="A1" a2="A2" other:a3="A3">
+ <c2 a1="C2">0</c2>
+ <c2>1</c2>
+ <other:c2>2</other:c2>
+ </c1>
+</root>''')
+
+
+class ProxyTestCase(HelperTestCase):
+ """Basic tests for element proxy behaviour.
+ """
+ etree = etree
+
+ def test_proxy_reuse(self):
+ root = etree.XML('<a><b><c/></b></a>')
+ b = root.find('b')
+ self.assertTrue(b is root[0])
+
+ def test_proxy_reuse_after_gc(self):
+ root = etree.XML('<a><b><c/></b></a>')
+ b = root.find('b')
+ self.assertTrue(self.etree.iselement(b))
+ gc.collect()
+ self.assertTrue(b is root[0])
+
+ def test_proxy_reuse_after_del_root(self):
+ root = etree.XML('<a><b><c/></b></a>')
+ b = root.find('b')
+ self.assertTrue(self.etree.iselement(b))
+ c = b.find('c')
+ self.assertTrue(self.etree.iselement(c))
+ del root
+ gc.collect()
+ self.assertTrue(b[0] is c)
+
+ def test_proxy_hashing(self):
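+ # Proxies collected before and after garbage collection must still compare
+ # equal, and equal proxies for the same underlying node must be identical.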
+ root = etree.XML('<a><b><c/></b></a>')
+ old_elements = set(root.iter())
+ elements = root.iter()
+ del root
+ gc.collect()
+
+ missing = len(old_elements)
+ self.assertEqual(3, missing)
+ for new in elements:
+ for old in old_elements:
+ if old == new:
+ self.assertTrue(old is new)
+ missing -= 1
+ break
+ else:
+ self.assertTrue(False, "element '%s' is missing" % new.tag)
+ self.assertEqual(0, missing)
+
+ def test_element_base(self):
+ el = self.etree.ElementBase()
+ self.assertEqual('ElementBase', el.tag)
+ root = self.etree.ElementBase()
+ root.append(el)
+ self.assertEqual('ElementBase', root[0].tag)
+
+ def test_element_base_children(self):
+ el = self.etree.ElementBase(etree.ElementBase())
+ self.assertEqual('ElementBase', el.tag)
+ self.assertEqual(1, len(el))
+ self.assertEqual('ElementBase', el[0].tag)
+
+ root = self.etree.ElementBase()
+ root.append(el)
+ self.assertEqual('ElementBase', root[0].tag)
+ self.assertEqual('ElementBase', root[0][0].tag)
+
+ def test_comment_base(self):
+ el = self.etree.CommentBase('some text')
+ self.assertEqual(self.etree.Comment, el.tag)
+ self.assertEqual('some text', el.text)
+ root = self.etree.Element('root')
+ root.append(el)
+ self.assertEqual('some text', root[0].text)
+
+ def test_pi_base(self):
+ el = self.etree.PIBase('the target', 'some text')
+ self.assertEqual(self.etree.ProcessingInstruction, el.tag)
+ self.assertEqual('some text', el.text)
+ root = self.etree.Element('root')
+ root.append(el)
+ self.assertEqual('some text', root[0].text)
+
+
+class ClassLookupTestCase(HelperTestCase):
+ """Test cases for different Element class lookup mechanisms.
+ """
+ etree = etree
+
+ def tearDown(self):
+ etree.set_element_class_lookup()
+ super(ClassLookupTestCase, self).tearDown()
+
+ def test_namespace_lookup(self):
+ class TestElement(etree.ElementBase):
+ FIND_ME = "namespace class"
+
+ lookup = etree.ElementNamespaceClassLookup()
+ etree.set_element_class_lookup(lookup)
+
+ ns = lookup.get_namespace("myNS")
+ ns[None] = TestElement
+
+ root = etree.XML(xml_str)
+ self.assertEqual(root.FIND_ME,
+ TestElement.FIND_ME)
+ self.assertEqual(root[0].FIND_ME,
+ TestElement.FIND_ME)
+ self.assertFalse(hasattr(root[0][-1], 'FIND_ME'))
+
+ def test_default_class_lookup(self):
+ class TestElement(etree.ElementBase):
+ FIND_ME = "default element"
+ class TestComment(etree.CommentBase):
+ FIND_ME = "default comment"
+ class TestPI(etree.PIBase):
+ FIND_ME = "default pi"
+
+ parser = etree.XMLParser()
+
+ lookup = etree.ElementDefaultClassLookup(
+ element=TestElement, comment=TestComment, pi=TestPI)
+ parser.set_element_class_lookup(lookup)
+
+ root = etree.XML(_bytes("""<?xml version='1.0'?>
+ <root>
+ <?myPI?>
+ <!-- hi -->
+ </root>
+ """), parser)
+
+ self.assertEqual("default element", root.FIND_ME)
+ self.assertEqual("default pi", root[0].FIND_ME)
+ self.assertEqual("default comment", root[1].FIND_ME)
+
+ def test_default_class_lookup_pull_parser(self):
+ class TestElement(etree.ElementBase):
+ FIND_ME = "default element"
+ class TestComment(etree.CommentBase):
+ FIND_ME = "default comment"
+ class TestPI(etree.PIBase):
+ FIND_ME = "default pi"
+
+ parser = etree.XMLPullParser(events=('start', 'end', 'comment', 'pi'))
+ lookup = etree.ElementDefaultClassLookup(
+ element=TestElement, comment=TestComment, pi=TestPI)
+ parser.set_element_class_lookup(lookup)
+
+ events_seen = []
+
+ def add_events(events):
+ for ev, el in events:
+ events_seen.append((ev, el.FIND_ME))
+
+ parser.feed("""<?xml version='1.0'?>
+ <root>
+ <?myPI?>
+ """)
+ add_events(parser.read_events())
+
+ parser.feed("<!-- hi -->")
+ add_events(parser.read_events())
+
+ parser.feed("</root>")
+ root = parser.close()
+ add_events(parser.read_events())
+
+ self.assertEqual([
+ ('start', "default element"),
+ ('pi', "default pi"),
+ ('comment', "default comment"),
+ ('end', "default element"),
+ ], events_seen)
+
+ self.assertEqual("default element", root.FIND_ME)
+ self.assertEqual("default pi", root[0].FIND_ME)
+ self.assertEqual("default comment", root[1].FIND_ME)
+
+ def test_evil_class_lookup(self):
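+ # Returning None from the lookup falls back to the default element class;
+ # returning an object that is not an element class must raise a TypeError.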
+ class MyLookup(etree.CustomElementClassLookup):
+ def lookup(self, t, d, ns, name):
+ if name == 'none':
+ return None
+ elif name == 'obj':
+ return object()
+ else:
+ return etree.ElementBase
+
+ parser = etree.XMLParser()
+ parser.set_element_class_lookup(MyLookup())
+
+ root = etree.XML(_bytes('<none/>'), parser)
+ self.assertEqual('none', root.tag)
+
+ self.assertRaises(
+ TypeError,
+ etree.XML, _bytes("<obj />"), parser)
+
+ root = etree.XML(_bytes('<root/>'), parser)
+ self.assertEqual('root', root.tag)
+
+ def test_class_lookup_type_mismatch(self):
+ class MyLookup(etree.CustomElementClassLookup):
+ def lookup(self, t, d, ns, name):
+ if t == 'element':
+ if name == 'root':
+ return etree.ElementBase
+ return etree.CommentBase
+ elif t == 'comment':
+ return etree.PIBase
+ elif t == 'PI':
+ return etree.EntityBase
+ elif t == 'entity':
+ return etree.ElementBase
+ else:
+ raise ValueError('got type %s' % t)
+
+ parser = etree.XMLParser(resolve_entities=False)
+ parser.set_element_class_lookup(MyLookup())
+
+ root = etree.XML(_bytes('<root></root>'), parser)
+ self.assertEqual('root', root.tag)
+ self.assertEqual(etree.ElementBase, type(root))
+
+ root = etree.XML(_bytes("<root><test/></root>"), parser)
+ self.assertRaises(TypeError, root.__getitem__, 0)
+
+ root = etree.XML(_bytes("<root><!-- test --></root>"), parser)
+ self.assertRaises(TypeError, root.__getitem__, 0)
+
+ root = etree.XML(_bytes("<root><?test?></root>"), parser)
+ self.assertRaises(TypeError, root.__getitem__, 0)
+
+ root = etree.XML(
+ _bytes('<!DOCTYPE root [<!ENTITY myent "ent">]>'
+ '<root>&myent;</root>'),
+ parser)
+ self.assertRaises(TypeError, root.__getitem__, 0)
+
+ root = etree.XML(_bytes('<root><root/></root>'), parser)
+ self.assertEqual('root', root[0].tag)
+
+ def test_attribute_based_lookup(self):
+ class TestElement(etree.ElementBase):
+ FIND_ME = "attribute_based"
+
+ class_dict = {"A1" : TestElement}
+
+ lookup = etree.AttributeBasedElementClassLookup(
+ "a1", class_dict)
+ etree.set_element_class_lookup(lookup)
+
+ root = etree.XML(xml_str)
+ self.assertFalse(hasattr(root, 'FIND_ME'))
+ self.assertEqual(root[0].FIND_ME,
+ TestElement.FIND_ME)
+ self.assertFalse(hasattr(root[0][0], 'FIND_ME'))
+
+ def test_custom_lookup(self):
+ class TestElement(etree.ElementBase):
+ FIND_ME = "custom"
+
+ class MyLookup(etree.CustomElementClassLookup):
+ def lookup(self, t, d, ns, name):
+ if name == 'c1':
+ return TestElement
+
+ etree.set_element_class_lookup( MyLookup() )
+
+ root = etree.XML(xml_str)
+ self.assertFalse(hasattr(root, 'FIND_ME'))
+ self.assertEqual(root[0].FIND_ME,
+ TestElement.FIND_ME)
+ self.assertFalse(hasattr(root[0][1], 'FIND_ME'))
+
+ def test_custom_lookup_ns_fallback(self):
+ class TestElement1(etree.ElementBase):
+ FIND_ME = "custom"
+
+ class TestElement2(etree.ElementBase):
+ FIND_ME = "nsclasses"
+
+ class MyLookup(etree.CustomElementClassLookup):
+ def lookup(self, t, d, ns, name):
+ if name == 'c1':
+ return TestElement1
+
+ lookup = etree.ElementNamespaceClassLookup( MyLookup() )
+ etree.set_element_class_lookup(lookup)
+
+ ns = lookup.get_namespace("otherNS")
+ ns[None] = TestElement2
+
+ root = etree.XML(xml_str)
+ self.assertFalse(hasattr(root, 'FIND_ME'))
+ self.assertEqual(root[0].FIND_ME,
+ TestElement1.FIND_ME)
+ self.assertFalse(hasattr(root[0][1], 'FIND_ME'))
+ self.assertEqual(root[0][-1].FIND_ME,
+ TestElement2.FIND_ME)
+
+ def test_parser_based_lookup(self):
+ class TestElement(etree.ElementBase):
+ FIND_ME = "parser_based"
+
+ lookup = etree.ParserBasedElementClassLookup()
+ etree.set_element_class_lookup(lookup)
+
+ class MyLookup(etree.CustomElementClassLookup):
+ def lookup(self, t, d, ns, name):
+ return TestElement
+
+ parser = etree.XMLParser()
+ parser.set_element_class_lookup( MyLookup() )
+
+ root = etree.parse(BytesIO(xml_str), parser).getroot()
+ self.assertEqual(root.FIND_ME,
+ TestElement.FIND_ME)
+ self.assertEqual(root[0].FIND_ME,
+ TestElement.FIND_ME)
+
+ root = etree.parse(BytesIO(xml_str)).getroot()
+ self.assertFalse(hasattr(root, 'FIND_ME'))
+ self.assertFalse(hasattr(root[0], 'FIND_ME'))
+
+ def test_class_lookup_reentry(self):
+ XML = self.etree.XML
+
+ class TestElement(etree.ElementBase):
+ FIND_ME = "here"
+
+ root = None
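+ # The lookup below re-enters the element API (root.find) while a proxy is
+ # being created; proxy creation must survive this re-entrancy.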
+ class MyLookup(etree.CustomElementClassLookup):
+ el = None
+ def lookup(self, t, d, ns, name):
+ if root is not None: # not in the parser
+ if self.el is None and name == "a":
+ self.el = []
+ self.el.append(root.find(name))
+ return TestElement
+
+ parser = self.etree.XMLParser()
+ parser.set_element_class_lookup(MyLookup())
+
+ root = XML(_bytes('<root><a>A</a><b xmlns="test">B</b></root>'),
+ parser)
+
+ a = root[0]
+ self.assertEqual(a.tag, "a")
+ self.assertEqual(root[0].tag, "a")
+ del a
+ self.assertEqual(root[0].tag, "a")
+
+ def test_lookup_without_fallback(self):
+ class Lookup(etree.CustomElementClassLookup):
+ def __init__(self):
+ # no super call here, so no fallback is set
+ pass
+
+ def lookup(self, node_type, document, namespace, name):
+ return Foo
+
+ class Foo(etree.ElementBase):
+ def custom(self):
+ return "test"
+
+ parser = self.etree.XMLParser()
+ parser.set_element_class_lookup( Lookup() )
+
+ root = etree.XML('<foo/>', parser)
+
+ self.assertEqual("test", root.custom())
+
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([unittest.makeSuite(ProxyTestCase)])
+ suite.addTests([unittest.makeSuite(ClassLookupTestCase)])
+ return suite
+
+if __name__ == '__main__':
+ print('to test use test.py %s' % __file__)
diff --git a/src/lxml/tests/test_css.py b/src/lxml/tests/test_css.py
new file mode 100644
index 0000000..e2afa65
--- /dev/null
+++ b/src/lxml/tests/test_css.py
@@ -0,0 +1,68 @@
+
+from __future__ import absolute_import
+
+import unittest
+
+import lxml.html
+
+from .common_imports import doctest, HelperTestCase, skipif
+
+try:
+ import cssselect
+except ImportError:
+ cssselect = None
+
+
+HTML = '''
+<div>
+ <a href="foo">link</a>
+ <a>anchor</a>
+</div>
+'''
+
+
+class CSSTestCase(HelperTestCase):
+
+ pytestmark = skipif('cssselect is None')
+
+ def test_cssselect(self):
+ div, = lxml.html.fromstring(HTML).xpath('//div')
+
+ def count(selector, expected_count, **kwargs):
+ result = div.cssselect(selector, **kwargs)
+ self.assertEqual(len(result), expected_count)
+
+ count('div', 1)
+ count('a', 2)
+ count('em', 0)
+ # Element names are case-insensitive in HTML
+ count('DIV', 1)
+ # ... but not in XHTML or XML
+ count('DIV', 0, translator='xhtml')
+ count('DIV', 0, translator='xml')
+
+ # :contains() is case-insensitive in lxml
+ count(':contains("link")', 2) # div, a
+ count(':contains("LInk")', 2)
+ # Whatever the document language
+ count(':contains("LInk")', 2, translator='xhtml')
+ count(':contains("LInk")', 2, translator='xml')
+ # ... but not in upstream cssselect
+ import cssselect
+ count(':contains("link")', 2, translator=cssselect.HTMLTranslator())
+ count(':contains("LInk")', 0, translator=cssselect.HTMLTranslator())
+
+
+def test_suite():
+ suite = unittest.TestSuite()
+ try:
+ import cssselect
+ except ImportError:
+ # no 'cssselect' installed
+ print("Skipping tests in lxml.cssselect - external cssselect package is not installed")
+ return suite
+
+ import lxml.cssselect
+ suite.addTests(doctest.DocTestSuite(lxml.cssselect))
+ suite.addTests([unittest.makeSuite(CSSTestCase)])
+ return suite
diff --git a/src/lxml/tests/test_doctestcompare.py b/src/lxml/tests/test_doctestcompare.py
new file mode 100644
index 0000000..3663281
--- /dev/null
+++ b/src/lxml/tests/test_doctestcompare.py
@@ -0,0 +1,133 @@
+
+from __future__ import absolute_import
+
+import unittest
+
+from lxml import etree
+from .common_imports import HelperTestCase
+from lxml.doctestcompare import LXMLOutputChecker, PARSE_HTML, PARSE_XML
+
+
+class DummyInput:
+ def __init__(self, **kw):
+ for name, value in kw.items():
+ setattr(self, name, value)
+
+
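+# indent() pretty-prints the tree in place so that assert_nodiff() can compare
+# against the indented output produced by LXMLOutputChecker.collect_diff().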
+def indent(elem, level=0):
+ i = "\n" + level*" "
+ if len(elem):
+ if not elem.text or not elem.text.strip():
+ elem.text = i + " "
+ if not elem.tail or not elem.tail.strip():
+ elem.tail = i
+ for elem in elem:
+ indent(elem, level+1)
+ if not elem.tail or not elem.tail.strip():
+ elem.tail = i
+ else:
+ if level and (not elem.tail or not elem.tail.strip()):
+ elem.tail = i
+
+
+class DoctestCompareTest(HelperTestCase):
+ _checker = LXMLOutputChecker()
+
+ def compare(self, want, got, html=False):
+ if html:
+ options = PARSE_HTML
+ else:
+ options = PARSE_XML
+
+ parse = self._checker.get_parser(want, got, options)
+ want_doc = parse(want)
+ got_doc = parse(got)
+ return self._checker.collect_diff(
+ want_doc, got_doc, html, indent=0).lstrip()
+
+ def assert_diff(self, want, got, diff, html=False):
+ self.assertEqual(self.compare(want, got, html), diff)
+
+ def assert_nodiff(self, want, got, html=False):
+ root = etree.fromstring(want)
+ root.tail = '\n'
+ indent(root)
+ diff = etree.tostring(
+ root, encoding='unicode', method=html and 'html' or 'xml')
+ self.assert_diff(want, got, diff, html=html)
+
+ def test_equal_input(self):
+ self.assert_nodiff(
+ '<p title="expected">Expected</p>',
+ '<p title="expected">Expected</p>')
+
+ def test_differing_tags(self):
+ self.assert_diff(
+ '<p title="expected">Expected</p>',
+ '<b title="expected">Expected</b>',
+ '<p (got: b) title="expected">Expected</p (got: b)>\n')
+
+ def test_tags_upper_lower_case(self):
+ self.assert_diff(
+ '<p title="expected">Expected</p>',
+ '<P title="expected">Expected</P>',
+ '<p (got: P) title="expected">Expected</p (got: P)>\n')
+
+ def test_tags_upper_lower_case_html(self):
+ self.assert_nodiff(
+ '<html><body><p title="expected">Expected</p></body></html>',
+ '<HTML><BODY><P title="expected">Expected</P></BODY></HTML>',
+ html=True)
+
+ def test_differing_attributes(self):
+ self.assert_diff(
+ '<p title="expected">Expected</p>',
+ '<p title="actual">Actual</p>',
+ '<p title="expected (got: actual)">Expected (got: Actual)</p>\n')
+
+ def test_extra_children(self):
+ # https://bugs.launchpad.net/lxml/+bug/1238503
+ self.assert_diff(
+ '<p><span>One</span></p>',
+ '<p><span>One</span><b>Two</b><em>Three</em></p>',
+ '<p>\n'
+ ' <span>One</span>\n'
+ ' +<b>Two</b>\n'
+ ' +<em>Three</em>\n'
+ '</p>\n')
+
+ def test_missing_children(self):
+ self.assert_diff(
+ '<p><span>One</span><b>Two</b><em>Three</em></p>',
+ '<p><span>One</span></p>',
+ '<p>\n'
+ ' <span>One</span>\n'
+ ' -<b>Two</b>\n'
+ ' -<em>Three</em>\n'
+ '</p>\n')
+
+ def test_extra_attributes(self):
+ self.assert_diff(
+ '<p><span class="foo">Text</span></p>',
+ '<p><span class="foo" id="bar">Text</span></p>',
+ '<p>\n'
+ ' <span class="foo" +id="bar">Text</span>\n'
+ '</p>\n')
+
+ def test_missing_attributes(self):
+ self.assert_diff(
+ '<p><span class="foo" id="bar">Text</span></p>',
+ '<p><span class="foo">Text</span></p>',
+ '<p>\n'
+ ' <span class="foo" -id="bar">Text</span>\n'
+ '</p>\n')
+
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([unittest.makeSuite(DoctestCompareTest)])
+ return suite
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/lxml/tests/test_dtd.py b/src/lxml/tests/test_dtd.py
new file mode 100644
index 0000000..0f06b73
--- /dev/null
+++ b/src/lxml/tests/test_dtd.py
@@ -0,0 +1,415 @@
+# -*- coding: utf-8 -*-
+
+"""
+Test cases related to DTD parsing and validation.
+"""
+
+import unittest, sys
+
+from .common_imports import (
+ etree, html, BytesIO, _bytes, _str,
+ HelperTestCase, make_doctest, skipIf,
+ fileInTestDir, fileUrlInTestDir
+)
+
+
+class ETreeDtdTestCase(HelperTestCase):
+ def test_dtd(self):
+ pass
+
+ def test_dtd_file(self):
+ parse = etree.parse
+ tree = parse(fileInTestDir("test.xml"))
+ root = tree.getroot()
+
+ dtd = etree.DTD(fileInTestDir("test.dtd"))
+ self.assertTrue(dtd.validate(root))
+
+ def test_dtd_stringio(self):
+ root = etree.XML(_bytes("<b/>"))
+ dtd = etree.DTD(BytesIO("<!ELEMENT b EMPTY>"))
+ self.assertTrue(dtd.validate(root))
+
+ def test_dtd_parse_invalid(self):
+ fromstring = etree.fromstring
+ parser = etree.XMLParser(dtd_validation=True)
+ xml = _bytes('<!DOCTYPE b SYSTEM "%s"><b><a/></b>' %
+ fileInTestDir("test.dtd"))
+ self.assertRaises(etree.XMLSyntaxError,
+ fromstring, xml, parser=parser)
+
+ def test_dtd_parse_file_not_found(self):
+ fromstring = etree.fromstring
+ dtd_filename = fileUrlInTestDir("__nosuch.dtd")
+ parser = etree.XMLParser(dtd_validation=True)
+ xml = _bytes('<!DOCTYPE b SYSTEM "%s"><b><a/></b>' % dtd_filename)
+ self.assertRaises(etree.XMLSyntaxError,
+ fromstring, xml, parser=parser)
+ errors = None
+ try:
+ fromstring(xml, parser=parser)
+ except etree.XMLSyntaxError:
+ e = sys.exc_info()[1]
+ self.assertTrue(e.error_log)
+ self.assertTrue(parser.error_log)
+ errors = [entry.message for entry in e.error_log
+ if dtd_filename in entry.message]
+ self.assertTrue(errors)
+
+ def test_dtd_parse_valid(self):
+ parser = etree.XMLParser(dtd_validation=True)
+ xml = ('<!DOCTYPE a SYSTEM "%s"><a><b/></a>' %
+ fileUrlInTestDir("test.dtd"))
+ root = etree.fromstring(xml, parser=parser)
+
+ def test_dtd_parse_valid_file_url(self):
+ parser = etree.XMLParser(dtd_validation=True)
+ xml = ('<!DOCTYPE a SYSTEM "%s"><a><b/></a>' %
+ fileUrlInTestDir("test.dtd"))
+ root = etree.fromstring(xml, parser=parser)
+
+ def test_dtd_parse_valid_relative(self):
+ parser = etree.XMLParser(dtd_validation=True)
+ xml = '<!DOCTYPE a SYSTEM "test.dtd"><a><b/></a>'
+ root = etree.fromstring(
+ xml, parser=parser, base_url=fileUrlInTestDir("test.xml"))
+
+ def test_dtd_parse_valid_relative_file_url(self):
+ parser = etree.XMLParser(dtd_validation=True)
+ xml = '<!DOCTYPE a SYSTEM "test.dtd"><a><b/></a>'
+ root = etree.fromstring(
+ xml, parser=parser, base_url=fileUrlInTestDir("test.xml"))
+
+ def test_dtd_invalid(self):
+ root = etree.XML("<b><a/></b>")
+ dtd = etree.DTD(BytesIO("<!ELEMENT b EMPTY>"))
+ self.assertRaises(etree.DocumentInvalid, dtd.assertValid, root)
+
+ def test_dtd_assertValid(self):
+ root = etree.XML("<b><a/></b>")
+ dtd = etree.DTD(BytesIO("<!ELEMENT b (a)><!ELEMENT a EMPTY>"))
+ dtd.assertValid(root)
+
+ def test_dtd_internal(self):
+ root = etree.XML(_bytes('''
+ <!DOCTYPE b SYSTEM "none" [
+ <!ELEMENT b (a)>
+ <!ELEMENT a EMPTY>
+ ]>
+ <b><a/></b>
+ '''))
+ dtd = etree.ElementTree(root).docinfo.internalDTD
+ self.assertTrue(dtd)
+ dtd.assertValid(root)
+
+ def test_dtd_internal_invalid(self):
+ root = etree.XML(_bytes('''
+ <!DOCTYPE b SYSTEM "none" [
+ <!ELEMENT b (a)>
+ <!ELEMENT a (c)>
+ <!ELEMENT c EMPTY>
+ ]>
+ <b><a/></b>
+ '''))
+ dtd = etree.ElementTree(root).docinfo.internalDTD
+ self.assertTrue(dtd)
+ self.assertFalse(dtd.validate(root))
+
+ def test_dtd_invalid_duplicate_id(self):
+ root = etree.XML(_bytes('''
+ <a><b id="id1"/><b id="id2"/><b id="id1"/></a>
+ '''))
+ dtd = etree.DTD(BytesIO(_bytes("""
+ <!ELEMENT a (b*)>
+ <!ATTLIST b
+ id ID #REQUIRED
+ >
+ <!ELEMENT b EMPTY>
+ """)))
+ self.assertFalse(dtd.validate(root))
+ self.assertTrue(dtd.error_log)
+ self.assertTrue([error for error in dtd.error_log
+ if 'id1' in error.message])
+
+ def test_dtd_api_internal(self):
+ root = etree.XML(_bytes('''
+ <!DOCTYPE b SYSTEM "none" [
+ <!ATTLIST a
+ attr1 (x | y | z) "z"
+ attr2 CDATA #FIXED "X"
+ >
+ <!ELEMENT b (a)>
+ <!ELEMENT a EMPTY>
+ ]>
+ <b><a/></b>
+ '''))
+ dtd = etree.ElementTree(root).docinfo.internalDTD
+ self.assertTrue(dtd)
+ dtd.assertValid(root)
+
+ seen = []
+ for el in dtd.iterelements():
+ if el.name == 'a':
+ self.assertEqual(2, len(el.attributes()))
+ for attr in el.iterattributes():
+ if attr.name == 'attr1':
+ self.assertEqual('enumeration', attr.type)
+ self.assertEqual('none', attr.default)
+ self.assertEqual('z', attr.default_value)
+ values = attr.values()
+ values.sort()
+ self.assertEqual(['x', 'y', 'z'], values)
+ else:
+ self.assertEqual('attr2', attr.name)
+ self.assertEqual('cdata', attr.type)
+ self.assertEqual('fixed', attr.default)
+ self.assertEqual('X', attr.default_value)
+ else:
+ self.assertEqual('b', el.name)
+ self.assertEqual(0, len(el.attributes()))
+ seen.append(el.name)
+ seen.sort()
+ self.assertEqual(['a', 'b'], seen)
+ self.assertEqual(2, len(dtd.elements()))
+
+ def test_internal_dtds(self):
+ for el_count in range(2, 5):
+ for attr_count in range(4):
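+ # Build an internal DTD that declares el_count element types, each with
+ # attr_count enumerated attributes, then validate a matching document.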
+ root = etree.XML(_bytes('''
+ <!DOCTYPE el0 SYSTEM "none" [
+ ''' + ''.join(['''
+ <!ATTLIST el%d
+ attr%d (x | y | z) "z"
+ >
+ ''' % (e, a) for a in range(attr_count) for e in range(el_count)
+ ]) + ''.join(['''
+ <!ELEMENT el%d EMPTY>
+ ''' % e for e in range(1, el_count)
+ ]) + '''
+ ''' + '<!ELEMENT el0 (%s)>' % '|'.join([
+ 'el%d' % e for e in range(1, el_count)]) + '''
+ ]>
+ <el0><el1 %s /></el0>
+ ''' % ' '.join(['attr%d="x"' % a for a in range(attr_count)])))
+ dtd = etree.ElementTree(root).docinfo.internalDTD
+ self.assertTrue(dtd)
+ dtd.assertValid(root)
+
+ e = -1
+ for e, el in enumerate(dtd.iterelements()):
+ self.assertEqual(attr_count, len(el.attributes()))
+ a = -1
+ for a, attr in enumerate(el.iterattributes()):
+ self.assertEqual('enumeration', attr.type)
+ self.assertEqual('none', attr.default)
+ self.assertEqual('z', attr.default_value)
+ values = sorted(attr.values())
+ self.assertEqual(['x', 'y', 'z'], values)
+ self.assertEqual(attr_count - 1, a)
+ self.assertEqual(el_count - 1, e)
+ self.assertEqual(el_count, len(dtd.elements()))
+
+ def test_dtd_broken(self):
+ self.assertRaises(etree.DTDParseError, etree.DTD,
+ BytesIO("<!ELEMENT b HONKEY>"))
+
+ def test_parse_file_dtd(self):
+ parser = etree.XMLParser(attribute_defaults=True)
+
+ tree = etree.parse(fileInTestDir('test.xml'), parser)
+ root = tree.getroot()
+
+ self.assertEqual(
+ "valueA",
+ root.get("default"))
+ self.assertEqual(
+ "valueB",
+ root[0].get("default"))
+
+ @skipIf(etree.LIBXML_VERSION == (2, 9, 0),
+ "DTD loading is broken for incremental parsing in libxml2 2.9.0")
+ def test_iterparse_file_dtd_start(self):
+ iterparse = etree.iterparse
+ iterator = iterparse(fileInTestDir("test.xml"), events=('start',),
+ attribute_defaults=True)
+ attributes = [ element.get("default")
+ for event, element in iterator ]
+ self.assertEqual(
+ ["valueA", "valueB"],
+ attributes)
+
+ @skipIf(etree.LIBXML_VERSION == (2, 9, 0),
+ "DTD loading is broken for incremental parsing in libxml2 2.9.0")
+ def test_iterparse_file_dtd_end(self):
+ iterparse = etree.iterparse
+ iterator = iterparse(fileInTestDir("test.xml"), events=('end',),
+ attribute_defaults=True)
+ attributes = [ element.get("default")
+ for event, element in iterator ]
+ self.assertEqual(
+ ["valueB", "valueA"],
+ attributes)
+
+ def test_dtd_attrs(self):
+ dtd = etree.DTD(fileUrlInTestDir("test.dtd"))
+
+ # Test DTD.system_url attribute
+ self.assertTrue(dtd.system_url.endswith("test.dtd"))
+
+ # Test elements and their attributes
+ a = dtd.elements()[0]
+ self.assertEqual(a.name, "a")
+ self.assertEqual(a.type, "element")
+ self.assertEqual(a.content.name, "b")
+ self.assertEqual(a.content.type, "element")
+ self.assertEqual(a.content.occur, "once")
+
+ aattr = a.attributes()[0]
+ self.assertEqual(aattr.name, "default")
+ self.assertEqual(aattr.type, "enumeration")
+ self.assertEqual(aattr.values(), ["valueA", "valueB"])
+ self.assertEqual(aattr.default_value, "valueA")
+
+ b = dtd.elements()[1]
+ self.assertEqual(b.name, "b")
+ self.assertEqual(b.type, "empty")
+ self.assertEqual(b.content, None)
+
+ # Test entities and their attributes
+ c = dtd.entities()[0]
+ self.assertEqual(c.name, "c")
+ self.assertEqual(c.orig, "&#42;")
+ self.assertEqual(c.content, "*")
+
+ # Test DTD.name attribute
+ root = etree.XML(_bytes('''
+ <!DOCTYPE a SYSTEM "none" [
+ <!ELEMENT a EMPTY>
+ ]>
+ <a/>
+ '''))
+ dtd = etree.ElementTree(root).docinfo.internalDTD
+ self.assertEqual(dtd.name, "a")
+
+ # Test DTD.name and DTD.systemID attributes
+ parser = etree.XMLParser(dtd_validation=True)
+ xml = '<!DOCTYPE a SYSTEM "test.dtd"><a><b/></a>'
+ root = etree.fromstring(xml, parser=parser,
+ base_url=fileUrlInTestDir("test.xml"))
+
+ dtd = root.getroottree().docinfo.internalDTD
+ self.assertEqual(dtd.name, "a")
+ self.assertEqual(dtd.system_url, "test.dtd")
+
+ def test_declaration_escape_quote_pid(self):
+ # The standard allows double quotes inside a system literal, but such a
+ # literal must then be delimited by single quotes instead.
+ # See http://www.w3.org/TR/REC-xml/#sec-prolog-dtd.
+ root = etree.XML('''<!DOCTYPE a PUBLIC 'foo' '"'><a/>''')
+ doc = root.getroottree()
+ self.assertEqual(doc.docinfo.doctype,
+ '''<!DOCTYPE a PUBLIC "foo" '"'>''')
+ self.assertEqual(etree.tostring(doc),
+ _bytes('''<!DOCTYPE a PUBLIC "foo" '"'>\n<a/>'''))
+
+ def test_declaration_quote_withoutpid(self):
+ root = etree.XML('''<!DOCTYPE a SYSTEM '"'><a/>''')
+ doc = root.getroottree()
+ self.assertEqual(doc.docinfo.doctype, '''<!DOCTYPE a SYSTEM '"'>''')
+ self.assertEqual(etree.tostring(doc),
+ _bytes('''<!DOCTYPE a SYSTEM '"'>\n<a/>'''))
+
+ def test_declaration_apos(self):
+ root = etree.XML('''<!DOCTYPE a SYSTEM "'"><a/>''')
+ doc = root.getroottree()
+ self.assertEqual(doc.docinfo.doctype, '''<!DOCTYPE a SYSTEM "'">''')
+ self.assertEqual(etree.tostring(doc),
+ _bytes('''<!DOCTYPE a SYSTEM "'">\n<a/>'''))
+
+ def test_ietf_decl(self):
+ html_data = (
+ '<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">\n'
+ '<html></html>')
+ root = etree.HTML(html_data)
+ doc = root.getroottree()
+ self.assertEqual(doc.docinfo.doctype,
+ '<!DOCTYPE html PUBLIC "-//IETF//DTD HTML//EN">')
+ self.assertEqual(etree.tostring(doc, method='html'), _bytes(html_data))
+
+ def test_set_decl_public(self):
+ doc = etree.Element('test').getroottree()
+ doc.docinfo.public_id = 'bar'
+ doc.docinfo.system_url = 'baz'
+ self.assertEqual(doc.docinfo.doctype,
+ '<!DOCTYPE test PUBLIC "bar" "baz">')
+ self.assertEqual(etree.tostring(doc),
+ _bytes('<!DOCTYPE test PUBLIC "bar" "baz">\n<test/>'))
+
+ def test_html_decl(self):
+ # Slightly different from the test above: when we create an html element,
+ # we do not start with a blank slate.
+ doc = html.Element('html').getroottree()
+ doc.docinfo.public_id = 'bar'
+ doc.docinfo.system_url = 'baz'
+ self.assertEqual(doc.docinfo.doctype,
+ '<!DOCTYPE html PUBLIC "bar" "baz">')
+ self.assertEqual(etree.tostring(doc),
+ _bytes('<!DOCTYPE html PUBLIC "bar" "baz">\n<html/>'))
+
+ def test_clean_doctype(self):
+ doc = html.Element('html').getroottree()
+ self.assertTrue(doc.docinfo.doctype != '')
+ doc.docinfo.clear()
+ self.assertTrue(doc.docinfo.doctype == '')
+
+ def test_set_decl_system(self):
+ doc = etree.Element('test').getroottree()
+ doc.docinfo.system_url = 'baz'
+ self.assertEqual(doc.docinfo.doctype,
+ '<!DOCTYPE test SYSTEM "baz">')
+ self.assertEqual(etree.tostring(doc),
+ _bytes('<!DOCTYPE test SYSTEM "baz">\n<test/>'))
+
+ def test_empty_decl(self):
+ doc = etree.Element('test').getroottree()
+ doc.docinfo.public_id = None
+ self.assertEqual(doc.docinfo.doctype,
+ '<!DOCTYPE test>')
+ self.assertTrue(doc.docinfo.public_id is None)
+ self.assertTrue(doc.docinfo.system_url is None)
+ self.assertEqual(etree.tostring(doc),
+ _bytes('<!DOCTYPE test>\n<test/>'))
+
+ def test_invalid_decl_1(self):
+ docinfo = etree.Element('test').getroottree().docinfo
+
+ def set_public_id(value):
+ docinfo.public_id = value
+ self.assertRaises(ValueError, set_public_id, _str('ä'))
+ self.assertRaises(ValueError, set_public_id, _str('qwerty ä asdf'))
+
+ def test_invalid_decl_2(self):
+ docinfo = etree.Element('test').getroottree().docinfo
+
+ def set_system_url(value):
+ docinfo.system_url = value
+ self.assertRaises(ValueError, set_system_url, '\'"')
+ self.assertRaises(ValueError, set_system_url, '"\'')
+ self.assertRaises(ValueError, set_system_url, ' " \' ')
+
+ def test_comment_before_dtd(self):
+ data = '<!--comment--><!DOCTYPE test>\n<!-- --><test/>'
+ doc = etree.fromstring(data).getroottree()
+ self.assertEqual(etree.tostring(doc),
+ _bytes(data))
+
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([unittest.makeSuite(ETreeDtdTestCase)])
+ suite.addTests(
+ [make_doctest('../../../doc/validation.txt')])
+ return suite
+
+if __name__ == '__main__':
+ print('to test use test.py %s' % __file__)
diff --git a/src/lxml/tests/test_elementpath.py b/src/lxml/tests/test_elementpath.py
new file mode 100644
index 0000000..1793ff8
--- /dev/null
+++ b/src/lxml/tests/test_elementpath.py
@@ -0,0 +1,302 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests for the ElementPath implementation.
+"""
+
+from __future__ import absolute_import
+
+import unittest
+from copy import deepcopy
+from .common_imports import etree, HelperTestCase
+
+
+def summarize(elem):
+ return elem.tag
+
+def summarize_list(seq):
+ return list(map(summarize, seq))
+
+def normalize_crlf(tree):
+ for elem in tree.getiterator():
+ if elem.text: elem.text = elem.text.replace("\r\n", "\n")
+ if elem.tail: elem.tail = elem.tail.replace("\r\n", "\n")
+
+
+class EtreeElementPathTestCase(HelperTestCase):
+ etree = etree
+ from lxml import _elementpath
+
+ def test_cache(self):
+ self._elementpath._cache.clear()
+ el = self.etree.XML(b'<a><b><c/><c/></b></a>')
+ self.assertFalse(self._elementpath._cache)
+
+ self.assertTrue(el.findall('b/c'))
+ self.assertEqual(1, len(self._elementpath._cache))
+ self.assertTrue(el.findall('b/c'))
+ self.assertEqual(1, len(self._elementpath._cache))
+ self.assertFalse(el.findall('xxx'))
+ self.assertEqual(2, len(self._elementpath._cache))
+ self.assertFalse(el.findall('xxx'))
+ self.assertEqual(2, len(self._elementpath._cache))
+ self.assertTrue(el.findall('b/c'))
+ self.assertEqual(2, len(self._elementpath._cache))
+
+ def _assert_tokens(self, tokens, path, namespaces=None):
+ self.assertEqual(tokens, list(self._elementpath.xpath_tokenizer(path, namespaces)))
+
+ def test_tokenizer(self):
+ assert_tokens = self._assert_tokens
+ assert_tokens(
+ [('/', '')],
+ '/',
+ )
+ assert_tokens(
+ [('.', ''), ('/', ''), ('', 'a'), ('/', ''), ('', 'b'), ('/', ''), ('', 'c')],
+ './a/b/c',
+ )
+ assert_tokens(
+ [('/', ''), ('', 'a'), ('/', ''), ('', 'b'), ('/', ''), ('', 'c')],
+ '/a/b/c',
+ )
+ assert_tokens(
+ [('/', ''), ('', '{nsx}a'), ('/', ''), ('', '{nsy}b'), ('/', ''), ('', 'c')],
+ '/x:a/y:b/c',
+ {'x': 'nsx', 'y': 'nsy'},
+ )
+ assert_tokens(
+ [('/', ''), ('', '{nsx}a'), ('/', ''), ('', '{nsy}b'), ('/', ''), ('', '{nsnone}c')],
+ '/x:a/y:b/c',
+ {'x': 'nsx', 'y': 'nsy', None: 'nsnone'},
+ )
+
+ def test_tokenizer_predicates(self):
+ assert_tokens = self._assert_tokens
+ assert_tokens(
+ [('', 'a'), ('[', ''), ('', 'b'), (']', '')],
+ 'a[b]',
+ )
+ assert_tokens(
+ [('', 'a'), ('[', ''), ('', 'b'), ('=', ''), ('"abc"', ''), (']', '')],
+ 'a[b="abc"]',
+ )
+ assert_tokens(
+ [('', 'a'), ('[', ''), ('.', ''), ('', ''), ('=', ''), ('', ''), ('"abc"', ''), (']', '')],
+ 'a[. = "abc"]',
+ )
+
+ def test_xpath_tokenizer(self):
+ # Test the XPath tokenizer. Copied from CPython's "test_xml_etree.py"
+ ElementPath = self._elementpath
+
+ def check(p, expected, namespaces=None):
+ self.assertEqual([op or tag
+ for op, tag in ElementPath.xpath_tokenizer(p, namespaces)],
+ expected)
+
+ # tests from the xml specification
+ check("*", ['*'])
+ check("text()", ['text', '()'])
+ check("@name", ['@', 'name'])
+ check("@*", ['@', '*'])
+ check("para[1]", ['para', '[', '1', ']'])
+ check("para[last()]", ['para', '[', 'last', '()', ']'])
+ check("*/para", ['*', '/', 'para'])
+ check("/doc/chapter[5]/section[2]",
+ ['/', 'doc', '/', 'chapter', '[', '5', ']',
+ '/', 'section', '[', '2', ']'])
+ check("chapter//para", ['chapter', '//', 'para'])
+ check("//para", ['//', 'para'])
+ check("//olist/item", ['//', 'olist', '/', 'item'])
+ check(".", ['.'])
+ check(".//para", ['.', '//', 'para'])
+ check("..", ['..'])
+ check("../@lang", ['..', '/', '@', 'lang'])
+ check("chapter[title]", ['chapter', '[', 'title', ']'])
+ check("employee[@secretary and @assistant]", ['employee',
+ '[', '@', 'secretary', '', 'and', '', '@', 'assistant', ']'])
+
+ # additional tests
+ check("@{ns}attr", ['@', '{ns}attr'])
+ check("{http://spam}egg", ['{http://spam}egg'])
+ check("./spam.egg", ['.', '/', 'spam.egg'])
+ check(".//{http://spam}egg", ['.', '//', '{http://spam}egg'])
+
+ # wildcard tags
+ check("{ns}*", ['{ns}*'])
+ check("{}*", ['{}*'])
+ check("{*}tag", ['{*}tag'])
+ check("{*}*", ['{*}*'])
+ check(".//{*}tag", ['.', '//', '{*}tag'])
+
+ # namespace prefix resolution
+ check("./xsd:type", ['.', '/', '{http://www.w3.org/2001/XMLSchema}type'],
+ {'xsd': 'http://www.w3.org/2001/XMLSchema'})
+ check("type", ['{http://www.w3.org/2001/XMLSchema}type'],
+ {'': 'http://www.w3.org/2001/XMLSchema'})
+ check("@xsd:type", ['@', '{http://www.w3.org/2001/XMLSchema}type'],
+ {'xsd': 'http://www.w3.org/2001/XMLSchema'})
+ check("@type", ['@', 'type'],
+ {'': 'http://www.w3.org/2001/XMLSchema'})
+ check("@{*}type", ['@', '{*}type'],
+ {'': 'http://www.w3.org/2001/XMLSchema'})
+ check("@{ns}attr", ['@', '{ns}attr'],
+ {'': 'http://www.w3.org/2001/XMLSchema',
+ 'ns': 'http://www.w3.org/2001/XMLSchema'})
+
+ def test_find(self):
+ """
+ Test find methods (including xpath syntax).
+ Originally copied from 'selftest.py'.
+ """
+ elem = etree.XML("""
+ <body>
+ <tag class='a'>text</tag>
+ <tag class='b' />
+ <section>
+ <tag class='b' id='inner'>subtext</tag>
+ </section>
+ </body>
+ """)
+
+ self.assertEqual(elem.find("tag").tag,
+ 'tag')
+ self.assertEqual(etree.ElementTree(elem).find("tag").tag,
+ 'tag')
+ self.assertEqual(elem.find("section/tag").tag,
+ 'tag')
+ self.assertEqual(etree.ElementTree(elem).find("section/tag").tag,
+ 'tag')
+
+ self.assertEqual(elem.findtext("tag"),
+ 'text')
+ self.assertEqual(elem.findtext("tog"),
+ None)
+ self.assertEqual(elem.findtext("tog", "default"),
+ 'default')
+ self.assertEqual(etree.ElementTree(elem).findtext("tag"),
+ 'text')
+ self.assertEqual(elem.findtext("section/tag"),
+ 'subtext')
+ self.assertEqual(etree.ElementTree(elem).findtext("section/tag"),
+ 'subtext')
+
+ self.assertEqual(summarize_list(elem.findall("tag")),
+ ['tag', 'tag'])
+ self.assertEqual(summarize_list(elem.findall("*")),
+ ['tag', 'tag', 'section'])
+ self.assertEqual(summarize_list(elem.findall(".//tag")),
+ ['tag', 'tag', 'tag'])
+ self.assertEqual(summarize_list(elem.findall("section/tag")),
+ ['tag'])
+ self.assertEqual(summarize_list(elem.findall("section//tag")),
+ ['tag'])
+
+ self.assertEqual(summarize_list(elem.findall("section/*")),
+ ['tag'])
+ self.assertEqual(summarize_list(elem.findall("section//*")),
+ ['tag'])
+ self.assertEqual(summarize_list(elem.findall("section/.//*")),
+ ['tag'])
+ self.assertEqual(summarize_list(elem.findall("*/*")),
+ ['tag'])
+ self.assertEqual(summarize_list(elem.findall("*//*")),
+ ['tag'])
+ self.assertEqual(summarize_list(elem.findall("*/tag")),
+ ['tag'])
+ self.assertEqual(summarize_list(elem.findall("*/./tag")),
+ ['tag'])
+ self.assertEqual(summarize_list(elem.findall("./tag")),
+ ['tag', 'tag'])
+ self.assertEqual(summarize_list(elem.findall(".//tag")),
+ ['tag', 'tag', 'tag'])
+ self.assertEqual(summarize_list(elem.findall("././tag")),
+ ['tag', 'tag'])
+
+ self.assertEqual(summarize_list(elem.findall(".//tag[@class]")),
+ ['tag', 'tag', 'tag'])
+ self.assertEqual(summarize_list(elem.findall(".//tag[ @class]")),
+ ['tag', 'tag', 'tag'])
+ self.assertEqual(summarize_list(elem.findall(".//tag[@class ]")),
+ ['tag', 'tag', 'tag'])
+ self.assertEqual(summarize_list(elem.findall(".//tag[ @class ]")),
+ ['tag', 'tag', 'tag'])
+ self.assertEqual(summarize_list(elem.findall(".//tag[@class='a']")),
+ ['tag'])
+ self.assertEqual(summarize_list(elem.findall('.//tag[@class="a"]')),
+ ['tag'])
+ self.assertEqual(summarize_list(elem.findall(".//tag[@class='b']")),
+ ['tag', 'tag'])
+ self.assertEqual(summarize_list(elem.findall('.//tag[@class="b"]')),
+ ['tag', 'tag'])
+ self.assertEqual(summarize_list(elem.findall('.//tag[@class = "b"]')),
+ ['tag', 'tag'])
+ self.assertEqual(summarize_list(elem.findall(".//tag[@id]")),
+ ['tag'])
+ self.assertEqual(summarize_list(elem.findall(".//tag[@class][@id]")),
+ ['tag'])
+ self.assertEqual(summarize_list(elem.findall(".//section[tag]")),
+ ['section'])
+ self.assertEqual(summarize_list(elem.findall(".//section[element]")),
+ [])
+
+ self.assertEqual(summarize_list(elem.findall(".//section[tag='subtext']")),
+ ['section'])
+ self.assertEqual(summarize_list(elem.findall(".//section[tag ='subtext']")),
+ ['section'])
+ self.assertEqual(summarize_list(elem.findall(".//section[tag= 'subtext']")),
+ ['section'])
+ self.assertEqual(summarize_list(elem.findall(".//section[tag = 'subtext']")),
+ ['section'])
+ self.assertEqual(summarize_list(elem.findall(".//section[ tag = 'subtext' ]")),
+ ['section'])
+ self.assertEqual(summarize_list(elem.findall(".//tag[.='subtext']")),
+ ['tag'])
+ self.assertEqual(summarize_list(elem.findall(".//tag[. ='subtext']")),
+ ['tag'])
+ self.assertEqual(summarize_list(elem.findall('.//tag[.= "subtext"]')),
+ ['tag'])
+ self.assertEqual(summarize_list(elem.findall(".//tag[. = 'subtext']")),
+ ['tag'])
+ self.assertEqual(summarize_list(elem.findall(".//tag[. = 'subtext ']")),
+ [])
+ self.assertEqual(summarize_list(elem.findall(".//tag[.= ' subtext']")),
+ [])
+
+ self.assertEqual(summarize_list(elem.findall("../tag")),
+ [])
+ self.assertEqual(summarize_list(elem.findall("section/../tag")),
+ ['tag', 'tag'])
+ self.assertEqual(summarize_list(etree.ElementTree(elem).findall("./tag")),
+ ['tag', 'tag'])
+
+ # FIXME: ET's Path module handles this case incorrectly; this gives
+ # a warning in 1.3, and the behaviour will be modified in 1.4.
+ self.assertEqual(summarize_list(etree.ElementTree(elem).findall("/tag")),
+ ['tag', 'tag'])
+
+ # duplicate section => 2x tag matches
+ elem[1] = deepcopy(elem[2])
+ self.assertEqual(summarize_list(elem.findall(".//section[tag = 'subtext']")),
+ ['section', 'section'])
+ self.assertEqual(summarize_list(elem.findall(".//tag[. = 'subtext']")),
+ ['tag', 'tag'])
+ self.assertEqual(summarize_list(elem.findall(".//tag[@class][@id]")),
+ ['tag', 'tag'])
+
+
+#class ElementTreeElementPathTestCase(EtreeElementPathTestCase):
+# import xml.etree.ElementTree as etree
+# import xml.etree.ElementPath as _elementpath
+
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([unittest.makeSuite(EtreeElementPathTestCase)])
+ #suite.addTests([unittest.makeSuite(ElementTreeElementPathTestCase)])
+ return suite
+
+
+if __name__ == '__main__':
+ print('to test use test.py %s' % __file__)
diff --git a/src/lxml/tests/test_elementtree.py b/src/lxml/tests/test_elementtree.py
new file mode 100644
index 0000000..96b043d
--- /dev/null
+++ b/src/lxml/tests/test_elementtree.py
@@ -0,0 +1,4965 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests for the ElementTree API
+
+Only test cases that apply equally well to etree and ElementTree
+belong here. Note that there is a second test module called test_io.py
+for IO-related test cases.
+"""
+
+from __future__ import absolute_import
+
+import copy
+import io
+import operator
+import os
+import re
+import sys
+import textwrap
+import unittest
+from contextlib import contextmanager
+from functools import wraps, partial
+from itertools import islice
+
+from .common_imports import (
+ BytesIO, etree, HelperTestCase,
+ ElementTree, cElementTree, ET_VERSION, CET_VERSION,
+ filter_by_version, fileInTestDir, canonicalize, tmpfile,
+ _str, _bytes, unicode, IS_PYTHON2
+)
+
+if cElementTree is not None and (CET_VERSION <= (1,0,7) or sys.version_info[0] >= 3):
+ cElementTree = None
+
+if ElementTree is not None:
+ print("Comparing with ElementTree %s" % getattr(ElementTree, "VERSION", "?"))
+
+if cElementTree is not None:
+ print("Comparing with cElementTree %s" % getattr(cElementTree, "VERSION", "?"))
+
+
+def et_needs_pyversion(*version):
+ def wrap(method):
+ @wraps(method)
+ def testfunc(self, *args):
+ if self.etree is not etree and sys.version_info < version:
+ raise unittest.SkipTest("requires ET in Python %s" % '.'.join(map(str, version)))
+ return method(self, *args)
+ return testfunc
+ return wrap
+
+
+def et_exclude_pyversion(*version):
+ def wrap(method):
+ @wraps(method)
+ def testfunc(self, *args):
+ if self.etree is not etree and sys.version_info[:len(version)] == version:
+ raise unittest.SkipTest("requires ET in Python %s" % '.'.join(map(str, version)))
+ return method(self, *args)
+ return testfunc
+ return wrap
+
+
+class _ETreeTestCaseBase(HelperTestCase):
+ etree = None
+ required_versions_ET = {}
+ required_versions_cET = {}
+
+ def XMLParser(self, **kwargs):
+ try:
+ XMLParser = self.etree.XMLParser
+ except AttributeError:
+ assert 'ElementTree' in self.etree.__name__
+ XMLParser = self.etree.TreeBuilder
+ return XMLParser(**kwargs)
+
+ try:
+ HelperTestCase.assertRegex
+ except AttributeError:
+ def assertRegex(self, *args, **kwargs):
+ return self.assertRegexpMatches(*args, **kwargs)
+
+ @et_needs_pyversion(3, 6)
+ def test_interface(self):
+ # Test element tree interface.
+
+ def check_string(string):
+ len(string)
+ for char in string:
+ self.assertEqual(len(char), 1,
+ msg="expected one-character string, got %r" % char)
+ new_string = string + ""
+ new_string = string + " "
+ string[:0]
+
+ def check_mapping(mapping):
+ len(mapping)
+ keys = mapping.keys()
+ items = mapping.items()
+ for key in keys:
+ item = mapping[key]
+ mapping["key"] = "value"
+ self.assertEqual(mapping["key"], "value",
+ msg="expected value string, got %r" % mapping["key"])
+
+ def check_element(element):
+ self.assertTrue(self.etree.iselement(element), msg="not an element")
+ direlem = dir(element)
+ for attr in 'tag', 'attrib', 'text', 'tail':
+ self.assertTrue(hasattr(element, attr),
+ msg='no %s member' % attr)
+ self.assertIn(attr, direlem,
+ msg='no %s visible by dir' % attr)
+
+ check_string(element.tag)
+ check_mapping(element.attrib)
+ if element.text is not None:
+ check_string(element.text)
+ if element.tail is not None:
+ check_string(element.tail)
+ for elem in element:
+ check_element(elem)
+
+ element = self.etree.Element("tag")
+ check_element(element)
+ tree = self.etree.ElementTree(element)
+ check_element(tree.getroot())
+ element = self.etree.Element(u"t\xe4g", key="value")
+ tree = self.etree.ElementTree(element)
+ # lxml and ET Py2: slightly different repr()
+ #self.assertRegex(repr(element), r"^<Element 't\xe4g' at 0x.*>$")
+ element = self.etree.Element("tag", key="value")
+
+ # Make sure all standard element methods exist.
+
+ def check_method(method):
+ self.assertTrue(hasattr(method, '__call__'),
+ msg="%s not callable" % method)
+
+ check_method(element.append)
+ check_method(element.extend)
+ check_method(element.insert)
+ check_method(element.remove)
+ # Removed in Py3.9
+ #check_method(element.getchildren)
+ check_method(element.find)
+ check_method(element.iterfind)
+ check_method(element.findall)
+ check_method(element.findtext)
+ check_method(element.clear)
+ check_method(element.get)
+ check_method(element.set)
+ check_method(element.keys)
+ check_method(element.items)
+ check_method(element.iter)
+ check_method(element.itertext)
+ # Removed in Py3.9
+ #check_method(element.getiterator)
+
+ # These methods return an iterable. See bug 6472.
+
+ def check_iter(it):
+ check_method(it.next if IS_PYTHON2 else it.__next__)
+
+ check_iter(element.iterfind("tag"))
+ check_iter(element.iterfind("*"))
+ check_iter(tree.iterfind("tag"))
+ check_iter(tree.iterfind("*"))
+
+ # These aliases are provided:
+
+ # not an alias in lxml
+ #self.assertEqual(self.etree.XML, self.etree.fromstring)
+ self.assertEqual(self.etree.PI, self.etree.ProcessingInstruction)
+
+ def test_element(self):
+ for i in range(10):
+ e = self.etree.Element('foo')
+ self.assertEqual(e.tag, 'foo')
+ self.assertEqual(e.text, None)
+ self.assertEqual(e.tail, None)
+
+ def test_simple(self):
+ Element = self.etree.Element
+
+ root = Element('root')
+ root.append(Element('one'))
+ root.append(Element('two'))
+ root.append(Element('three'))
+ self.assertEqual(3, len(root))
+ self.assertEqual('one', root[0].tag)
+ self.assertEqual('two', root[1].tag)
+ self.assertEqual('three', root[2].tag)
+ self.assertRaises(IndexError, operator.getitem, root, 3)
+
+ # test a weird dictionary interaction that previously led to a segfault
+ def test_weird_dict_interaction(self):
+ root = self.etree.Element('root')
+ self.assertEqual(root.tag, "root")
+ add = self.etree.ElementTree(file=BytesIO('<foo>Foo</foo>'))
+ self.assertEqual(add.getroot().tag, "foo")
+ self.assertEqual(add.getroot().text, "Foo")
+ root.append(self.etree.Element('baz'))
+ self.assertEqual(root.tag, "root")
+ self.assertEqual(root[0].tag, "baz")
+
+ def test_subelement(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ root = Element('root')
+ SubElement(root, 'one')
+ SubElement(root, 'two')
+ SubElement(root, 'three')
+ self.assertEqual(3, len(root))
+ self.assertEqual('one', root[0].tag)
+ self.assertEqual('two', root[1].tag)
+ self.assertEqual('three', root[2].tag)
+
+ def test_element_contains(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ root1 = Element('root')
+ SubElement(root1, 'one')
+ self.assertTrue(root1[0] in root1)
+
+ root2 = Element('root')
+ SubElement(root2, 'two')
+ SubElement(root2, 'three')
+ self.assertTrue(root2[0] in root2)
+ self.assertTrue(root2[1] in root2)
+
+ self.assertFalse(root1[0] in root2)
+ self.assertFalse(root2[0] in root1)
+ self.assertFalse(None in root2)
+
+ def test_element_indexing_with_text(self):
+ ElementTree = self.etree.ElementTree
+
+ f = BytesIO('<doc>Test<one>One</one></doc>')
+ doc = ElementTree(file=f)
+ root = doc.getroot()
+ self.assertEqual(1, len(root))
+ self.assertEqual('one', root[0].tag)
+ self.assertRaises(IndexError, operator.getitem, root, 1)
+
+ def test_element_indexing_with_text2(self):
+ ElementTree = self.etree.ElementTree
+
+ f = BytesIO('<doc><one>One</one><two>Two</two>hm<three>Three</three></doc>')
+ doc = ElementTree(file=f)
+ root = doc.getroot()
+ self.assertEqual(3, len(root))
+ self.assertEqual('one', root[0].tag)
+ self.assertEqual('two', root[1].tag)
+ self.assertEqual('three', root[2].tag)
+
+ def test_element_indexing_only_text(self):
+ ElementTree = self.etree.ElementTree
+
+ f = BytesIO('<doc>Test</doc>')
+ doc = ElementTree(file=f)
+ root = doc.getroot()
+ self.assertEqual(0, len(root))
+
+ def test_element_indexing_negative(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(a, 'd')
+ self.assertEqual(d, a[-1])
+ self.assertEqual(c, a[-2])
+ self.assertEqual(b, a[-3])
+ self.assertRaises(IndexError, operator.getitem, a, -4)
+ a[-1] = e = Element('e')
+ self.assertEqual(e, a[-1])
+ del a[-1]
+ self.assertEqual(2, len(a))
+
+ def test_elementtree(self):
+ ElementTree = self.etree.ElementTree
+
+ f = BytesIO('<doc><one>One</one><two>Two</two></doc>')
+ doc = ElementTree(file=f)
+ root = doc.getroot()
+ self.assertEqual(2, len(root))
+ self.assertEqual('one', root[0].tag)
+ self.assertEqual('two', root[1].tag)
+
+ def test_text(self):
+ ElementTree = self.etree.ElementTree
+
+ f = BytesIO('<doc>This is a text</doc>')
+ doc = ElementTree(file=f)
+ root = doc.getroot()
+ self.assertEqual('This is a text', root.text)
+
+ def test_text_empty(self):
+ ElementTree = self.etree.ElementTree
+
+ f = BytesIO('<doc></doc>')
+ doc = ElementTree(file=f)
+ root = doc.getroot()
+ self.assertEqual(None, root.text)
+
+ def test_text_other(self):
+ ElementTree = self.etree.ElementTree
+
+ f = BytesIO('<doc><one>One</one></doc>')
+ doc = ElementTree(file=f)
+ root = doc.getroot()
+ self.assertEqual(None, root.text)
+ self.assertEqual('One', root[0].text)
+
+ def test_text_escape_in(self):
+ ElementTree = self.etree.ElementTree
+
+ f = BytesIO('<doc>This is &gt; than a text</doc>')
+ doc = ElementTree(file=f)
+ root = doc.getroot()
+ self.assertEqual('This is > than a text', root.text)
+
+ def test_text_escape_out(self):
+ Element = self.etree.Element
+
+ a = Element("a")
+ a.text = "<>&"
+ self.assertXML(_bytes('<a>&lt;&gt;&amp;</a>'),
+ a)
+
+ def test_text_escape_tostring(self):
+ tostring = self.etree.tostring
+ Element = self.etree.Element
+
+ a = Element("a")
+ a.text = "<>&"
+ self.assertEqual(_bytes('<a>&lt;&gt;&amp;</a>'),
+ tostring(a))
+
+ def test_text_str_subclass(self):
+ Element = self.etree.Element
+
+ class strTest(str):
+ pass
+
+ a = Element("a")
+ a.text = strTest("text")
+ self.assertXML(_bytes('<a>text</a>'),
+ a)
+
+ def test_tail(self):
+ ElementTree = self.etree.ElementTree
+
+ f = BytesIO('<doc>This is <i>mixed</i> content.</doc>')
+ doc = ElementTree(file=f)
+ root = doc.getroot()
+ self.assertEqual(1, len(root))
+ self.assertEqual('This is ', root.text)
+ self.assertEqual(None, root.tail)
+ self.assertEqual('mixed', root[0].text)
+ self.assertEqual(' content.', root[0].tail)
+
+ def test_tail_str_subclass(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ class strTest(str):
+ pass
+
+ a = Element("a")
+ SubElement(a, "t").tail = strTest("tail")
+ self.assertXML(_bytes('<a><t></t>tail</a>'),
+ a)
+
+ def _test_del_tail(self):
+ # this is discouraged for ET compat, should not be tested...
+ XML = self.etree.XML
+
+ root = XML(_bytes('<doc>This is <i>mixed</i> content.</doc>'))
+ self.assertEqual(1, len(root))
+ self.assertEqual('This is ', root.text)
+ self.assertEqual(None, root.tail)
+ self.assertEqual('mixed', root[0].text)
+ self.assertEqual(' content.', root[0].tail)
+
+ del root[0].tail
+
+ self.assertEqual(1, len(root))
+ self.assertEqual('This is ', root.text)
+ self.assertEqual(None, root.tail)
+ self.assertEqual('mixed', root[0].text)
+ self.assertEqual(None, root[0].tail)
+
+ root[0].tail = "TAIL"
+
+ self.assertEqual(1, len(root))
+ self.assertEqual('This is ', root.text)
+ self.assertEqual(None, root.tail)
+ self.assertEqual('mixed', root[0].text)
+ self.assertEqual('TAIL', root[0].tail)
+
+ def test_ElementTree(self):
+ Element = self.etree.Element
+ ElementTree = self.etree.ElementTree
+
+ el = Element('hoi')
+ doc = ElementTree(el)
+ root = doc.getroot()
+ self.assertEqual(None, root.text)
+ self.assertEqual('hoi', root.tag)
+
+ def test_attrib(self):
+ ElementTree = self.etree.ElementTree
+
+ f = BytesIO('<doc one="One" two="Two"/>')
+ doc = ElementTree(file=f)
+ root = doc.getroot()
+ self.assertEqual('One', root.attrib['one'])
+ self.assertEqual('Two', root.attrib['two'])
+ self.assertRaises(KeyError, operator.getitem, root.attrib, 'three')
+
+ def test_attrib_get(self):
+ ElementTree = self.etree.ElementTree
+
+ f = BytesIO('<doc one="One" two="Two"/>')
+ doc = ElementTree(file=f)
+ root = doc.getroot()
+ self.assertEqual('One', root.attrib.get('one'))
+ self.assertEqual('Two', root.attrib.get('two'))
+ self.assertEqual(None, root.attrib.get('three'))
+ self.assertEqual('foo', root.attrib.get('three', 'foo'))
+
+ def test_attrib_dict(self):
+ ElementTree = self.etree.ElementTree
+
+ f = BytesIO('<doc one="One" two="Two"/>')
+ doc = ElementTree(file=f)
+ root = doc.getroot()
+ attrib = dict(root.attrib)
+ self.assertEqual('One', attrib['one'])
+ self.assertEqual('Two', attrib['two'])
+ self.assertRaises(KeyError, operator.getitem, attrib, 'three')
+
+ def test_attrib_copy(self):
+ ElementTree = self.etree.ElementTree
+
+ f = BytesIO('<doc one="One" two="Two"/>')
+ doc = ElementTree(file=f)
+ root = doc.getroot()
+ attrib = copy.copy(root.attrib)
+ self.assertEqual('One', attrib['one'])
+ self.assertEqual('Two', attrib['two'])
+ self.assertRaises(KeyError, operator.getitem, attrib, 'three')
+
+ def test_attrib_deepcopy(self):
+ ElementTree = self.etree.ElementTree
+
+ f = BytesIO('<doc one="One" two="Two"/>')
+ doc = ElementTree(file=f)
+ root = doc.getroot()
+ attrib = copy.deepcopy(root.attrib)
+ self.assertEqual('One', attrib['one'])
+ self.assertEqual('Two', attrib['two'])
+ self.assertRaises(KeyError, operator.getitem, attrib, 'three')
+
+ def test_attributes_get(self):
+ ElementTree = self.etree.ElementTree
+
+ f = BytesIO('<doc one="One" two="Two"/>')
+ doc = ElementTree(file=f)
+ root = doc.getroot()
+ self.assertEqual('One', root.get('one'))
+ self.assertEqual('Two', root.get('two'))
+ self.assertEqual(None, root.get('three'))
+ self.assertEqual('foo', root.get('three', 'foo'))
+
+ def test_attrib_clear(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<doc one="One" two="Two"/>'))
+ self.assertEqual('One', root.get('one'))
+ self.assertEqual('Two', root.get('two'))
+ root.attrib.clear()
+ self.assertEqual(None, root.get('one'))
+ self.assertEqual(None, root.get('two'))
+
+ def test_attrib_set_clear(self):
+ Element = self.etree.Element
+
+ root = Element("root", one="One")
+ root.set("two", "Two")
+ self.assertEqual('One', root.get('one'))
+ self.assertEqual('Two', root.get('two'))
+ root.attrib.clear()
+ self.assertEqual(None, root.get('one'))
+ self.assertEqual(None, root.get('two'))
+
+ def test_attrib_ns_clear(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ attribNS = '{http://foo/bar}x'
+
+ parent = Element('parent')
+ parent.set(attribNS, 'a')
+ child = SubElement(parent, 'child')
+ child.set(attribNS, 'b')
+
+ self.assertEqual('a', parent.get(attribNS))
+ self.assertEqual('b', child.get(attribNS))
+
+ parent.clear()
+ self.assertEqual(None, parent.get(attribNS))
+ self.assertEqual('b', child.get(attribNS))
+
+ def test_attrib_pop(self):
+ ElementTree = self.etree.ElementTree
+
+ f = BytesIO('<doc one="One" two="Two"/>')
+ doc = ElementTree(file=f)
+ root = doc.getroot()
+ self.assertEqual('One', root.attrib['one'])
+ self.assertEqual('Two', root.attrib['two'])
+
+ self.assertEqual('One', root.attrib.pop('one'))
+
+ self.assertEqual(None, root.attrib.get('one'))
+ self.assertEqual('Two', root.attrib['two'])
+
+ def test_attrib_pop_unknown(self):
+ root = self.etree.XML(_bytes('<doc one="One" two="Two"/>'))
+ self.assertRaises(KeyError, root.attrib.pop, 'NONE')
+
+ self.assertEqual('One', root.attrib['one'])
+ self.assertEqual('Two', root.attrib['two'])
+
+ def test_attrib_pop_default(self):
+ root = self.etree.XML(_bytes('<doc one="One" two="Two"/>'))
+ self.assertEqual('Three', root.attrib.pop('three', 'Three'))
+
+ def test_attrib_pop_empty_default(self):
+ root = self.etree.XML(_bytes('<doc/>'))
+ self.assertEqual('Three', root.attrib.pop('three', 'Three'))
+
+ def test_attrib_pop_invalid_args(self):
+ root = self.etree.XML(_bytes('<doc one="One" two="Two"/>'))
+ self.assertRaises(TypeError, root.attrib.pop, 'One', None, None)
+
+ def test_attribute_update_dict(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<doc alpha="Alpha" beta="Beta"/>'))
+ items = list(root.attrib.items())
+ items.sort()
+ self.assertEqual(
+ [('alpha', 'Alpha'), ('beta', 'Beta')],
+ items)
+
+ root.attrib.update({'alpha' : 'test', 'gamma' : 'Gamma'})
+
+ items = list(root.attrib.items())
+ items.sort()
+ self.assertEqual(
+ [('alpha', 'test'), ('beta', 'Beta'), ('gamma', 'Gamma')],
+ items)
+
+ def test_attribute_update_sequence(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<doc alpha="Alpha" beta="Beta"/>'))
+ items = list(root.attrib.items())
+ items.sort()
+ self.assertEqual(
+ [('alpha', 'Alpha'), ('beta', 'Beta')],
+ items)
+
+ root.attrib.update({'alpha' : 'test', 'gamma' : 'Gamma'}.items())
+
+ items = list(root.attrib.items())
+ items.sort()
+ self.assertEqual(
+ [('alpha', 'test'), ('beta', 'Beta'), ('gamma', 'Gamma')],
+ items)
+
+ def test_attribute_update_iter(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<doc alpha="Alpha" beta="Beta"/>'))
+ items = list(root.attrib.items())
+ items.sort()
+ self.assertEqual(
+ [('alpha', 'Alpha'), ('beta', 'Beta')],
+ items)
+
+ root.attrib.update(iter({'alpha' : 'test', 'gamma' : 'Gamma'}.items()))
+
+ items = list(root.attrib.items())
+ items.sort()
+ self.assertEqual(
+ [('alpha', 'test'), ('beta', 'Beta'), ('gamma', 'Gamma')],
+ items)
+
+ def test_attribute_update_attrib(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<doc alpha="Alpha" beta="Beta"/>'))
+ items = list(root.attrib.items())
+ items.sort()
+ self.assertEqual(
+ [('alpha', 'Alpha'), ('beta', 'Beta')],
+ items)
+
+ other = XML(_bytes('<doc alpha="test" gamma="Gamma"/>'))
+ root.attrib.update(other.attrib)
+
+ items = list(root.attrib.items())
+ items.sort()
+ self.assertEqual(
+ [('alpha', 'test'), ('beta', 'Beta'), ('gamma', 'Gamma')],
+ items)
+
+ def test_attribute_keys(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<doc alpha="Alpha" beta="Beta" gamma="Gamma"/>'))
+ keys = list(root.attrib.keys())
+ keys.sort()
+ self.assertEqual(['alpha', 'beta', 'gamma'], keys)
+
+ def test_attribute_keys2(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<doc alpha="Alpha" beta="Beta" gamma="Gamma"/>'))
+ keys = list(root.keys())
+ keys.sort()
+ self.assertEqual(['alpha', 'beta', 'gamma'], keys)
+
+ def test_attribute_items2(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<doc alpha="Alpha" beta="Beta" gamma="Gamma"/>'))
+ items = list(root.items())
+ items.sort()
+ self.assertEqual(
+ [('alpha','Alpha'), ('beta','Beta'), ('gamma','Gamma')],
+ items)
+
+ def test_attribute_keys_ns(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<foo bar="Bar" xmlns:ns="http://ns.codespeak.net/test" ns:baz="Baz" />'))
+ keys = list(root.keys())
+ keys.sort()
+ self.assertEqual(['bar', '{http://ns.codespeak.net/test}baz'],
+ keys)
+
+ def test_attribute_values(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<doc alpha="Alpha" beta="Beta" gamma="Gamma"/>'))
+ values = list(root.attrib.values())
+ values.sort()
+ self.assertEqual(['Alpha', 'Beta', 'Gamma'], values)
+
+ def test_attribute_values_ns(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<foo bar="Bar" xmlns:ns="http://ns.codespeak.net/test" ns:baz="Baz" />'))
+ values = list(root.attrib.values())
+ values.sort()
+ self.assertEqual(
+ ['Bar', 'Baz'], values)
+
+ def test_attribute_items(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<doc alpha="Alpha" beta="Beta" gamma="Gamma"/>'))
+ items = list(root.attrib.items())
+ items.sort()
+ self.assertEqual([
+ ('alpha', 'Alpha'),
+ ('beta', 'Beta'),
+ ('gamma', 'Gamma'),
+ ],
+ items)
+
+ def test_attribute_items_ns(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<foo bar="Bar" xmlns:ns="http://ns.codespeak.net/test" ns:baz="Baz" />'))
+ items = list(root.attrib.items())
+ items.sort()
+ self.assertEqual(
+ [('bar', 'Bar'), ('{http://ns.codespeak.net/test}baz', 'Baz')],
+ items)
+
+ def test_attribute_str(self):
+ XML = self.etree.XML
+
+ expected = "{'{http://ns.codespeak.net/test}baz': 'Baz', 'bar': 'Bar'}"
+ alternative = "{'bar': 'Bar', '{http://ns.codespeak.net/test}baz': 'Baz'}"
+
+ root = XML(_bytes('<foo bar="Bar" xmlns:ns="http://ns.codespeak.net/test" ns:baz="Baz" />'))
+ try:
+ self.assertEqual(expected, str(root.attrib))
+ except AssertionError:
+ self.assertEqual(alternative, str(root.attrib))
+
+ def test_attribute_contains(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<foo bar="Bar" xmlns:ns="http://ns.codespeak.net/test" ns:baz="Baz" />'))
+ self.assertEqual(
+ True, 'bar' in root.attrib)
+ self.assertEqual(
+ False, 'baz' in root.attrib)
+ self.assertEqual(
+ False, 'hah' in root.attrib)
+ self.assertEqual(
+ True,
+ '{http://ns.codespeak.net/test}baz' in root.attrib)
+
+ def test_attribute_set(self):
+ Element = self.etree.Element
+
+ root = Element("root")
+ root.set("attr", "TEST")
+ self.assertEqual("TEST", root.get("attr"))
+
+ def test_attrib_as_attrib(self):
+ Element = self.etree.Element
+
+ root = Element("root")
+ root.set("attr", "TEST")
+ self.assertEqual("TEST", root.attrib["attr"])
+
+ root2 = Element("root2", root.attrib)
+ self.assertEqual("TEST", root2.attrib["attr"])
+
+ def test_attribute_iterator(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<doc alpha="Alpha" beta="Beta" gamma="Gamma" />'))
+ result = []
+ for key in root.attrib:
+ result.append(key)
+ result.sort()
+ self.assertEqual(['alpha', 'beta', 'gamma'], result)
+
+ def test_attribute_manipulation(self):
+ Element = self.etree.Element
+
+ a = Element('a')
+ a.attrib['foo'] = 'Foo'
+ a.attrib['bar'] = 'Bar'
+ self.assertEqual('Foo', a.attrib['foo'])
+ del a.attrib['foo']
+ self.assertRaises(KeyError, operator.getitem, a.attrib, 'foo')
+
+ def test_del_attribute_ns(self):
+ Element = self.etree.Element
+
+ a = Element('a')
+ a.attrib['{http://a/}foo'] = 'Foo'
+ a.attrib['{http://a/}bar'] = 'Bar'
+ self.assertEqual(None, a.get('foo'))
+ self.assertEqual('Foo', a.get('{http://a/}foo'))
+ self.assertEqual('Foo', a.attrib['{http://a/}foo'])
+
+ self.assertRaises(KeyError, operator.delitem, a.attrib, 'foo')
+ self.assertEqual('Foo', a.attrib['{http://a/}foo'])
+
+ del a.attrib['{http://a/}foo']
+ self.assertRaises(KeyError, operator.getitem, a.attrib, 'foo')
+
+ def test_del_attribute_ns_parsed(self):
+ XML = self.etree.XML
+
+ a = XML(_bytes('<a xmlns:nsa="http://a/" nsa:foo="FooNS" foo="Foo" />'))
+
+ self.assertEqual('Foo', a.attrib['foo'])
+ self.assertEqual('FooNS', a.attrib['{http://a/}foo'])
+
+ del a.attrib['foo']
+ self.assertEqual('FooNS', a.attrib['{http://a/}foo'])
+ self.assertRaises(KeyError, operator.getitem, a.attrib, 'foo')
+ self.assertRaises(KeyError, operator.delitem, a.attrib, 'foo')
+
+ del a.attrib['{http://a/}foo']
+ self.assertRaises(KeyError, operator.getitem, a.attrib, '{http://a/}foo')
+ self.assertRaises(KeyError, operator.getitem, a.attrib, 'foo')
+
+ a = XML(_bytes('<a xmlns:nsa="http://a/" foo="Foo" nsa:foo="FooNS" />'))
+
+ self.assertEqual('Foo', a.attrib['foo'])
+ self.assertEqual('FooNS', a.attrib['{http://a/}foo'])
+
+ del a.attrib['foo']
+ self.assertEqual('FooNS', a.attrib['{http://a/}foo'])
+ self.assertRaises(KeyError, operator.getitem, a.attrib, 'foo')
+
+ del a.attrib['{http://a/}foo']
+ self.assertRaises(KeyError, operator.getitem, a.attrib, '{http://a/}foo')
+ self.assertRaises(KeyError, operator.getitem, a.attrib, 'foo')
+
+ def test_XML(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<doc>This is a text.</doc>'))
+ self.assertEqual(0, len(root))
+ self.assertEqual('This is a text.', root.text)
+
+ def test_XMLID(self):
+ XMLID = self.etree.XMLID
+ XML = self.etree.XML
+ xml_text = _bytes('''
+ <document>
+ <h1 id="chapter1">...</h1>
+ <p id="note1" class="note">...</p>
+ <p>Regular paragraph.</p>
+ <p xml:id="xmlid">XML:ID paragraph.</p>
+ <p id="warn1" class="warning">...</p>
+ </document>
+ ''')
+
+ root, dic = XMLID(xml_text)
+ root2 = XML(xml_text)
+ self.assertEqual(self._writeElement(root),
+ self._writeElement(root2))
+ expected = {
+ "chapter1" : root[0],
+ "note1" : root[1],
+ "warn1" : root[4]
+ }
+ self.assertEqual(dic, expected)
+
+ def test_fromstring(self):
+ fromstring = self.etree.fromstring
+
+ root = fromstring('<doc>This is a text.</doc>')
+ self.assertEqual(0, len(root))
+ self.assertEqual('This is a text.', root.text)
+
+ required_versions_ET['test_fromstringlist'] = (1,3)
+ def test_fromstringlist(self):
+ fromstringlist = self.etree.fromstringlist
+
+ root = fromstringlist(["<do", "c>T", "hi", "s is",
+ " a text.<", "/doc", ">"])
+ self.assertEqual(0, len(root))
+ self.assertEqual('This is a text.', root.text)
+
+ required_versions_ET['test_fromstringlist_characters'] = (1,3)
+ def test_fromstringlist_characters(self):
+ fromstringlist = self.etree.fromstringlist
+
+ root = fromstringlist(list('<doc>This is a text.</doc>'))
+ self.assertEqual(0, len(root))
+ self.assertEqual('This is a text.', root.text)
+
+ required_versions_ET['test_fromstringlist_single'] = (1,3)
+ def test_fromstringlist_single(self):
+ fromstringlist = self.etree.fromstringlist
+
+ root = fromstringlist(['<doc>This is a text.</doc>'])
+ self.assertEqual(0, len(root))
+ self.assertEqual('This is a text.', root.text)
+
+ def test_iselement(self):
+ iselement = self.etree.iselement
+ Element = self.etree.Element
+ ElementTree = self.etree.ElementTree
+ XML = self.etree.XML
+ Comment = self.etree.Comment
+ ProcessingInstruction = self.etree.ProcessingInstruction
+
+ el = Element('hoi')
+ self.assertTrue(iselement(el))
+
+ el2 = XML(_bytes('<foo/>'))
+ self.assertTrue(iselement(el2))
+
+ tree = ElementTree(element=Element('dag'))
+ self.assertTrue(not iselement(tree))
+ self.assertTrue(iselement(tree.getroot()))
+
+ c = Comment('test')
+ self.assertTrue(iselement(c))
+
+ p = ProcessingInstruction("test", "some text")
+ self.assertTrue(iselement(p))
+
+ def test_iteration(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<doc><one/><two>Two</two>Hm<three/></doc>'))
+ result = []
+ for el in root:
+ result.append(el.tag)
+ self.assertEqual(['one', 'two', 'three'], result)
+
+ def test_iteration_empty(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<doc></doc>'))
+ result = []
+ for el in root:
+ result.append(el.tag)
+ self.assertEqual([], result)
+
+ def test_iteration_text_only(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<doc>Text</doc>'))
+ result = []
+ for el in root:
+ result.append(el.tag)
+ self.assertEqual([], result)
+
+ def test_iteration_set_tail_empty(self):
+ # this would cause a crash in the past
+ fromstring = self.etree.fromstring
+ root = fromstring('<html><p></p>x</html>')
+ for elem in root:
+ elem.tail = ''
+
+ def test_iteration_clear_tail(self):
+ # this would cause a crash in the past
+ fromstring = self.etree.fromstring
+ root = fromstring('<html><p></p>x</html>')
+ for elem in root:
+ elem.tail = None
+
+ def test_iteration_reversed(self):
+ XML = self.etree.XML
+ root = XML(_bytes('<doc><one/><two>Two</two>Hm<three/></doc>'))
+ result = []
+ for el in reversed(root):
+ result.append(el.tag)
+ self.assertEqual(['three', 'two', 'one'], result)
+
+ def test_iteration_subelement(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<doc><one/><two>Two</two>Hm<three/></doc>'))
+ result = []
+ add = True
+ for el in root:
+ result.append(el.tag)
+ if add:
+ self.etree.SubElement(root, 'four')
+ add = False
+ self.assertEqual(['one', 'two', 'three', 'four'], result)
+
+ def test_iteration_del_child(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<doc><one/><two>Two</two>Hm<three/></doc>'))
+ result = []
+ for el in root:
+ result.append(el.tag)
+ del root[-1]
+ self.assertEqual(['one', 'two'], result)
+
+ def test_iteration_double(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<doc><one/><two/></doc>'))
+ result = []
+ for el0 in root:
+ result.append(el0.tag)
+ for el1 in root:
+ result.append(el1.tag)
+ self.assertEqual(['one','one', 'two', 'two', 'one', 'two'], result)
+
+ required_versions_ET['test_itertext'] = (1,3)
+ def test_itertext(self):
+ # ET 1.3+
+ XML = self.etree.XML
+ root = XML(_bytes("<root>RTEXT<a></a>ATAIL<b/><c>CTEXT</c>CTAIL</root>"))
+
+ text = list(root.itertext())
+ self.assertEqual(["RTEXT", "ATAIL", "CTEXT", "CTAIL"],
+ text)
+
+ required_versions_ET['test_itertext_child'] = (1,3)
+ def test_itertext_child(self):
+ # ET 1.3+
+ XML = self.etree.XML
+ root = XML(_bytes("<root>RTEXT<a></a>ATAIL<b/><c>CTEXT</c>CTAIL</root>"))
+
+ text = list(root[2].itertext())
+ self.assertEqual(["CTEXT"],
+ text)
+
+ def test_findall(self):
+ XML = self.etree.XML
+ root = XML(_bytes('<a><b><c/></b><b/><c><b/></c></a>'))
+ self.assertEqual(len(list(root.findall("c"))), 1)
+ self.assertEqual(len(list(root.findall(".//c"))), 2)
+ self.assertEqual(len(list(root.findall(".//b"))), 3)
+ self.assertEqual(len(list(root.findall(".//b"))[0]), 1)
+ self.assertEqual(len(list(root.findall(".//b"))[1]), 0)
+ self.assertEqual(len(list(root.findall(".//b"))[2]), 0)
+
+ def test_findall_ns(self):
+ XML = self.etree.XML
+ root = XML(_bytes('<a xmlns:x="X" xmlns:y="Y"><x:b><c/></x:b><b/><c><x:b/><b/></c><b/></a>'))
+ self.assertEqual(len(list(root.findall(".//{X}b"))), 2)
+ self.assertEqual(len(list(root.findall(".//b"))), 3)
+ self.assertEqual(len(list(root.findall("b"))), 2)
+
+ @et_needs_pyversion(3, 8, 0, 'alpha', 4)
+ def test_findall_wildcard(self):
+ def summarize_list(l):
+ return [el.tag for el in l]
+
+ root = self.etree.XML('''
+ <a xmlns:x="X" xmlns:y="Y">
+ <x:b><c/></x:b>
+ <b/>
+ <c><x:b/><b/></c><y:b/>
+ </a>''')
+ root.append(self.etree.Comment('test'))
+
+ self.assertEqual(summarize_list(root.findall("{*}b")),
+ ['{X}b', 'b', '{Y}b'])
+ self.assertEqual(summarize_list(root.findall("{*}c")),
+ ['c'])
+ self.assertEqual(summarize_list(root.findall("{X}*")),
+ ['{X}b'])
+ self.assertEqual(summarize_list(root.findall("{Y}*")),
+ ['{Y}b'])
+ self.assertEqual(summarize_list(root.findall("{}*")),
+ ['b', 'c'])
+ self.assertEqual(summarize_list(root.findall("{}b")), # only for consistency
+ ['b'])
+ self.assertEqual(summarize_list(root.findall("{}b")),
+ summarize_list(root.findall("b")))
+ self.assertEqual(summarize_list(root.findall("{*}*")),
+ ['{X}b', 'b', 'c', '{Y}b'])
+ self.assertEqual(summarize_list(root.findall("{*}*")
+ + ([] if self.etree is etree else [root[-1]])),
+ summarize_list(root.findall("*")))
+
+ self.assertEqual(summarize_list(root.findall(".//{*}b")),
+ ['{X}b', 'b', '{X}b', 'b', '{Y}b'])
+ self.assertEqual(summarize_list(root.findall(".//{*}c")),
+ ['c', 'c'])
+ self.assertEqual(summarize_list(root.findall(".//{X}*")),
+ ['{X}b', '{X}b'])
+ self.assertEqual(summarize_list(root.findall(".//{Y}*")),
+ ['{Y}b'])
+ self.assertEqual(summarize_list(root.findall(".//{}*")),
+ ['c', 'b', 'c', 'b'])
+ self.assertEqual(summarize_list(root.findall(".//{}b")),
+ ['b', 'b'])
+
+ def test_element_with_attributes_keywords(self):
+ Element = self.etree.Element
+
+ el = Element('tag', foo='Foo', bar='Bar')
+ self.assertEqual('Foo', el.attrib['foo'])
+ self.assertEqual('Bar', el.attrib['bar'])
+
+ def test_element_with_attributes(self):
+ Element = self.etree.Element
+
+ el = Element('tag', {'foo': 'Foo', 'bar': 'Bar'})
+ self.assertEqual('Foo', el.attrib['foo'])
+ self.assertEqual('Bar', el.attrib['bar'])
+
+ def test_element_with_attributes_extra(self):
+ Element = self.etree.Element
+
+ el = Element('tag', {'foo': 'Foo', 'bar': 'Bar'}, baz='Baz')
+ self.assertEqual('Foo', el.attrib['foo'])
+ self.assertEqual('Bar', el.attrib['bar'])
+ self.assertEqual('Baz', el.attrib['baz'])
+
+ def test_element_with_attributes_extra_duplicate(self):
+ Element = self.etree.Element
+
+ el = Element('tag', {'foo': 'Foo', 'bar': 'Bar'}, bar='Baz')
+ self.assertEqual('Foo', el.attrib['foo'])
+ self.assertEqual('Baz', el.attrib['bar'])
+
+ def test_element_with_attributes_ns(self):
+ Element = self.etree.Element
+
+ el = Element('tag', {'{ns1}foo':'Foo', '{ns2}bar':'Bar'})
+ self.assertEqual('Foo', el.attrib['{ns1}foo'])
+ self.assertEqual('Bar', el.attrib['{ns2}bar'])
+
+ def test_subelement_with_attributes(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ el = Element('tag')
+ SubElement(el, 'foo', {'foo':'Foo'}, baz="Baz")
+ self.assertEqual("Baz", el[0].attrib['baz'])
+ self.assertEqual('Foo', el[0].attrib['foo'])
+
+ def test_subelement_with_attributes_ns(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ el = Element('tag')
+ SubElement(el, 'foo', {'{ns1}foo':'Foo', '{ns2}bar':'Bar'})
+ self.assertEqual('Foo', el[0].attrib['{ns1}foo'])
+ self.assertEqual('Bar', el[0].attrib['{ns2}bar'])
+
+ def test_write(self):
+ ElementTree = self.etree.ElementTree
+ XML = self.etree.XML
+
+ for i in range(10):
+ f = BytesIO()
+ root = XML(_bytes('<doc%s>This is a test.</doc%s>' % (i, i)))
+ tree = ElementTree(element=root)
+ tree.write(f)
+ data = f.getvalue()
+ self.assertEqual(
+ _bytes('<doc%s>This is a test.</doc%s>' % (i, i)),
+ canonicalize(data))
+
+ required_versions_ET['test_write_method_html'] = (1,3)
+ def test_write_method_html(self):
+ ElementTree = self.etree.ElementTree
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ html = Element('html')
+ body = SubElement(html, 'body')
+ p = SubElement(body, 'p')
+ p.text = "html"
+ SubElement(p, 'br').tail = "test"
+
+ tree = ElementTree(element=html)
+ f = BytesIO()
+ tree.write(f, method="html")
+ data = f.getvalue().replace(_bytes('\n'),_bytes(''))
+
+ self.assertEqual(_bytes('<html><body><p>html<br>test</p></body></html>'),
+ data)
+
+ required_versions_ET['test_write_method_text'] = (1,3)
+ def test_write_method_text(self):
+ ElementTree = self.etree.ElementTree
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ a.text = "A"
+ a.tail = "tail"
+ b = SubElement(a, 'b')
+ b.text = "B"
+ b.tail = "TAIL"
+ c = SubElement(a, 'c')
+ c.text = "C"
+
+ tree = ElementTree(element=a)
+ f = BytesIO()
+ tree.write(f, method="text")
+ data = f.getvalue()
+
+ self.assertEqual(_bytes('ABTAILCtail'),
+ data)
+
+ def test_write_fail(self):
+ ElementTree = self.etree.ElementTree
+ XML = self.etree.XML
+
+ tree = ElementTree( XML(_bytes('<doc>This is a test.</doc>')) )
+ self.assertRaises(IOError, tree.write,
+ "definitely////\\-\\nonexisting\\-\\////FILE")
+
+ # this could trigger a crash, apparently because the document
+ # reference was prematurely garbage collected
+ def test_crash(self):
+ Element = self.etree.Element
+
+ element = Element('tag')
+ for i in range(10):
+ element.attrib['key'] = 'value'
+ value = element.attrib['key']
+ self.assertEqual(value, 'value')
+
+ # from doctest; for some reason this caused crashes too
+ def test_write_ElementTreeDoctest(self):
+ Element = self.etree.Element
+ ElementTree = self.etree.ElementTree
+
+ f = BytesIO()
+ for i in range(10):
+ element = Element('tag%s' % i)
+ self._check_element(element)
+ tree = ElementTree(element)
+ tree.write(f)
+ self._check_element_tree(tree)
+
+ def test_subelement_reference(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ el = Element('foo')
+ el2 = SubElement(el, 'bar')
+ el3 = SubElement(el2, 'baz')
+
+ al = Element('foo2')
+ al2 = SubElement(al, 'bar2')
+ al3 = SubElement(al2, 'baz2')
+
+ # now move al2 into el
+ el.append(al2)
+
+ # now change al3 directly
+ al3.text = 'baz2-modified'
+
+ # it should have changed through this route too
+ self.assertEqual(
+ 'baz2-modified',
+ el[1][0].text)
+
+ def test_set_text(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ a.text = 'hoi'
+ self.assertEqual(
+ 'hoi',
+ a.text)
+ self.assertEqual(
+ 'b',
+ a[0].tag)
+
+ def test_set_text2(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ a.text = 'hoi'
+ b = SubElement(a ,'b')
+ self.assertEqual(
+ 'hoi',
+ a.text)
+ self.assertEqual(
+ 'b',
+ a[0].tag)
+
+ def test_set_text_none(self):
+ Element = self.etree.Element
+
+ a = Element('a')
+
+ a.text = 'foo'
+ a.text = None
+
+ self.assertEqual(
+ None,
+ a.text)
+ self.assertXML(_bytes('<a></a>'), a)
+
+ def test_set_text_empty(self):
+ Element = self.etree.Element
+
+ a = Element('a')
+ self.assertEqual(None, a.text)
+
+ a.text = ''
+ self.assertEqual('', a.text)
+ self.assertXML(_bytes('<a></a>'), a)
+
+ def test_tail1(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ a.tail = 'dag'
+ self.assertEqual('dag',
+ a.tail)
+ b = SubElement(a, 'b')
+ b.tail = 'hoi'
+ self.assertEqual('hoi',
+ b.tail)
+ self.assertEqual('dag',
+ a.tail)
+
+ def test_tail_append(self):
+ Element = self.etree.Element
+
+ a = Element('a')
+ b = Element('b')
+ b.tail = 'b_tail'
+ a.append(b)
+ self.assertEqual('b_tail',
+ b.tail)
+
+ def test_tail_set_twice(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ b.tail = 'foo'
+ b.tail = 'bar'
+ self.assertEqual('bar',
+ b.tail)
+ self.assertXML(_bytes('<a><b></b>bar</a>'), a)
+
+ def test_tail_set_none(self):
+ Element = self.etree.Element
+ a = Element('a')
+ a.tail = 'foo'
+ a.tail = None
+ self.assertEqual(
+ None,
+ a.tail)
+ self.assertXML(_bytes('<a></a>'), a)
+
+ required_versions_ET['test_extend'] = (1,3)
+ def test_extend(self):
+ root = self.etree.Element('foo')
+ for i in range(3):
+ element = self.etree.SubElement(root, 'a%s' % i)
+ element.text = "text%d" % i
+ element.tail = "tail%d" % i
+
+ elements = []
+ for i in range(3):
+ new_element = self.etree.Element("test%s" % i)
+ new_element.text = "TEXT%s" % i
+ new_element.tail = "TAIL%s" % i
+ elements.append(new_element)
+
+ root.extend(elements)
+
+ self.assertEqual(
+ ["a0", "a1", "a2", "test0", "test1", "test2"],
+ [ el.tag for el in root ])
+ self.assertEqual(
+ ["text0", "text1", "text2", "TEXT0", "TEXT1", "TEXT2"],
+ [ el.text for el in root ])
+ self.assertEqual(
+ ["tail0", "tail1", "tail2", "TAIL0", "TAIL1", "TAIL2"],
+ [ el.tail for el in root ])
+
+ def test_comment(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ Comment = self.etree.Comment
+
+ a = Element('a')
+ a.append(Comment('foo'))
+ self.assertEqual(a[0].tag, Comment)
+ self.assertEqual(a[0].text, 'foo')
+
+ # ElementTree < 1.3 adds whitespace around comments
+ required_versions_ET['test_comment_text'] = (1,3)
+ def test_comment_text(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ Comment = self.etree.Comment
+ tostring = self.etree.tostring
+
+ a = Element('a')
+ a.append(Comment('foo'))
+ self.assertEqual(a[0].text, 'foo')
+
+ self.assertEqual(
+ _bytes('<a><!--foo--></a>'),
+ tostring(a))
+
+ a[0].text = "TEST"
+ self.assertEqual(a[0].text, 'TEST')
+
+ self.assertEqual(
+ _bytes('<a><!--TEST--></a>'),
+ tostring(a))
+
+ # ElementTree < 1.3 adds whitespace around comments
+ required_versions_ET['test_comment_whitespace'] = (1,3)
+ def test_comment_whitespace(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ Comment = self.etree.Comment
+ tostring = self.etree.tostring
+
+ a = Element('a')
+ a.append(Comment(' foo '))
+ self.assertEqual(a[0].text, ' foo ')
+ self.assertEqual(
+ _bytes('<a><!-- foo --></a>'),
+ tostring(a))
+
+ def test_comment_nonsense(self):
+ Comment = self.etree.Comment
+ c = Comment('foo')
+ self.assertEqual({}, c.attrib)
+ self.assertEqual([], list(c.keys()))
+ self.assertEqual([], list(c.items()))
+ self.assertEqual(None, c.get('hoi'))
+ self.assertEqual(0, len(c))
+ # should not iterate
+ for i in c:
+ pass
+
+ def test_pi(self):
+ # lxml.etree separates target and text
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ ProcessingInstruction = self.etree.ProcessingInstruction
+
+ a = Element('a')
+ a.append(ProcessingInstruction('foo', 'some more text'))
+ self.assertEqual(a[0].tag, ProcessingInstruction)
+ self.assertXML(_bytes("<a><?foo some more text?></a>"),
+ a)
+
+ def test_processinginstruction(self):
+ # lxml.etree separates target and text
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ ProcessingInstruction = self.etree.PI
+
+ a = Element('a')
+ a.append(ProcessingInstruction('foo', 'some more text'))
+ self.assertEqual(a[0].tag, ProcessingInstruction)
+ self.assertXML(_bytes("<a><?foo some more text?></a>"),
+ a)
+
+ def test_pi_nonsense(self):
+ ProcessingInstruction = self.etree.ProcessingInstruction
+ pi = ProcessingInstruction('foo')
+ self.assertEqual({}, pi.attrib)
+ self.assertEqual([], list(pi.keys()))
+ self.assertEqual([], list(pi.items()))
+ self.assertEqual(None, pi.get('hoi'))
+ self.assertEqual(0, len(pi))
+ # should not iterate
+ for i in pi:
+ pass
+
+ def test_setitem(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = Element('c')
+ a[0] = c
+ self.assertEqual(
+ c,
+ a[0])
+ self.assertXML(_bytes('<a><c></c></a>'),
+ a)
+ self.assertXML(_bytes('<b></b>'),
+ b)
+
+ def test_setitem2(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ for i in range(5):
+ b = SubElement(a, 'b%s' % i)
+ c = SubElement(b, 'c')
+ for i in range(5):
+ d = Element('d')
+ e = SubElement(d, 'e')
+ a[i] = d
+ self.assertXML(
+ _bytes('<a><d><e></e></d><d><e></e></d><d><e></e></d><d><e></e></d><d><e></e></d></a>'),
+ a)
+ self.assertXML(_bytes('<c></c>'),
+ c)
+
+ def test_setitem_replace(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ SubElement(a, 'b')
+ d = Element('d')
+ a[0] = d
+ self.assertXML(_bytes('<a><d></d></a>'), a)
+
+ def test_setitem_indexerror(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+
+ self.assertRaises(IndexError, operator.setitem, a, 1, Element('c'))
+
+ def test_setitem_tail(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ b.tail = 'B2'
+ c = Element('c')
+ c.tail = 'C2'
+
+ a[0] = c
+ self.assertXML(
+ _bytes('<a><c></c>C2</a>'),
+ a)
+
+ def test_tag_write(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+
+ a.tag = 'c'
+
+ self.assertEqual(
+ 'c',
+ a.tag)
+
+ self.assertXML(
+ _bytes('<c><b></b></c>'),
+ a)
+
+ def test_tag_reset_ns(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ tostring = self.etree.tostring
+
+ a = Element('{a}a')
+ b1 = SubElement(a, '{a}b')
+ b2 = SubElement(a, '{b}b')
+
+ self.assertEqual('{a}b', b1.tag)
+
+ b1.tag = 'c'
+
+ # can't use C14N here!
+ self.assertEqual('c', b1.tag)
+ self.assertEqual(_bytes('<c'), tostring(b1)[:2])
+ self.assertTrue(_bytes('<c') in tostring(a))
+
+ def test_tag_reset_root_ns(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ tostring = self.etree.tostring
+
+ a = Element('{a}a')
+ b1 = SubElement(a, '{a}b')
+ b2 = SubElement(a, '{b}b')
+
+ a.tag = 'c'
+
+ self.assertEqual(
+ 'c',
+ a.tag)
+
+ # can't use C14N here!
+ self.assertEqual('c', a.tag)
+ self.assertEqual(_bytes('<c'), tostring(a)[:2])
+
+ def test_tag_str_subclass(self):
+ Element = self.etree.Element
+
+ class strTest(str):
+ pass
+
+ a = Element("a")
+ a.tag = strTest("TAG")
+ self.assertXML(_bytes('<TAG></TAG>'),
+ a)
+
+ def test_delitem(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(a, 'd')
+
+ del a[1]
+ self.assertXML(
+ _bytes('<a><b></b><d></d></a>'),
+ a)
+
+ del a[0]
+ self.assertXML(
+ _bytes('<a><d></d></a>'),
+ a)
+
+ del a[0]
+ self.assertXML(
+ _bytes('<a></a>'),
+ a)
+ # move deleted element into other tree afterwards
+ other = Element('other')
+ other.append(c)
+ self.assertXML(
+ _bytes('<other><c></c></other>'),
+ other)
+
+ def test_del_insert(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ bs = SubElement(b, 'bs')
+ c = SubElement(a, 'c')
+ cs = SubElement(c, 'cs')
+
+ el = a[0]
+ self.assertXML(
+ _bytes('<a><b><bs></bs></b><c><cs></cs></c></a>'),
+ a)
+ self.assertXML(_bytes('<b><bs></bs></b>'), b)
+ self.assertXML(_bytes('<c><cs></cs></c>'), c)
+
+ del a[0]
+ self.assertXML(
+ _bytes('<a><c><cs></cs></c></a>'),
+ a)
+ self.assertXML(_bytes('<b><bs></bs></b>'), b)
+ self.assertXML(_bytes('<c><cs></cs></c>'), c)
+
+ a.insert(0, el)
+ self.assertXML(
+ _bytes('<a><b><bs></bs></b><c><cs></cs></c></a>'),
+ a)
+ self.assertXML(_bytes('<b><bs></bs></b>'), b)
+ self.assertXML(_bytes('<c><cs></cs></c>'), c)
+
+ def test_del_setitem(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ bs = SubElement(b, 'bs')
+ c = SubElement(a, 'c')
+ cs = SubElement(c, 'cs')
+
+ el = a[0]
+ del a[0]
+ a[0] = el
+ self.assertXML(
+ _bytes('<a><b><bs></bs></b></a>'),
+ a)
+ self.assertXML(_bytes('<b><bs></bs></b>'), b)
+ self.assertXML(_bytes('<c><cs></cs></c>'), c)
+
+ def test_del_setslice(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ bs = SubElement(b, 'bs')
+ c = SubElement(a, 'c')
+ cs = SubElement(c, 'cs')
+
+ el = a[0]
+ del a[0]
+ a[0:0] = [el]
+ self.assertXML(
+ _bytes('<a><b><bs></bs></b><c><cs></cs></c></a>'),
+ a)
+ self.assertXML(_bytes('<b><bs></bs></b>'), b)
+ self.assertXML(_bytes('<c><cs></cs></c>'), c)
+
+ def test_replace_slice_tail(self):
+ XML = self.etree.XML
+ a = XML(_bytes('<a><b></b>B2<c></c>C2</a>'))
+ b, c = a
+
+ a[:] = []
+
+ self.assertEqual("B2", b.tail)
+ self.assertEqual("C2", c.tail)
+
+ def test_merge_namespaced_subtree_as_slice(self):
+ XML = self.etree.XML
+ root = XML(_bytes(
+ '<foo><bar xmlns:baz="http://huhu"><puh><baz:bump1 /><baz:bump2 /></puh></bar></foo>'))
+ root[:] = root.findall('.//puh') # delete bar from hierarchy
+
+ # previously, this lost a namespace declaration on bump2
+ result = self.etree.tostring(root)
+ foo = self.etree.fromstring(result)
+
+ self.assertEqual('puh', foo[0].tag)
+ self.assertEqual('{http://huhu}bump1', foo[0][0].tag)
+ self.assertEqual('{http://huhu}bump2', foo[0][1].tag)
+
+ def test_delitem_tail_dealloc(self):
+ ElementTree = self.etree.ElementTree
+ f = BytesIO('<a><b></b>B2<c></c>C2</a>')
+ doc = ElementTree(file=f)
+ a = doc.getroot()
+ del a[0]
+ self.assertXML(
+ _bytes('<a><c></c>C2</a>'),
+ a)
+
+ def test_delitem_tail(self):
+ ElementTree = self.etree.ElementTree
+ f = BytesIO('<a><b></b>B2<c></c>C2</a>')
+ doc = ElementTree(file=f)
+ a = doc.getroot()
+ b, c = a
+ del a[0]
+ self.assertXML(
+ _bytes('<a><c></c>C2</a>'),
+ a)
+ self.assertEqual("B2", b.tail)
+ self.assertEqual("C2", c.tail)
+
+ def test_clear(self):
+ Element = self.etree.Element
+
+ a = Element('a')
+ a.text = 'foo'
+ a.tail = 'bar'
+ a.set('hoi', 'dag')
+ a.clear()
+ self.assertEqual(None, a.text)
+ self.assertEqual(None, a.tail)
+ self.assertEqual(None, a.get('hoi'))
+ self.assertEqual('a', a.tag)
+
+ def test_clear_sub(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ a.text = 'foo'
+ a.tail = 'bar'
+ a.set('hoi', 'dag')
+ b = SubElement(a, 'b')
+ c = SubElement(b, 'c')
+ a.clear()
+ self.assertEqual(None, a.text)
+ self.assertEqual(None, a.tail)
+ self.assertEqual(None, a.get('hoi'))
+ self.assertEqual('a', a.tag)
+ self.assertEqual(0, len(a))
+ self.assertXML(_bytes('<a></a>'),
+ a)
+ self.assertXML(_bytes('<b><c></c></b>'),
+ b)
+
+ def test_clear_tail(self):
+ ElementTree = self.etree.ElementTree
+ f = BytesIO('<a><b></b>B2<c></c>C2</a>')
+ doc = ElementTree(file=f)
+ a = doc.getroot()
+ a.clear()
+ self.assertXML(
+ _bytes('<a></a>'),
+ a)
+
+ def test_insert(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = Element('d')
+ a.insert(0, d)
+
+ self.assertEqual(
+ d,
+ a[0])
+
+ self.assertXML(
+ _bytes('<a><d></d><b></b><c></c></a>'),
+ a)
+
+ e = Element('e')
+ a.insert(2, e)
+ self.assertEqual(
+ e,
+ a[2])
+ self.assertXML(
+ _bytes('<a><d></d><b></b><e></e><c></c></a>'),
+ a)
+
+ def test_insert_name_interning(self):
+ # See GH#268 / LP#1773749.
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ # Use unique names to make sure they are new in the tag name dict.
+ import uuid
+ names = dict((k, 'tag-' + str(uuid.uuid4())) for k in 'abcde')
+
+ a = Element(names['a'])
+ b = SubElement(a, names['b'])
+ c = SubElement(a, names['c'])
+ d = Element(names['d'])
+ a.insert(0, d)
+
+ self.assertEqual(
+ d,
+ a[0])
+
+ self.assertXML(
+ _bytes('<%(a)s><%(d)s></%(d)s><%(b)s></%(b)s><%(c)s></%(c)s></%(a)s>' % names),
+ a)
+
+ e = Element(names['e'])
+ a.insert(2, e)
+ self.assertEqual(
+ e,
+ a[2])
+ self.assertXML(
+ _bytes('<%(a)s><%(d)s></%(d)s><%(b)s></%(b)s><%(e)s></%(e)s><%(c)s></%(c)s></%(a)s>' % names),
+ a)
+
+ def test_insert_beyond_index(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = Element('c')
+
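+ # inserting past the end simply appends, as with list.insert()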
+ a.insert(2, c)
+ self.assertEqual(
+ c,
+ a[1])
+ self.assertXML(
+ _bytes('<a><b></b><c></c></a>'),
+ a)
+
+ def test_insert_negative(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+
+ d = Element('d')
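+ # a negative index inserts before that position, as list.insert() does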
+ a.insert(-1, d)
+ self.assertEqual(
+ d,
+ a[-2])
+ self.assertXML(
+ _bytes('<a><b></b><d></d><c></c></a>'),
+ a)
+
+ def test_insert_tail(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+
+ c = Element('c')
+ c.tail = 'C2'
+
+ a.insert(0, c)
+ self.assertXML(
+ _bytes('<a><c></c>C2<b></b></a>'),
+ a)
+
+ def test_remove(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+
+ a.remove(b)
+ self.assertEqual(
+ c,
+ a[0])
+ self.assertXML(
+ _bytes('<a><c></c></a>'),
+ a)
+
+ def test_remove_ns(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('{http://test}a')
+ b = SubElement(a, '{http://test}b')
+ c = SubElement(a, '{http://test}c')
+
+ a.remove(b)
+ self.assertXML(
+ _bytes('<ns0:a xmlns:ns0="http://test"><ns0:c></ns0:c></ns0:a>'),
+ a)
+ self.assertXML(
+ _bytes('<ns0:b xmlns:ns0="http://test"></ns0:b>'),
+ b)
+
+ def test_remove_nonexisting(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = Element('d')
+ self.assertRaises(
+ ValueError, a.remove, d)
+
+ def test_remove_tail(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ b.tail = 'b2'
+ a.remove(b)
+ self.assertXML(
+ _bytes('<a></a>'),
+ a)
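+ # the removed element keeps its own tail text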
+ self.assertEqual('b2', b.tail)
+
+ def test_remove_while_iterating(self):
+ # There is no guarantee that this "works", but it should
+ # remove at least one child and not crash.
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ SubElement(a, 'b')
+ SubElement(a, 'c')
+ SubElement(a, 'd')
+ for el in a:
+ a.remove(el)
+ self.assertLess(len(a), 3)
+
+ def test_makeelement(self):
+ Element = self.etree.Element
+
+ a = Element('a')
+ b = a.makeelement('c', {'hoi':'dag'})
+ self.assertXML(
+ _bytes('<c hoi="dag"></c>'),
+ b)
+
+ required_versions_ET['test_iter'] = (1,3)
+ def test_iter(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(b, 'd')
+ e = SubElement(c, 'e')
+
+ self.assertEqual(
+ [a, b, d, c, e],
+ list(a.iter()))
+ self.assertEqual(
+ [d],
+ list(d.iter()))
+
+ def test_iter_remove_tail(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ a.text = 'a'
+ a.tail = 'a1' * 100
+ b = SubElement(a, 'b')
+ b.text = 'b'
+ b.tail = 'b1' * 100
+ c = SubElement(a, 'c')
+ c.text = 'c'
+ c.tail = 'c1' * 100
+ d = SubElement(b, 'd')
+ d.text = 'd'
+ d.tail = 'd1' * 100
+ e = SubElement(c, 'e')
+ e.text = 'e'
+ e.tail = 'e1' * 100
+
+ for el in a.iter():
+ el.tail = None
+ el = None
+
+ self.assertEqual(
+ [None] * 5,
+ [el.tail for el in a.iter()])
+
+ def test_getslice(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(a, 'd')
+
+ self.assertEqual(
+ [b, c],
+ a[0:2])
+ self.assertEqual(
+ [b, c, d],
+ a[:])
+ self.assertEqual(
+ [b, c, d],
+ a[:10])
+ self.assertEqual(
+ [b],
+ a[0:1])
+ self.assertEqual(
+ [],
+ a[10:12])
+
+ def test_getslice_negative(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(a, 'd')
+
+ self.assertEqual(
+ [d],
+ a[-1:])
+ self.assertEqual(
+ [c, d],
+ a[-2:])
+ self.assertEqual(
+ [c],
+ a[-2:-1])
+ self.assertEqual(
+ [b, c],
+ a[-3:-1])
+ self.assertEqual(
+ [b, c],
+ a[-3:2])
+
+ def test_getslice_step(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(a, 'd')
+ e = SubElement(a, 'e')
+
+ self.assertEqual(
+ [e,d,c,b],
+ a[::-1])
+ self.assertEqual(
+ [b,d],
+ a[::2])
+ self.assertEqual(
+ [e,c],
+ a[::-2])
+ self.assertEqual(
+ [d,c],
+ a[-2:0:-1])
+ self.assertEqual(
+ [e],
+ a[:1:-2])
+
+ def test_getslice_text(self):
+ ElementTree = self.etree.ElementTree
+
+ f = BytesIO('<a><b>B</b>B1<c>C</c>C1</a>')
+ doc = ElementTree(file=f)
+ a = doc.getroot()
+ b = a[0]
+ c = a[1]
+ self.assertEqual(
+ [b, c],
+ a[:])
+ self.assertEqual(
+ [b],
+ a[0:1])
+ self.assertEqual(
+ [c],
+ a[1:])
+
+ def test_comment_getitem_getslice(self):
+ Element = self.etree.Element
+ Comment = self.etree.Comment
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ foo = Comment('foo')
+ a.append(foo)
+ c = SubElement(a, 'c')
+ self.assertEqual(
+ [b, foo, c],
+ a[:])
+ self.assertEqual(
+ foo,
+ a[1])
+ a[1] = new = Element('new')
+ self.assertEqual(
+ new,
+ a[1])
+ self.assertXML(
+ _bytes('<a><b></b><new></new><c></c></a>'),
+ a)
+
+ def test_delslice(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(a, 'd')
+ e = SubElement(a, 'e')
+
+ del a[1:3]
+ self.assertEqual(
+ [b, e],
+ list(a))
+
+ def test_delslice_negative1(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(a, 'd')
+ e = SubElement(a, 'e')
+
+ del a[1:-1]
+ self.assertEqual(
+ [b, e],
+ list(a))
+
+ def test_delslice_negative2(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(a, 'd')
+ e = SubElement(a, 'e')
+
+ del a[-3:-1]
+ self.assertEqual(
+ [b, e],
+ list(a))
+
+ def test_delslice_step(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(a, 'd')
+ e = SubElement(a, 'e')
+
+ del a[1::2]
+ self.assertEqual(
+ [b, d],
+ list(a))
+
+ def test_delslice_step_negative(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(a, 'd')
+ e = SubElement(a, 'e')
+
+ del a[::-1]
+ self.assertEqual(
+ [],
+ list(a))
+
+ def test_delslice_step_negative2(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(a, 'd')
+ e = SubElement(a, 'e')
+
+ del a[::-2]
+ self.assertEqual(
+ [b, d],
+ list(a))
+
+ def test_delslice_child_tail_dealloc(self):
+ ElementTree = self.etree.ElementTree
+ f = BytesIO('<a><b></b>B2<c></c>C2<d></d>D2<e></e>E2</a>')
+ doc = ElementTree(file=f)
+ a = doc.getroot()
+ del a[1:3]
+ self.assertXML(
+ _bytes('<a><b></b>B2<e></e>E2</a>'),
+ a)
+
+ def test_delslice_child_tail(self):
+ ElementTree = self.etree.ElementTree
+ f = BytesIO('<a><b></b>B2<c></c>C2<d></d>D2<e></e>E2</a>')
+ doc = ElementTree(file=f)
+ a = doc.getroot()
+ b, c, d, e = a
+ del a[1:3]
+ self.assertXML(
+ _bytes('<a><b></b>B2<e></e>E2</a>'),
+ a)
+ self.assertEqual("B2", b.tail)
+ self.assertEqual("C2", c.tail)
+ self.assertEqual("D2", d.tail)
+ self.assertEqual("E2", e.tail)
+
+ def test_delslice_tail(self):
+ XML = self.etree.XML
+ a = XML(_bytes('<a><b></b>B2<c></c>C2</a>'))
+ b, c = a
+
+ del a[:]
+
+ self.assertEqual("B2", b.tail)
+ self.assertEqual("C2", c.tail)
+
+ def test_delslice_memory(self):
+ # this could trigger a crash
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(b, 'c')
+ del b # no more reference to b
+ del a[:]
+ self.assertEqual('c', c.tag)
+
+ def test_setslice(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(a, 'd')
+
+ e = Element('e')
+ f = Element('f')
+ g = Element('g')
+
+ s = [e, f, g]
+ a[1:2] = s
+ self.assertEqual(
+ [b, e, f, g, d],
+ list(a))
+
+ def test_setslice_all(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+
+ e = Element('e')
+ f = Element('f')
+ g = Element('g')
+
+ s = [e, f, g]
+ a[:] = s
+ self.assertEqual(
+ [e, f, g],
+ list(a))
+
+ def test_setslice_all_empty(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+
+ e = Element('e')
+ f = Element('f')
+ g = Element('g')
+
+ s = [e, f, g]
+ a[:] = s
+ self.assertEqual(
+ [e, f, g],
+ list(a))
+
+ def test_setslice_all_replace(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(a, 'd')
+
+ s = [b, c, d]
+ a[:] = s
+ self.assertEqual(
+ [b, c, d],
+ list(a))
+
+ def test_setslice_all_replace_reversed(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(a, 'd')
+
+ s = [d, c, b]
+ a[:] = s
+ self.assertEqual(
+ [d, c, b],
+ list(a))
+
+ def test_setslice_all_replace_reversed_ns1(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('{ns}a')
+ b = SubElement(a, '{ns}b', {'{ns1}a1': 'test'})
+ c = SubElement(a, '{ns}c', {'{ns2}a2': 'test'})
+ d = SubElement(a, '{ns}d', {'{ns3}a3': 'test'})
+
+ s = [d, c, b]
+ a[:] = s
+ self.assertEqual(
+ [d, c, b],
+ list(a))
+ self.assertEqual(
+ ['{ns}d', '{ns}c', '{ns}b'],
+ [ child.tag for child in a ])
+
+ self.assertEqual(
+ [['{ns3}a3'], ['{ns2}a2'], ['{ns1}a1']],
+ [ list(child.attrib.keys()) for child in a ])
+
+ def test_setslice_all_replace_reversed_ns2(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('{ns}a')
+ b = SubElement(a, '{ns1}b', {'{ns}a1': 'test'})
+ c = SubElement(a, '{ns2}c', {'{ns}a2': 'test'})
+ d = SubElement(a, '{ns3}d', {'{ns}a3': 'test'})
+
+ s = [d, c, b]
+ a[:] = s
+ self.assertEqual(
+ [d, c, b],
+ list(a))
+ self.assertEqual(
+ ['{ns3}d', '{ns2}c', '{ns1}b'],
+ [ child.tag for child in a ])
+
+ self.assertEqual(
+ [['{ns}a3'], ['{ns}a2'], ['{ns}a1']],
+ [ list(child.attrib.keys()) for child in a ])
+
+ def test_setslice_end(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+
+ e = Element('e')
+ f = Element('f')
+ g = Element('g')
+ h = Element('h')
+
+ s = [e, f]
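+ # assigning to a slice past the end appends the new elements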
+ a[99:] = s
+ self.assertEqual(
+ [b, c, e, f],
+ list(a))
+
+ s = [g, h]
+ a[:0] = s
+ self.assertEqual(
+ [g, h, b, c, e, f],
+ list(a))
+
+ def test_setslice_end_exact(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(a, 'd')
+
+ e = Element('e')
+ f = Element('f')
+ g = Element('g')
+
+ s = [e, f, g]
+ a[3:] = s
+ self.assertEqual(
+ [b, c, d, e, f, g],
+ list(a))
+
+ def test_setslice_single(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+
+ e = Element('e')
+ f = Element('f')
+
+ s = [e]
+ a[0:1] = s
+ self.assertEqual(
+ [e, c],
+ list(a))
+
+ s = [f]
+ a[1:2] = s
+ self.assertEqual(
+ [e, f],
+ list(a))
+
+ def test_setslice_tail(self):
+ ElementTree = self.etree.ElementTree
+ Element = self.etree.Element
+ f = BytesIO('<a><b></b>B2<c></c>C2<d></d>D2<e></e>E2</a>')
+ doc = ElementTree(file=f)
+ a = doc.getroot()
+ x = Element('x')
+ y = Element('y')
+ z = Element('z')
+ x.tail = 'X2'
+ y.tail = 'Y2'
+ z.tail = 'Z2'
+ a[1:3] = [x, y, z]
+ self.assertXML(
+ _bytes('<a><b></b>B2<x></x>X2<y></y>Y2<z></z>Z2<e></e>E2</a>'),
+ a)
+
+ def test_setslice_negative(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(a, 'd')
+
+ x = Element('x')
+ y = Element('y')
+
+ a[1:-1] = [x, y]
+ self.assertEqual(
+ [b, x, y, d],
+ list(a))
+
+ def test_setslice_negative2(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(a, 'd')
+
+ x = Element('x')
+ y = Element('y')
+
+ a[1:-2] = [x, y]
+ self.assertEqual(
+ [b, x, y, c, d],
+ list(a))
+
+ def test_setslice_empty(self):
+ Element = self.etree.Element
+
+ a = Element('a')
+
+ b = Element('b')
+ c = Element('c')
+
+ a[:] = [b, c]
+ self.assertEqual(
+ [b, c],
+ list(a))
+
+ def test_tail_elementtree_root(self):
+ Element = self.etree.Element
+ ElementTree = self.etree.ElementTree
+
+ a = Element('a')
+ a.tail = 'A2'
+ t = ElementTree(element=a)
+ self.assertEqual('A2',
+ a.tail)
+
+ def test_ns_access(self):
+ ElementTree = self.etree.ElementTree
+ ns = 'http://xml.infrae.com/1'
+ f = BytesIO('<x:a xmlns:x="%s"><x:b></x:b></x:a>' % ns)
+ t = ElementTree(file=f)
+ a = t.getroot()
+ self.assertEqual('{%s}a' % ns,
+ a.tag)
+ self.assertEqual('{%s}b' % ns,
+ a[0].tag)
+
+ def test_ns_access2(self):
+ ElementTree = self.etree.ElementTree
+ ns = 'http://xml.infrae.com/1'
+ ns2 = 'http://xml.infrae.com/2'
+ f = BytesIO('<x:a xmlns:x="%s" xmlns:y="%s"><x:b></x:b><y:b></y:b></x:a>' % (ns, ns2))
+ t = ElementTree(file=f)
+ a = t.getroot()
+ self.assertEqual('{%s}a' % ns,
+ a.tag)
+ self.assertEqual('{%s}b' % ns,
+ a[0].tag)
+ self.assertEqual('{%s}b' % ns2,
+ a[1].tag)
+
+ def test_ns_setting(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ ns = 'http://xml.infrae.com/1'
+ ns2 = 'http://xml.infrae.com/2'
+ a = Element('{%s}a' % ns)
+ b = SubElement(a, '{%s}b' % ns2)
+ c = SubElement(a, '{%s}c' % ns)
+ self.assertEqual('{%s}a' % ns,
+ a.tag)
+ self.assertEqual('{%s}b' % ns2,
+ b.tag)
+ self.assertEqual('{%s}c' % ns,
+ c.tag)
+ self.assertEqual('{%s}a' % ns,
+ a.tag)
+ self.assertEqual('{%s}b' % ns2,
+ b.tag)
+ self.assertEqual('{%s}c' % ns,
+ c.tag)
+
+ def test_ns_tag_parse(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ ElementTree = self.etree.ElementTree
+
+ ns = 'http://xml.infrae.com/1'
+ ns2 = 'http://xml.infrae.com/2'
+ f = BytesIO('<a xmlns="%s" xmlns:x="%s"><x:b></x:b><b></b></a>' % (ns, ns2))
+ t = ElementTree(file=f)
+
+ a = t.getroot()
+ self.assertEqual('{%s}a' % ns,
+ a.tag)
+ self.assertEqual('{%s}b' % ns2,
+ a[0].tag)
+ self.assertEqual('{%s}b' % ns,
+ a[1].tag)
+
+ def test_ns_attr(self):
+ Element = self.etree.Element
+ ns = 'http://xml.infrae.com/1'
+ ns2 = 'http://xml.infrae.com/2'
+ a = Element('a')
+ a.set('{%s}foo' % ns, 'Foo')
+ a.set('{%s}bar' % ns2, 'Bar')
+ self.assertEqual(
+ 'Foo',
+ a.get('{%s}foo' % ns))
+ self.assertEqual(
+ 'Bar',
+ a.get('{%s}bar' % ns2))
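+ # the serialiser may map ns0/ns1 to either namespace, so accept both orders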
+ try:
+ self.assertXML(
+ _bytes('<a xmlns:ns0="%s" xmlns:ns1="%s" ns0:foo="Foo" ns1:bar="Bar"></a>' % (ns, ns2)),
+ a)
+ except AssertionError:
+ self.assertXML(
+ _bytes('<a xmlns:ns0="%s" xmlns:ns1="%s" ns1:foo="Foo" ns0:bar="Bar"></a>' % (ns2, ns)),
+ a)
+
+ def test_ns_move(self):
+ Element = self.etree.Element
+ one = self.etree.fromstring(
+ _bytes('<foo><bar xmlns:ns="http://a.b.c"><ns:baz/></bar></foo>'))
+ baz = one[0][0]
+
+ two = Element('root')
+ two.append(baz)
+ # removing the originating document used to cause a crash/error here,
+ # as the namespace declaration was not moved along with the element
+ del one, baz
+ self.assertEqual('{http://a.b.c}baz', two[0].tag)
+
+ def test_ns_decl_tostring(self):
+ tostring = self.etree.tostring
+ root = self.etree.XML(
+ _bytes('<foo><bar xmlns:ns="http://a.b.c"><ns:baz/></bar></foo>'))
+ baz = root[0][0]
+
+ nsdecl = re.findall(_bytes("xmlns(?::[a-z0-9]+)?=[\"']([^\"']+)[\"']"),
+ tostring(baz))
+ self.assertEqual([_bytes("http://a.b.c")], nsdecl)
+
+ def test_ns_decl_tostring_default(self):
+ tostring = self.etree.tostring
+ root = self.etree.XML(
+ _bytes('<foo><bar xmlns="http://a.b.c"><baz/></bar></foo>'))
+ baz = root[0][0]
+
+ nsdecl = re.findall(_bytes("xmlns(?::[a-z0-9]+)?=[\"']([^\"']+)[\"']"),
+ tostring(baz))
+ self.assertEqual([_bytes("http://a.b.c")], nsdecl)
+
+ def test_ns_decl_tostring_root(self):
+ tostring = self.etree.tostring
+ root = self.etree.XML(
+ _bytes('<foo xmlns:ns="http://a.b.c"><bar><ns:baz/></bar></foo>'))
+ baz = root[0][0]
+
+ nsdecl = re.findall(_bytes("xmlns(?::[a-z0-9]+)?=[\"']([^\"']+)[\"']"),
+ tostring(baz))
+
+ self.assertEqual([_bytes("http://a.b.c")], nsdecl)
+
+ def test_ns_decl_tostring_element(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ root = Element("foo")
+ bar = SubElement(root, "{http://a.b.c}bar")
+ baz = SubElement(bar, "{http://a.b.c}baz")
+
+ nsdecl = re.findall(_bytes("xmlns(?::[a-z0-9]+)?=[\"']([^\"']+)[\"']"),
+ self.etree.tostring(baz))
+
+ self.assertEqual([_bytes("http://a.b.c")], nsdecl)
+
+ def test_attribute_xmlns_move(self):
+ Element = self.etree.Element
+
+ root = Element('element')
+
+ subelement = Element('subelement',
+ {"{http://www.w3.org/XML/1998/namespace}id": "foo"})
+ self.assertEqual(1, len(subelement.attrib))
+ self.assertEqual(
+ "foo",
+ subelement.get("{http://www.w3.org/XML/1998/namespace}id"))
+
+ root.append(subelement)
+ self.assertEqual(1, len(subelement.attrib))
+ self.assertEqual(
+ list({"{http://www.w3.org/XML/1998/namespace}id" : "foo"}.items()),
+ list(subelement.attrib.items()))
+ self.assertEqual(
+ "foo",
+ subelement.get("{http://www.w3.org/XML/1998/namespace}id"))
+
+ def test_namespaces_after_serialize(self):
+ parse = self.etree.parse
+ tostring = self.etree.tostring
+
+ ns_href = "http://a.b.c"
+ one = parse(
+ BytesIO('<foo><bar xmlns:ns="%s"><ns:baz/></bar></foo>' % ns_href))
+ baz = one.getroot()[0][0]
+
+ parsed = parse(BytesIO( tostring(baz) )).getroot()
+ self.assertEqual('{%s}baz' % ns_href, parsed.tag)
+
+ def test_attribute_namespace_roundtrip(self):
+ fromstring = self.etree.fromstring
+ tostring = self.etree.tostring
+
+ ns_href = "http://a.b.c"
+ xml = _bytes('<root xmlns="%s" xmlns:x="%s"><el x:a="test" /></root>' % (
+ ns_href,ns_href))
+ root = fromstring(xml)
+ self.assertEqual('test', root[0].get('{%s}a' % ns_href))
+
+ xml2 = tostring(root)
+ self.assertTrue(_bytes(':a=') in xml2, xml2)
+
+ root2 = fromstring(xml2)
+ self.assertEqual('test', root2[0].get('{%s}a' % ns_href))
+
+ def test_attribute_namespace_roundtrip_replaced(self):
+ fromstring = self.etree.fromstring
+ tostring = self.etree.tostring
+
+ ns_href = "http://a.b.c"
+ xml = _bytes('<root xmlns="%s" xmlns:x="%s"><el x:a="test" /></root>' % (
+ ns_href,ns_href))
+ root = fromstring(xml)
+ self.assertEqual('test', root[0].get('{%s}a' % ns_href))
+
+ root[0].set('{%s}a' % ns_href, 'TEST')
+
+ xml2 = tostring(root)
+ self.assertTrue(_bytes(':a=') in xml2, xml2)
+
+ root2 = fromstring(xml2)
+ self.assertEqual('TEST', root2[0].get('{%s}a' % ns_href))
+
+ required_versions_ET['test_register_namespace'] = (1,3)
+ def test_register_namespace(self):
+ # ET 1.3+
+ Element = self.etree.Element
+ prefix = 'TESTPREFIX'
+ namespace = 'http://seriously.unknown/namespace/URI'
+
+ el = Element('{%s}test' % namespace)
+ self.assertEqual(_bytes('<ns0:test xmlns:ns0="%s"></ns0:test>' % namespace),
+ self._writeElement(el))
+
+ self.etree.register_namespace(prefix, namespace)
+ el = Element('{%s}test' % namespace)
+ self.assertEqual(_bytes('<%s:test xmlns:%s="%s"></%s:test>' % (
+ prefix, prefix, namespace, prefix)),
+ self._writeElement(el))
+
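+ # prefixes of the reserved 'ns<number>' form are rejected with a ValueError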
+ self.assertRaises(ValueError, self.etree.register_namespace, 'ns25', namespace)
+
+ def test_tostring(self):
+ tostring = self.etree.tostring
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+
+ self.assertEqual(_bytes('<a><b></b><c></c></a>'),
+ canonicalize(tostring(a)))
+
+ def test_tostring_element(self):
+ tostring = self.etree.tostring
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(c, 'd')
+ self.assertEqual(_bytes('<b></b>'),
+ canonicalize(tostring(b)))
+ self.assertEqual(_bytes('<c><d></d></c>'),
+ canonicalize(tostring(c)))
+
+ def test_tostring_element_tail(self):
+ tostring = self.etree.tostring
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(c, 'd')
+ b.tail = 'Foo'
+
+ self.assertTrue(tostring(b) == _bytes('<b/>Foo') or
+ tostring(b) == _bytes('<b />Foo'))
+
+ required_versions_ET['test_tostring_method_html'] = (1,3)
+ def test_tostring_method_html(self):
+ tostring = self.etree.tostring
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ html = Element('html')
+ body = SubElement(html, 'body')
+ p = SubElement(body, 'p')
+ p.text = "html"
+ SubElement(p, 'br').tail = "test"
+
+ self.assertEqual(_bytes('<html><body><p>html<br>test</p></body></html>'),
+ tostring(html, method="html"))
+
+ required_versions_ET['test_tostring_method_text'] = (1,3)
+ def test_tostring_method_text(self):
+ tostring = self.etree.tostring
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ a.text = "A"
+ a.tail = "tail"
+ b = SubElement(a, 'b')
+ b.text = "B"
+ b.tail = "TAIL"
+ c = SubElement(a, 'c')
+ c.text = "C"
+
+ self.assertEqual(_bytes('ABTAILCtail'),
+ tostring(a, method="text"))
+
+ def test_iterparse(self):
+ iterparse = self.etree.iterparse
+ f = BytesIO('<a><b></b><c/></a>')
+
+ iterator = iterparse(f)
+ self.assertEqual(None,
+ iterator.root)
+ events = list(iterator)
+ root = iterator.root
+ self.assertEqual(
+ [('end', root[0]), ('end', root[1]), ('end', root)],
+ events)
+
+ def test_iterparse_incomplete(self):
+ iterparse = self.etree.iterparse
+ f = BytesIO('<a><b></b><c/></a>')
+
+ iterator = iterparse(f)
+ self.assertEqual(None,
+ iterator.root)
+ event, element = next(iter(iterator))
+ self.assertEqual('end', event)
+ self.assertEqual('b', element.tag)
+
+ def test_iterparse_file(self):
+ iterparse = self.etree.iterparse
+ iterator = iterparse(fileInTestDir("test.xml"))
+ self.assertEqual(None,
+ iterator.root)
+ events = list(iterator)
+ root = iterator.root
+ self.assertEqual(
+ [('end', root[0]), ('end', root)],
+ events)
+
+ def test_iterparse_start(self):
+ iterparse = self.etree.iterparse
+ f = BytesIO('<a><b></b><c/></a>')
+
+ iterator = iterparse(f, events=('start',))
+ events = list(iterator)
+ root = iterator.root
+ self.assertEqual(
+ [('start', root), ('start', root[0]), ('start', root[1])],
+ events)
+
+ def test_iterparse_start_end(self):
+ iterparse = self.etree.iterparse
+ f = BytesIO('<a><b></b><c/></a>')
+
+ iterator = iterparse(f, events=('start','end'))
+ events = list(iterator)
+ root = iterator.root
+ self.assertEqual(
+ [('start', root), ('start', root[0]), ('end', root[0]),
+ ('start', root[1]), ('end', root[1]), ('end', root)],
+ events)
+
+ def test_iterparse_clear(self):
+ iterparse = self.etree.iterparse
+ f = BytesIO('<a><b></b><c/></a>')
+
+ iterator = iterparse(f)
+ for event, elem in iterator:
+ elem.clear()
+
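+ # clearing each element on its 'end' event leaves the root without children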
+ root = iterator.root
+ self.assertEqual(0,
+ len(root))
+
+ def test_iterparse_large(self):
+ iterparse = self.etree.iterparse
+ CHILD_COUNT = 12345
+ f = BytesIO('<a>%s</a>' % ('<b>test</b>'*CHILD_COUNT))
+
+ i = 0
+ for key in iterparse(f):
+ event, element = key
+ i += 1
+ self.assertEqual(i, CHILD_COUNT + 1)
+
+ def test_iterparse_set_ns_attribute(self):
+ iterparse = self.etree.iterparse
+ f = BytesIO('<a xmlns="http://ns1/"><b><c xmlns="http://ns2/"/></b></a>')
+
+ attr_name = '{http://testns/}bla'
+ events = []
+ iterator = iterparse(f, events=('start','end','start-ns','end-ns'))
+ for event, elem in iterator:
+ events.append(event)
+ if event == 'start':
+ if elem.tag != '{http://ns1/}a':
+ elem.set(attr_name, 'value')
+
+ self.assertEqual(
+ ['start-ns', 'start', 'start', 'start-ns', 'start',
+ 'end', 'end-ns', 'end', 'end', 'end-ns'],
+ events)
+
+ root = iterator.root
+ self.assertEqual(
+ None,
+ root.get(attr_name))
+ self.assertEqual(
+ 'value',
+ root[0].get(attr_name))
+
+ def test_iterparse_only_end_ns(self):
+ iterparse = self.etree.iterparse
+ f = BytesIO('<a xmlns="http://ns1/"><b><c xmlns="http://ns2/"/></b></a>')
+
+ attr_name = '{http://testns/}bla'
+ events = []
+ iterator = iterparse(f, events=('start','end','start-ns','end-ns'))
+ for event, elem in iterator:
+ events.append(event)
+ if event == 'start':
+ if elem.tag != '{http://ns1/}a':
+ elem.set(attr_name, 'value')
+
+ self.assertEqual(
+ ['start-ns', 'start', 'start', 'start-ns', 'start',
+ 'end', 'end-ns', 'end', 'end', 'end-ns'],
+ events)
+
+ root = iterator.root
+ self.assertEqual(
+ None,
+ root.get(attr_name))
+ self.assertEqual(
+ 'value',
+ root[0].get(attr_name))
+
+ def test_iterparse_move_elements(self):
+ iterparse = self.etree.iterparse
+ f = BytesIO('<a><b><d/></b><c/></a>')
+
+ for event, node in iterparse(f): pass
+
+ root = self.etree.Element('new_root', {})
+ root[:] = node[:]
+
+ self.assertEqual(
+ ['b', 'c'],
+ [ el.tag for el in root ])
+
+ def test_iterparse_cdata(self):
+ tostring = self.etree.tostring
+ f = BytesIO('<root><![CDATA[test]]></root>')
+ context = self.etree.iterparse(f)
+ content = [ el.text for event,el in context ]
+
+ self.assertEqual(['test'], content)
+ self.assertEqual(_bytes('<root>test</root>'),
+ tostring(context.root))
+
+ def test_parse_file(self):
+ parse = self.etree.parse
+ # from file
+ tree = parse(fileInTestDir('test.xml'))
+ self.assertXML(
+ _bytes('<a><b></b></a>'),
+ tree.getroot())
+
+ def test_parse_file_nonexistent(self):
+ parse = self.etree.parse
+ self.assertRaises(IOError, parse, fileInTestDir('notthere.xml'))
+
+ def test_parse_error_none(self):
+ parse = self.etree.parse
+ self.assertRaises(TypeError, parse, None)
+
+ required_versions_ET['test_parse_error'] = (1,3)
+ def test_parse_error(self):
+ # ET < 1.3 raises ExpatError
+ parse = self.etree.parse
+ f = BytesIO('<a><b></c></b></a>')
+ self.assertRaises(SyntaxError, parse, f)
+ f.close()
+
+ required_versions_ET['test_parse_error_from_file'] = (1,3)
+ def test_parse_error_from_file(self):
+ parse = self.etree.parse
+ # from file
+ f = open(fileInTestDir('test_broken.xml'), 'rb')
+ self.assertRaises(SyntaxError, parse, f)
+ f.close()
+
+ def test_parse_file_object(self):
+ parse = self.etree.parse
+ # from file object
+ f = open(fileInTestDir('test.xml'), 'rb')
+ tree = parse(f)
+ f.close()
+ self.assertXML(
+ _bytes('<a><b></b></a>'),
+ tree.getroot())
+
+ def test_parse_stringio(self):
+ parse = self.etree.parse
+ f = BytesIO('<a><b></b></a>')
+ tree = parse(f)
+ f.close()
+ self.assertXML(
+ _bytes('<a><b></b></a>'),
+ tree.getroot()
+ )
+
+ def test_parse_cdata(self):
+ tostring = self.etree.tostring
+ root = self.etree.XML(_bytes('<root><![CDATA[test]]></root>'))
+
+ self.assertEqual('test', root.text)
+ self.assertEqual(_bytes('<root>test</root>'),
+ tostring(root))
+
+ def test_parse_with_encoding(self):
+ # this can fail in libxml2 <= 2.6.22
+ parse = self.etree.parse
+ tree = parse(BytesIO('<?xml version="1.0" encoding="ascii"?><html/>'))
+ self.assertXML(_bytes('<html></html>'),
+ tree.getroot())
+
+ def test_encoding(self):
+ Element = self.etree.Element
+
+ a = Element('a')
+ a.text = _str('Søk på nettet')
+ self.assertXML(
+ _str('<a>Søk på nettet</a>').encode('UTF-8'),
+ a, 'utf-8')
+
+ def test_encoding_exact(self):
+ ElementTree = self.etree.ElementTree
+ Element = self.etree.Element
+
+ a = Element('a')
+ a.text = _str('Søk på nettet')
+
+ f = BytesIO()
+ tree = ElementTree(element=a)
+ tree.write(f, encoding='utf-8')
+ self.assertEqual(_str('<a>Søk på nettet</a>').encode('UTF-8'),
+ f.getvalue().replace(_bytes('\n'),_bytes('')))
+
+ def test_parse_file_encoding(self):
+ parse = self.etree.parse
+ # from file
+ tree = parse(fileInTestDir('test-string.xml'))
+ self.assertXML(
+ _str('<a>Søk på nettet</a>').encode('UTF-8'),
+ tree.getroot(), 'UTF-8')
+
+ def test_parse_file_object_encoding(self):
+ parse = self.etree.parse
+ # from file object
+ f = open(fileInTestDir('test-string.xml'), 'rb')
+ tree = parse(f)
+ f.close()
+ self.assertXML(
+ _str('<a>Søk på nettet</a>').encode('UTF-8'),
+ tree.getroot(), 'UTF-8')
+
+ def test_encoding_8bit_latin1(self):
+ ElementTree = self.etree.ElementTree
+ Element = self.etree.Element
+
+ a = Element('a')
+ a.text = _str('Søk på nettet')
+
+ f = BytesIO()
+ tree = ElementTree(element=a)
+ tree.write(f, encoding='iso-8859-1')
+ result = f.getvalue()
+ declaration = _bytes("<?xml version=\'1.0\' encoding=\'iso-8859-1\'?>")
+ self.assertEncodingDeclaration(result, _bytes('iso-8859-1'))
+ result = result.split(_bytes('?>'), 1)[-1].replace(_bytes('\n'),_bytes(''))
+ self.assertEqual(_str('<a>Søk på nettet</a>').encode('iso-8859-1'),
+ result)
+
+ required_versions_ET['test_parse_encoding_8bit_explicit'] = (1,3)
+ def test_parse_encoding_8bit_explicit(self):
+ XMLParser = self.XMLParser
+
+ text = _str('Søk på nettet')
+ xml_latin1 = (_str('<a>%s</a>') % text).encode('iso-8859-1')
+
+ self.assertRaises(self.etree.ParseError,
+ self.etree.parse,
+ BytesIO(xml_latin1))
+
+ tree = self.etree.parse(BytesIO(xml_latin1),
+ XMLParser(encoding="iso-8859-1"))
+ a = tree.getroot()
+ self.assertEqual(a.text, text)
+
+ required_versions_ET['test_parse_encoding_8bit_override'] = (1,3)
+ def test_parse_encoding_8bit_override(self):
+ XMLParser = self.XMLParser
+
+ text = _str('Søk på nettet')
+ wrong_declaration = _str("<?xml version='1.0' encoding='UTF-8'?>")
+ xml_latin1 = (_str('%s<a>%s</a>') % (wrong_declaration, text)
+ ).encode('iso-8859-1')
+
+ self.assertRaises(self.etree.ParseError,
+ self.etree.parse,
+ BytesIO(xml_latin1))
+
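+ # an explicit parser encoding overrides the incorrect XML declaration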
+ tree = self.etree.parse(BytesIO(xml_latin1),
+ XMLParser(encoding="iso-8859-1"))
+ a = tree.getroot()
+ self.assertEqual(a.text, text)
+
+ def _test_wrong_unicode_encoding(self):
+ # raise error on wrong encoding declaration in unicode strings
+ XML = self.etree.XML
+ test_utf = (_str('<?xml version="1.0" encoding="iso-8859-1"?>') +
+ _str('<a>Søk på nettet</a>'))
+ self.assertRaises(SyntaxError, XML, test_utf)
+
+ def test_encoding_write_default_encoding(self):
+ ElementTree = self.etree.ElementTree
+ Element = self.etree.Element
+
+ a = Element('a')
+ a.text = _str('Søk på nettet')
+
+ f = BytesIO()
+ tree = ElementTree(element=a)
+ tree.write(f)
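+ # without an explicit encoding, non-ASCII text is written as character references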
+ data = f.getvalue().replace(_bytes('\n'),_bytes(''))
+ self.assertEqual(
+ _str('<a>Søk på nettet</a>').encode('ASCII', 'xmlcharrefreplace'),
+ data)
+
+ def test_encoding_tostring(self):
+ Element = self.etree.Element
+ tostring = self.etree.tostring
+
+ a = Element('a')
+ a.text = _str('Søk på nettet')
+ self.assertEqual(_str('<a>Søk på nettet</a>').encode('UTF-8'),
+ tostring(a, encoding='utf-8'))
+
+ def test_encoding_tostring_unknown(self):
+ Element = self.etree.Element
+ tostring = self.etree.tostring
+
+ a = Element('a')
+ a.text = _str('Søk på nettet')
+ self.assertRaises(LookupError, tostring, a,
+ encoding='Invalid Encoding')
+
+ def test_encoding_tostring_sub(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ tostring = self.etree.tostring
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ b.text = _str('Søk på nettet')
+ self.assertEqual(_str('<b>Søk på nettet</b>').encode('UTF-8'),
+ tostring(b, encoding='utf-8'))
+
+ def test_encoding_tostring_sub_tail(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ tostring = self.etree.tostring
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ b.text = _str('Søk på nettet')
+ b.tail = _str('Søk')
+ self.assertEqual(_str('<b>Søk på nettet</b>Søk').encode('UTF-8'),
+ tostring(b, encoding='utf-8'))
+
+ def test_encoding_tostring_default_encoding(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ tostring = self.etree.tostring
+
+ a = Element('a')
+ a.text = _str('Søk på nettet')
+
+ expected = _bytes('<a>S&#248;k p&#229; nettet</a>')
+ self.assertEqual(
+ expected,
+ tostring(a))
+
+ def test_encoding_sub_tostring_default_encoding(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ tostring = self.etree.tostring
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ b.text = _str('Søk på nettet')
+
+ expected = _bytes('<b>S&#248;k p&#229; nettet</b>')
+ self.assertEqual(
+ expected,
+ tostring(b))
+
+ def test_encoding_8bit_xml(self):
+ utext = _str('Søk på nettet')
+ uxml = _str('<p>%s</p>') % utext
+ prologue = _bytes('<?xml version="1.0" encoding="iso-8859-1" ?>')
+ isoxml = prologue + uxml.encode('iso-8859-1')
+ tree = self.etree.XML(isoxml)
+ self.assertEqual(utext, tree.text)
+
+ def test_encoding_utf8_bom(self):
+ utext = _str('Søk på nettet')
+ uxml = (_str('<?xml version="1.0" encoding="UTF-8"?>') +
+ _str('<p>%s</p>') % utext)
+ bom = _bytes('\\xEF\\xBB\\xBF').decode("unicode_escape").encode("latin1")
+ xml = bom + uxml.encode("utf-8")
+ tree = self.etree.XML(xml)
+ self.assertEqual(utext, tree.text)
+
+ def test_encoding_8bit_parse_stringio(self):
+ utext = _str('Søk på nettet')
+ uxml = _str('<p>%s</p>') % utext
+ prologue = _bytes('<?xml version="1.0" encoding="iso-8859-1" ?>')
+ isoxml = prologue + uxml.encode('iso-8859-1')
+ el = self.etree.parse(BytesIO(isoxml)).getroot()
+ self.assertEqual(utext, el.text)
+
+ def test_deepcopy_elementtree(self):
+ Element = self.etree.Element
+ ElementTree = self.etree.ElementTree
+
+ a = Element('a')
+ a.text = "Foo"
+ atree = ElementTree(a)
+
+ btree = copy.deepcopy(atree)
+ self.assertEqual("Foo", atree.getroot().text)
+ self.assertEqual("Foo", btree.getroot().text)
+ self.assertFalse(btree is atree)
+ self.assertFalse(btree.getroot() is atree.getroot())
+
+ def test_deepcopy(self):
+ Element = self.etree.Element
+
+ a = Element('a')
+ a.text = 'Foo'
+
+ b = copy.deepcopy(a)
+ self.assertEqual('Foo', b.text)
+
+ b.text = 'Bar'
+ self.assertEqual('Bar', b.text)
+ self.assertEqual('Foo', a.text)
+
+ del a
+ self.assertEqual('Bar', b.text)
+
+ def test_deepcopy_tail(self):
+ Element = self.etree.Element
+
+ a = Element('a')
+ a.tail = 'Foo'
+
+ b = copy.deepcopy(a)
+ self.assertEqual('Foo', b.tail)
+
+ b.tail = 'Bar'
+ self.assertEqual('Bar', b.tail)
+ self.assertEqual('Foo', a.tail)
+
+ del a
+ self.assertEqual('Bar', b.tail)
+
+ def test_deepcopy_subelement(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ root = Element('root')
+ a = SubElement(root, 'a')
+ a.text = 'FooText'
+ a.tail = 'FooTail'
+
+ b = copy.deepcopy(a)
+ self.assertEqual('FooText', b.text)
+ self.assertEqual('FooTail', b.tail)
+
+ b.text = 'BarText'
+ b.tail = 'BarTail'
+ self.assertEqual('BarTail', b.tail)
+ self.assertEqual('FooTail', a.tail)
+ self.assertEqual('BarText', b.text)
+ self.assertEqual('FooText', a.text)
+
+ del a
+ self.assertEqual('BarTail', b.tail)
+ self.assertEqual('BarText', b.text)
+
+ def test_deepcopy_namespaces(self):
+ root = self.etree.XML(_bytes('''<doc xmlns="dns" xmlns:t="tns">
+ <parent><node t:foo="bar" /></parent>
+ </doc>'''))
+ self.assertEqual(
+ root[0][0].get('{tns}foo'),
+ copy.deepcopy(root[0])[0].get('{tns}foo') )
+ self.assertEqual(
+ root[0][0].get('{tns}foo'),
+ copy.deepcopy(root[0][0]).get('{tns}foo') )
+
+ def test_deepcopy_append(self):
+ # previously caused a crash
+ Element = self.etree.Element
+ tostring = self.etree.tostring
+
+ a = Element('a')
+ b = copy.deepcopy(a)
+ a.append( Element('C') )
+ b.append( Element('X') )
+
+ self.assertEqual(_bytes('<a><C/></a>'),
+ tostring(a).replace(_bytes(' '), _bytes('')))
+ self.assertEqual(_bytes('<a><X/></a>'),
+ tostring(b).replace(_bytes(' '), _bytes('')))
+
+ def test_deepcopy_comment(self):
+ # previously caused a crash
+ # not supported by ET < 1.3!
+ Comment = self.etree.Comment
+
+ a = Comment("ONE")
+ b = copy.deepcopy(a)
+ b.text = "ANOTHER"
+
+ self.assertEqual('ONE', a.text)
+ self.assertEqual('ANOTHER', b.text)
+
+ def test_shallowcopy(self):
+ Element = self.etree.Element
+
+ a = Element('a')
+ a.text = 'Foo'
+
+ b = copy.copy(a)
+ self.assertEqual('Foo', b.text)
+
+ b.text = 'Bar'
+ self.assertEqual('Bar', b.text)
+ self.assertEqual('Foo', a.text)
+ # XXX ElementTree will share nodes, but lxml.etree won't.
+
+ def test_shallowcopy_elementtree(self):
+ Element = self.etree.Element
+ ElementTree = self.etree.ElementTree
+
+ a = Element('a')
+ a.text = 'Foo'
+ atree = ElementTree(a)
+
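+ # a shallow copy creates a new tree object but shares the same root element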
+ btree = copy.copy(atree)
+ self.assertFalse(btree is atree)
+ self.assertTrue(btree.getroot() is atree.getroot())
+ self.assertEqual('Foo', atree.getroot().text)
+
+ def _test_element_boolean(self):
+ # deprecated as of ET 1.3/lxml 2.0
+ etree = self.etree
+ e = etree.Element('foo')
+ self.assertEqual(False, bool(e))
+ etree.SubElement(e, 'bar')
+ self.assertEqual(True, bool(e))
+ e = etree.Element('foo')
+ e.text = 'hey'
+ self.assertEqual(False, bool(e))
+ e = etree.Element('foo')
+ e.tail = 'bar'
+ self.assertEqual(False, bool(e))
+ e = etree.Element('foo')
+ e.set('bar', 'Bar')
+ self.assertEqual(False, bool(e))
+
+ def test_multiple_elementrees(self):
+ etree = self.etree
+
+ a = etree.Element('a')
+ b = etree.SubElement(a, 'b')
+
+ t = etree.ElementTree(a)
+ self.assertEqual(self._rootstring(t), _bytes('<a><b/></a>'))
+
+ t1 = etree.ElementTree(a)
+ self.assertEqual(self._rootstring(t1), _bytes('<a><b/></a>'))
+ self.assertEqual(self._rootstring(t), _bytes('<a><b/></a>'))
+
+ t2 = etree.ElementTree(b)
+ self.assertEqual(self._rootstring(t2), _bytes('<b/>'))
+ self.assertEqual(self._rootstring(t1), _bytes('<a><b/></a>'))
+ self.assertEqual(self._rootstring(t), _bytes('<a><b/></a>'))
+
+ def test_qname(self):
+ etree = self.etree
+ qname = etree.QName('myns', 'a')
+ a1 = etree.Element(qname)
+ a2 = etree.SubElement(a1, qname)
+ self.assertEqual(a1.tag, "{myns}a")
+ self.assertEqual(a2.tag, "{myns}a")
+
+ def test_qname_cmp(self):
+ etree = self.etree
+ qname1 = etree.QName('myns', 'a')
+ qname2 = etree.QName('myns', 'a')
+ self.assertEqual(qname1, "{myns}a")
+ self.assertEqual("{myns}a", qname2)
+ self.assertEqual(qname1, qname1)
+ self.assertEqual(qname1, qname2)
+
+ def test_qname_attribute_getset(self):
+ etree = self.etree
+ qname = etree.QName('myns', 'a')
+
+ a = etree.Element(qname)
+ a.set(qname, "value")
+
+ self.assertEqual(a.get(qname), "value")
+ self.assertEqual(a.get("{myns}a"), "value")
+
+ def test_qname_attrib(self):
+ etree = self.etree
+ qname = etree.QName('myns', 'a')
+
+ a = etree.Element(qname)
+ a.attrib[qname] = "value"
+
+ self.assertEqual(a.attrib[qname], "value")
+ self.assertEqual(a.attrib.get(qname), "value")
+
+ self.assertEqual(a.attrib["{myns}a"], "value")
+ self.assertEqual(a.attrib.get("{myns}a"), "value")
+
+ def test_qname_attribute_resolve(self):
+ etree = self.etree
+ qname = etree.QName('http://myns', 'a')
+ a = etree.Element(qname)
+ a.set(qname, qname)
+
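+ # QName values used as attribute values are serialised in prefixed form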
+ self.assertXML(
+ _bytes('<ns0:a xmlns:ns0="http://myns" ns0:a="ns0:a"></ns0:a>'),
+ a)
+
+ def test_qname_attribute_resolve_new(self):
+ etree = self.etree
+ qname = etree.QName('http://myns', 'a')
+ a = etree.Element('a')
+ a.set('a', qname)
+
+ self.assertXML(
+ _bytes('<a xmlns:ns0="http://myns" a="ns0:a"></a>'),
+ a)
+
+ def test_qname_attrib_resolve(self):
+ etree = self.etree
+ qname = etree.QName('http://myns', 'a')
+ a = etree.Element(qname)
+ a.attrib[qname] = qname
+
+ self.assertXML(
+ _bytes('<ns0:a xmlns:ns0="http://myns" ns0:a="ns0:a"></ns0:a>'),
+ a)
+
+ def test_parser_version(self):
+ etree = self.etree
+ parser = etree.XMLParser()
+ if hasattr(parser, "version"):
+ # ElementTree 1.3+, cET
+ self.assertTrue(re.match("[^ ]+ [0-9.]+", parser.version))
+
+ # feed parser interface
+
+ def test_feed_parser_bytes(self):
+ parser = self.XMLParser()
+
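+ # input may be fed in arbitrary chunks, even splitting tokens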
+ parser.feed(_bytes('<?xml version='))
+ parser.feed(_bytes('"1.0"?><ro'))
+ parser.feed(_bytes('ot><'))
+ parser.feed(_bytes('a test="works"/'))
+ parser.feed(_bytes('></root'))
+ parser.feed(_bytes('>'))
+
+ root = parser.close()
+
+ self.assertEqual(root.tag, "root")
+ self.assertEqual(root[0].tag, "a")
+ self.assertEqual(root[0].get("test"), "works")
+
+ def test_feed_parser_unicode(self):
+ parser = self.XMLParser()
+
+ parser.feed(_str('<ro'))
+ parser.feed(_str('ot><'))
+ parser.feed(_str('a test="works"/'))
+ parser.feed(_str('></root'))
+ parser.feed(_str('>'))
+
+ root = parser.close()
+
+ self.assertEqual(root.tag, "root")
+ self.assertEqual(root[0].tag, "a")
+ self.assertEqual(root[0].get("test"), "works")
+
+ required_versions_ET['test_feed_parser_error_close_empty'] = (1,3)
+ def test_feed_parser_error_close_empty(self):
+ ParseError = self.etree.ParseError
+ parser = self.XMLParser()
+ self.assertRaises(ParseError, parser.close)
+
+ required_versions_ET['test_feed_parser_error_close_incomplete'] = (1,3)
+ def test_feed_parser_error_close_incomplete(self):
+ ParseError = self.etree.ParseError
+ parser = self.XMLParser()
+
+ parser.feed('<?xml version=')
+ parser.feed('"1.0"?><ro')
+
+ self.assertRaises(ParseError, parser.close)
+
+ required_versions_ET['test_feed_parser_error_broken'] = (1,3)
+ def test_feed_parser_error_broken(self):
+ ParseError = self.etree.ParseError
+ parser = self.XMLParser()
+
+ parser.feed('<?xml version=')
+ parser.feed('"1.0"?><ro')
+ try:
+ parser.feed('<><><><><><><')
+ except ParseError:
+ # feed() may raise here, but is not required to raise before close()
+ pass
+
+ self.assertRaises(ParseError, parser.close)
+
+ required_versions_ET['test_feed_parser_error_position'] = (1,3)
+ def test_feed_parser_error_position(self):
+ ParseError = self.etree.ParseError
+ parser = self.XMLParser()
+ try:
+ parser.close()
+ except ParseError:
+ e = sys.exc_info()[1]
+ self.assertNotEqual(None, e.code)
+ self.assertNotEqual(0, e.code)
+ self.assertTrue(isinstance(e.position, tuple))
+ self.assertTrue(e.position >= (0, 0))
+
+ # parser target interface
+
+ required_versions_ET['test_parser_target_property'] = (1,3)
+ def test_parser_target_property(self):
+ class Target(object):
+ pass
+
+ target = Target()
+ parser = self.XMLParser(target=target)
+
+ self.assertEqual(target, parser.target)
+
+ def test_parser_target_tag(self):
+ assertEqual = self.assertEqual
+ assertFalse = self.assertFalse
+
+ events = []
+ class Target(object):
+ def start(self, tag, attrib):
+ events.append("start")
+ assertFalse(attrib)
+ assertEqual("TAG", tag)
+ def end(self, tag):
+ events.append("end")
+ assertEqual("TAG", tag)
+ def close(self):
+ return "DONE"
+
+ parser = self.XMLParser(target=Target())
+
+ parser.feed("<TAG/>")
+ done = parser.close()
+
+ self.assertEqual("DONE", done)
+ self.assertEqual(["start", "end"], events)
+
+ def test_parser_target_error_in_start(self):
+ assertEqual = self.assertEqual
+
+ events = []
+ class Target(object):
+ def start(self, tag, attrib):
+ events.append("start")
+ assertEqual("TAG", tag)
+ raise ValueError("TEST")
+ def end(self, tag):
+ events.append("end")
+ assertEqual("TAG", tag)
+ def close(self):
+ return "DONE"
+
+ parser = self.XMLParser(target=Target())
+
+ try:
+ parser.feed("<TAG/>")
+ except ValueError:
+ self.assertTrue('TEST' in str(sys.exc_info()[1]))
+ else:
+ self.assertTrue(False)
+ if 'lxml' in self.etree.__name__:
+ self.assertEqual(["start"], events)
+ else:
+ # cElementTree calls end() as well
+ self.assertTrue("start" in events)
+
+ def test_parser_target_error_in_end(self):
+ assertEqual = self.assertEqual
+
+ events = []
+ class Target(object):
+ def start(self, tag, attrib):
+ events.append("start")
+ assertEqual("TAG", tag)
+ def end(self, tag):
+ events.append("end")
+ assertEqual("TAG", tag)
+ raise ValueError("TEST")
+ def close(self):
+ return "DONE"
+
+ parser = self.XMLParser(target=Target())
+
+ try:
+ parser.feed("<TAG/>")
+ except ValueError:
+ self.assertTrue('TEST' in str(sys.exc_info()[1]))
+ else:
+ self.assertTrue(False)
+ self.assertEqual(["start", "end"], events)
+
+ def test_parser_target_error_in_close(self):
+ assertEqual = self.assertEqual
+
+ events = []
+ class Target(object):
+ def start(self, tag, attrib):
+ events.append("start")
+ assertEqual("TAG", tag)
+ def end(self, tag):
+ events.append("end")
+ assertEqual("TAG", tag)
+ def close(self):
+ raise ValueError("TEST")
+
+ parser = self.XMLParser(target=Target())
+
+ try:
+ parser.feed("<TAG/>")
+ parser.close()
+ except ValueError:
+ self.assertTrue('TEST' in str(sys.exc_info()[1]))
+ else:
+ self.assertTrue(False)
+ self.assertEqual(["start", "end"], events)
+
+ def test_parser_target_error_in_start_and_close(self):
+ assertEqual = self.assertEqual
+
+ events = []
+ class Target(object):
+ def start(self, tag, attrib):
+ events.append("start")
+ assertEqual("TAG", tag)
+ raise IndexError("TEST-IE")
+ def end(self, tag):
+ events.append("end")
+ assertEqual("TAG", tag)
+ def close(self):
+ raise ValueError("TEST-VE")
+
+ parser = self.XMLParser(target=Target())
+
+ try:
+ parser.feed("<TAG/>")
+ parser.close()
+ except IndexError:
+ if 'lxml' in self.etree.__name__:
+ # we try not to swallow the initial exception in Py2
+ self.assertTrue(sys.version_info[0] < 3)
+ self.assertTrue('TEST-IE' in str(sys.exc_info()[1]))
+ except ValueError:
+ if 'lxml' in self.etree.__name__:
+ self.assertTrue(sys.version_info[0] >= 3)
+ self.assertTrue('TEST-VE' in str(sys.exc_info()[1]))
+ else:
+ self.assertTrue(False)
+
+ if 'lxml' in self.etree.__name__:
+ self.assertEqual(["start"], events)
+ else:
+ # cElementTree calls end() as well
+ self.assertTrue("start" in events)
+
+ def test_elementtree_parser_target(self):
+ assertEqual = self.assertEqual
+ assertFalse = self.assertFalse
+ Element = self.etree.Element
+
+ events = []
+ class Target(object):
+ def start(self, tag, attrib):
+ events.append("start")
+ assertFalse(attrib)
+ assertEqual("TAG", tag)
+ def end(self, tag):
+ events.append("end")
+ assertEqual("TAG", tag)
+ def close(self):
+ return Element("DONE")
+
+ parser = self.XMLParser(target=Target())
+ tree = self.etree.ElementTree()
+ tree.parse(BytesIO("<TAG/>"), parser=parser)
+
+ self.assertEqual("DONE", tree.getroot().tag)
+ self.assertEqual(["start", "end"], events)
+
+ def test_parser_target_attrib(self):
+ assertEqual = self.assertEqual
+
+ events = []
+ class Target(object):
+ def start(self, tag, attrib):
+ events.append("start-" + tag)
+ for name, value in attrib.items():
+ assertEqual(tag + name, value)
+ def end(self, tag):
+ events.append("end-" + tag)
+ def close(self):
+ return "DONE"
+
+ parser = self.XMLParser(target=Target())
+
+ parser.feed('<root a="roota" b="rootb"><sub c="subc"/></root>')
+ done = parser.close()
+
+ self.assertEqual("DONE", done)
+ self.assertEqual(["start-root", "start-sub", "end-sub", "end-root"],
+ events)
+
+ def test_parser_target_data(self):
+ events = []
+ class Target(object):
+ def start(self, tag, attrib):
+ events.append("start-" + tag)
+ def end(self, tag):
+ events.append("end-" + tag)
+ def data(self, data):
+ events.append("data-" + data)
+ def close(self):
+ return "DONE"
+
+ parser = self.XMLParser(target=Target())
+
+ parser.feed('<root>A<sub/>B</root>')
+ done = parser.close()
+
+ self.assertEqual("DONE", done)
+ self.assertEqual(["start-root", "data-A", "start-sub",
+ "end-sub", "data-B", "end-root"],
+ events)
+
+ def test_parser_target_entity(self):
+ events = []
+ class Target(object):
+ def __init__(self):
+ self._data = []
+ def _flush_data(self):
+ if self._data:
+ events.append("data-" + ''.join(self._data))
+ del self._data[:]
+ def start(self, tag, attrib):
+ self._flush_data()
+ events.append("start-" + tag)
+ def end(self, tag):
+ self._flush_data()
+ events.append("end-" + tag)
+ def data(self, data):
+ self._data.append(data)
+ def close(self):
+ self._flush_data()
+ return "DONE"
+
+ parser = self.XMLParser(target=Target())
+
+ dtd = '''
+ <!DOCTYPE root [
+ <!ELEMENT root (sub*)>
+ <!ELEMENT sub (#PCDATA)>
+ <!ENTITY ent "an entity">
+ ]>
+ '''
+ parser.feed(dtd+'<root><sub/><sub>this is &ent;</sub><sub/></root>')
+ done = parser.close()
+
+ self.assertEqual("DONE", done)
+ self.assertEqual(["start-root", "start-sub", "end-sub", "start-sub",
+ "data-this is an entity",
+ "end-sub", "start-sub", "end-sub", "end-root"],
+ events)
+
+ required_versions_ET['test_parser_target_entity_unknown'] = (1,3)
+ def test_parser_target_entity_unknown(self):
+ events = []
+ class Target(object):
+ def __init__(self):
+ self._data = []
+ def _flush_data(self):
+ if self._data:
+ events.append("data-" + ''.join(self._data))
+ del self._data[:]
+ def start(self, tag, attrib):
+ self._flush_data()
+ events.append("start-" + tag)
+ def end(self, tag):
+ self._flush_data()
+ events.append("end-" + tag)
+ def data(self, data):
+ self._data.append(data)
+ def close(self):
+ self._flush_data()
+ return "DONE"
+
+ parser = self.XMLParser(target=Target())
+
+ def feed():
+ parser.feed('<root><sub/><sub>some &ent;</sub><sub/></root>')
+ parser.close()
+
+ self.assertRaises(self.etree.ParseError, feed)
+
+ @et_needs_pyversion(3, 8, 0, 'alpha', 4)
+ def test_parser_target_start_end_ns(self):
+ class Builder(list):
+ def start(self, tag, attrib):
+ self.append(("start", tag))
+ def end(self, tag):
+ self.append(("end", tag))
+ def data(self, text):
+ pass
+ def pi(self, target, data):
+ self.append(("pi", target, data))
+ def comment(self, data):
+ self.append(("comment", data))
+ def start_ns(self, prefix, uri):
+ self.append(("start-ns", prefix, uri))
+ def end_ns(self, prefix):
+ self.append(("end-ns", prefix))
+
+ builder = Builder()
+ parser = self.etree.XMLParser(target=builder)
+ parser.feed(textwrap.dedent("""\
+ <?pi data?>
+ <!-- comment -->
+ <root xmlns='namespace'>
+ <element key='value'>text</element>
+ <element>text</element>tail
+ <empty-element/>
+ </root>
+ """))
+ self.assertEqual(builder, [
+ ('pi', 'pi', 'data'),
+ ('comment', ' comment '),
+ ('start-ns', '', 'namespace'),
+ ('start', '{namespace}root'),
+ ('start', '{namespace}element'),
+ ('end', '{namespace}element'),
+ ('start', '{namespace}element'),
+ ('end', '{namespace}element'),
+ ('start', '{namespace}empty-element'),
+ ('end', '{namespace}empty-element'),
+ ('end', '{namespace}root'),
+ ('end-ns', ''),
+ ])
+
+ @et_needs_pyversion(3, 8, 0, 'alpha', 4)
+ def test_parser_target_end_ns(self):
+ class Builder(list):
+ def end_ns(self, prefix):
+ self.append(("end-ns", prefix))
+
+ builder = Builder()
+ parser = self.etree.XMLParser(target=builder)
+ parser.feed(textwrap.dedent("""\
+ <?pi data?>
+ <!-- comment -->
+ <root xmlns='namespace' xmlns:p='pns'>
+ <element key='value'>text</element>
+ <p:element>text</p:element>tail
+ <empty-element/>
+ </root>
+ """))
+ self.assertEqual(builder, [
+ ('end-ns', 'p'),
+ ('end-ns', ''),
+ ])
+
+ def test_treebuilder(self):
+ builder = self.etree.TreeBuilder()
+ el = builder.start("root", {'a':'A', 'b':'B'})
+ self.assertEqual("root", el.tag)
+ self.assertEqual({'a':'A', 'b':'B'}, el.attrib)
+ builder.data("ROOTTEXT")
+ el = builder.start("child", {'x':'X', 'y':'Y'})
+ self.assertEqual("child", el.tag)
+ self.assertEqual({'x':'X', 'y':'Y'}, el.attrib)
+ builder.data("CHILDTEXT")
+ el = builder.end("child")
+ self.assertEqual("child", el.tag)
+ self.assertEqual({'x':'X', 'y':'Y'}, el.attrib)
+ self.assertEqual("CHILDTEXT", el.text)
+ self.assertEqual(None, el.tail)
+ builder.data("CHILDTAIL")
+ root = builder.end("root")
+
+ self.assertEqual("root", root.tag)
+ self.assertEqual("ROOTTEXT", root.text)
+ self.assertEqual("CHILDTEXT", root[0].text)
+ self.assertEqual("CHILDTAIL", root[0].tail)
+
+ def test_treebuilder_target(self):
+ parser = self.XMLParser(target=self.etree.TreeBuilder())
+ parser.feed('<root>ROOTTEXT<child>CHILDTEXT</child>CHILDTAIL</root>')
+ root = parser.close()
+
+ self.assertEqual("root", root.tag)
+ self.assertEqual("ROOTTEXT", root.text)
+ self.assertEqual("CHILDTEXT", root[0].text)
+ self.assertEqual("CHILDTAIL", root[0].tail)
+
+ @et_needs_pyversion(3, 8, 0, 'alpha', 4)
+ def test_treebuilder_comment(self):
+ ET = self.etree
+ b = ET.TreeBuilder()
+ self.assertEqual(b.comment('ctext').tag, ET.Comment)
+ self.assertEqual(b.comment('ctext').text, 'ctext')
+
+ b = ET.TreeBuilder(comment_factory=ET.Comment)
+ self.assertEqual(b.comment('ctext').tag, ET.Comment)
+ self.assertEqual(b.comment('ctext').text, 'ctext')
+
+ #b = ET.TreeBuilder(comment_factory=len)
+ #self.assertEqual(b.comment('ctext'), len('ctext'))
+
+ @et_needs_pyversion(3, 8, 0, 'alpha', 4)
+ def test_treebuilder_pi(self):
+ ET = self.etree
+ is_lxml = ET.__name__ == 'lxml.etree'
+
+ b = ET.TreeBuilder()
+ self.assertEqual(b.pi('target', None).tag, ET.PI)
+ if is_lxml:
+ self.assertEqual(b.pi('target', None).target, 'target')
+ else:
+ self.assertEqual(b.pi('target', None).text, 'target')
+
+ b = ET.TreeBuilder(pi_factory=ET.PI)
+ self.assertEqual(b.pi('target').tag, ET.PI)
+ if is_lxml:
+ self.assertEqual(b.pi('target').target, "target")
+ else:
+ self.assertEqual(b.pi('target').text, "target")
+ self.assertEqual(b.pi('pitarget', ' text ').tag, ET.PI)
+ if is_lxml:
+ self.assertEqual(b.pi('pitarget', ' text ').target, "pitarget")
+ self.assertEqual(b.pi('pitarget', ' text ').text, " text ")
+ else:
+ self.assertEqual(b.pi('pitarget', ' text ').text, "pitarget text ")
+
+ #b = ET.TreeBuilder(pi_factory=lambda target, text: (len(target), text))
+ #self.assertEqual(b.pi('target'), (len('target'), None))
+ #self.assertEqual(b.pi('pitarget', ' text '), (len('pitarget'), ' text '))
+
+ def test_late_tail(self):
+ # Issue #37399: The tail of an ignored comment could overwrite the text before it.
+ ET = self.etree
+ class TreeBuilderSubclass(ET.TreeBuilder):
+ pass
+
+ if ET.__name__ == 'lxml.etree':
+ def assert_content(a):
+ self.assertEqual(a.text, "text")
+ self.assertEqual(a[0].tail, "tail")
+ else:
+ def assert_content(a):
+ self.assertEqual(a.text, "texttail")
+
+ xml = "<a>text<!-- comment -->tail</a>"
+ a = ET.fromstring(xml)
+ assert_content(a)
+
+ parser = ET.XMLParser(target=TreeBuilderSubclass())
+ parser.feed(xml)
+ a = parser.close()
+ assert_content(a)
+
+ xml = "<a>text<?pi data?>tail</a>"
+ a = ET.fromstring(xml)
+ assert_content(a)
+
+ xml = "<a>text<?pi data?>tail</a>"
+ parser = ET.XMLParser(target=TreeBuilderSubclass())
+ parser.feed(xml)
+ a = parser.close()
+ assert_content(a)
+
+ @et_needs_pyversion(3, 8, 0, 'alpha', 4)
+ def test_late_tail_mix_pi_comments(self):
+ # Issue #37399: The tail of an ignored comment could overwrite the text before it.
+ # Test appending tails to comments/pis.
+ ET = self.etree
+ class TreeBuilderSubclass(ET.TreeBuilder):
+ pass
+
+ xml = "<a>text<?pi1?> <!-- comment -->\n<?pi2?>tail</a>"
+ parser = ET.XMLParser(target=ET.TreeBuilder(insert_comments=True, insert_pis=False))
+ parser.feed(xml)
+ a = parser.close()
+ self.assertEqual(a[0].text, ' comment ')
+ self.assertEqual(a[0].tail, '\ntail')
+ self.assertEqual(a.text, "text ")
+
+ parser = ET.XMLParser(target=TreeBuilderSubclass(insert_comments=True, insert_pis=False))
+ parser.feed(xml)
+ a = parser.close()
+ self.assertEqual(a[0].text, ' comment ')
+ self.assertEqual(a[0].tail, '\ntail')
+ self.assertEqual(a.text, "text ")
+
+ xml = "<a>text<!-- comment -->\n<?pi data?>tail</a>"
+ parser = ET.XMLParser(target=ET.TreeBuilder(insert_pis=True, insert_comments=False))
+ parser.feed(xml)
+ a = parser.close()
+ self.assertEqual(a[0].text[-4:], 'data')
+ self.assertEqual(a[0].tail, 'tail')
+ self.assertEqual(a.text, "text\n")
+
+ parser = ET.XMLParser(target=TreeBuilderSubclass(insert_pis=True, insert_comments=False))
+ parser.feed(xml)
+ a = parser.close()
+ self.assertEqual(a[0].text[-4:], 'data')
+ self.assertEqual(a[0].tail, 'tail')
+ self.assertEqual(a.text, "text\n")
+
+ # helper methods
+
+ def _writeElement(self, element, encoding='us-ascii'):
+ """Write out element for comparison.
+ """
+ data = self.etree.tostring(element, encoding=encoding)
+ return canonicalize(data)
+
+ def _writeElementFile(self, element, encoding='us-ascii'):
+ """Write out element for comparison, using real file.
+ """
+ ElementTree = self.etree.ElementTree
+ with tmpfile() as filename:
+ with open(filename, 'wb') as f:
+ tree = ElementTree(element=element)
+ tree.write(f, encoding=encoding)
+ with open(filename, 'rb') as f:
+ data = f.read()
+ return canonicalize(data)
+
+ def assertXML(self, expected, element, encoding='us-ascii'):
+ """Writes element out and checks whether it is expected.
+
+ Does this two ways; once using BytesIO, once using a real file.
+ """
+ if isinstance(expected, unicode):
+ expected = expected.encode(encoding)
+ self.assertEqual(expected, self._writeElement(element, encoding))
+ self.assertEqual(expected, self._writeElementFile(element, encoding))
+
+ def assertEncodingDeclaration(self, result, encoding):
+ "Checks if the result XML byte string specifies the encoding."
+ enc_re = r"<\?xml[^>]+ encoding=[\"']([^\"']+)[\"']"
+ if isinstance(result, str):
+ has_encoding = re.compile(enc_re).match
+ else:
+ has_encoding = re.compile(_bytes(enc_re)).match
+ self.assertTrue(has_encoding(result))
+ result_encoding = has_encoding(result).group(1)
+ self.assertEqual(result_encoding.upper(), encoding.upper())
+
+ def _rootstring(self, tree):
+ return self.etree.tostring(tree.getroot()).replace(
+ _bytes(' '), _bytes('')).replace(_bytes('\n'), _bytes(''))
+
+ def _check_element_tree(self, tree):
+ self._check_element(tree.getroot())
+
+ def _check_element(self, element):
+ self.assertTrue(hasattr(element, 'tag'))
+ self.assertTrue(hasattr(element, 'attrib'))
+ self.assertTrue(hasattr(element, 'text'))
+ self.assertTrue(hasattr(element, 'tail'))
+ self._check_string(element.tag)
+ self._check_mapping(element.attrib)
+ if element.text is not None:
+ self._check_string(element.text)
+ if element.tail is not None:
+ self._check_string(element.tail)
+
+ def _check_string(self, string):
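+        # Exercise the basic string protocol on tag/text/tail values:
+        # len(), per-character iteration, concatenation and slicing.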
+ len(string)
+ for char in string:
+ self.assertEqual(1, len(char))
+ new_string = string + ""
+ new_string = string + " "
+ string[:0]
+
+ def _check_mapping(self, mapping):
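+        # Exercise the mapping protocol of element.attrib: len(), keys(),
+        # values(), items(), item access and assignment.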
+ len(mapping)
+ keys = mapping.keys()
+ values = mapping.values()
+ items = mapping.items()
+ for key in keys:
+ item = mapping[key]
+ mapping["key"] = "value"
+ self.assertEqual("value", mapping["key"])
+
+
+class _ElementSlicingTest(unittest.TestCase):
+ etree = None
+
+ def _elem_tags(self, elemlist):
+ return [e.tag for e in elemlist]
+
+ def _subelem_tags(self, elem):
+ return self._elem_tags(list(elem))
+
+ def _make_elem_with_children(self, numchildren):
+ """Create an Element with a tag 'a', with the given amount of children
+ named 'a0', 'a1' ... and so on.
+
+ """
+ e = self.etree.Element('a')
+ for i in range(numchildren):
+ self.etree.SubElement(e, 'a%s' % i)
+ return e
+
+ def test_getslice_single_index(self):
+ e = self._make_elem_with_children(10)
+
+ self.assertEqual(e[1].tag, 'a1')
+ self.assertEqual(e[-2].tag, 'a8')
+
+ self.assertRaises(IndexError, lambda: e[12])
+ self.assertRaises(IndexError, lambda: e[-12])
+
+ def test_getslice_range(self):
+ e = self._make_elem_with_children(6)
+
+ self.assertEqual(self._elem_tags(e[3:]), ['a3', 'a4', 'a5'])
+ self.assertEqual(self._elem_tags(e[3:6]), ['a3', 'a4', 'a5'])
+ self.assertEqual(self._elem_tags(e[3:16]), ['a3', 'a4', 'a5'])
+ self.assertEqual(self._elem_tags(e[3:5]), ['a3', 'a4'])
+ self.assertEqual(self._elem_tags(e[3:-1]), ['a3', 'a4'])
+ self.assertEqual(self._elem_tags(e[:2]), ['a0', 'a1'])
+
+ def test_getslice_steps(self):
+ e = self._make_elem_with_children(10)
+
+ self.assertEqual(self._elem_tags(e[8:10:1]), ['a8', 'a9'])
+ self.assertEqual(self._elem_tags(e[::3]), ['a0', 'a3', 'a6', 'a9'])
+ self.assertEqual(self._elem_tags(e[::8]), ['a0', 'a8'])
+ self.assertEqual(self._elem_tags(e[1::8]), ['a1', 'a9'])
+ self.assertEqual(self._elem_tags(e[3::sys.maxsize]), ['a3'])
+ self.assertEqual(self._elem_tags(e[3::sys.maxsize<<64]), ['a3'])
+
+ def test_getslice_negative_steps(self):
+ e = self._make_elem_with_children(4)
+
+ self.assertEqual(self._elem_tags(e[::-1]), ['a3', 'a2', 'a1', 'a0'])
+ self.assertEqual(self._elem_tags(e[::-2]), ['a3', 'a1'])
+ self.assertEqual(self._elem_tags(e[3::-sys.maxsize]), ['a3'])
+ self.assertEqual(self._elem_tags(e[3::-sys.maxsize-1]), ['a3'])
+ self.assertEqual(self._elem_tags(e[3::-sys.maxsize<<64]), ['a3'])
+
+ def test_delslice(self):
+ e = self._make_elem_with_children(4)
+ del e[0:2]
+ self.assertEqual(self._subelem_tags(e), ['a2', 'a3'])
+
+ e = self._make_elem_with_children(4)
+ del e[0:]
+ self.assertEqual(self._subelem_tags(e), [])
+
+ e = self._make_elem_with_children(4)
+ del e[::-1]
+ self.assertEqual(self._subelem_tags(e), [])
+
+ e = self._make_elem_with_children(4)
+ del e[::-2]
+ self.assertEqual(self._subelem_tags(e), ['a0', 'a2'])
+
+ e = self._make_elem_with_children(4)
+ del e[1::2]
+ self.assertEqual(self._subelem_tags(e), ['a0', 'a2'])
+
+ e = self._make_elem_with_children(2)
+ del e[::2]
+ self.assertEqual(self._subelem_tags(e), ['a1'])
+
+ def test_setslice_single_index(self):
+ e = self._make_elem_with_children(4)
+ e[1] = self.etree.Element('b')
+ self.assertEqual(self._subelem_tags(e), ['a0', 'b', 'a2', 'a3'])
+
+ e[-2] = self.etree.Element('c')
+ self.assertEqual(self._subelem_tags(e), ['a0', 'b', 'c', 'a3'])
+
+ with self.assertRaises(IndexError):
+ e[5] = self.etree.Element('d')
+ with self.assertRaises(IndexError):
+ e[-5] = self.etree.Element('d')
+ self.assertEqual(self._subelem_tags(e), ['a0', 'b', 'c', 'a3'])
+
+ def test_setslice_range(self):
+ e = self._make_elem_with_children(4)
+ e[1:3] = [self.etree.Element('b%s' % i) for i in range(2)]
+ self.assertEqual(self._subelem_tags(e), ['a0', 'b0', 'b1', 'a3'])
+
+ e = self._make_elem_with_children(4)
+ e[1:3] = [self.etree.Element('b')]
+ self.assertEqual(self._subelem_tags(e), ['a0', 'b', 'a3'])
+
+ e = self._make_elem_with_children(4)
+ e[1:3] = [self.etree.Element('b%s' % i) for i in range(3)]
+ self.assertEqual(self._subelem_tags(e), ['a0', 'b0', 'b1', 'b2', 'a3'])
+
+ def test_setslice_steps(self):
+ e = self._make_elem_with_children(6)
+ e[1:5:2] = [self.etree.Element('b%s' % i) for i in range(2)]
+ self.assertEqual(self._subelem_tags(e), ['a0', 'b0', 'a2', 'b1', 'a4', 'a5'])
+
+ e = self._make_elem_with_children(6)
+ with self.assertRaises(ValueError):
+ e[1:5:2] = [self.etree.Element('b')]
+ with self.assertRaises(ValueError):
+ e[1:5:2] = [self.etree.Element('b%s' % i) for i in range(3)]
+ with self.assertRaises(ValueError):
+ e[1:5:2] = []
+ self.assertEqual(self._subelem_tags(e), ['a0', 'a1', 'a2', 'a3', 'a4', 'a5'])
+
+ e = self._make_elem_with_children(4)
+ e[1::sys.maxsize] = [self.etree.Element('b')]
+ self.assertEqual(self._subelem_tags(e), ['a0', 'b', 'a2', 'a3'])
+ e[1::sys.maxsize<<64] = [self.etree.Element('c')]
+ self.assertEqual(self._subelem_tags(e), ['a0', 'c', 'a2', 'a3'])
+
+ def test_setslice_negative_steps(self):
+ e = self._make_elem_with_children(4)
+ e[2:0:-1] = [self.etree.Element('b%s' % i) for i in range(2)]
+ self.assertEqual(self._subelem_tags(e), ['a0', 'b1', 'b0', 'a3'])
+
+ e = self._make_elem_with_children(4)
+ with self.assertRaises(ValueError):
+ e[2:0:-1] = [self.etree.Element('b')]
+ with self.assertRaises(ValueError):
+ e[2:0:-1] = [self.etree.Element('b%s' % i) for i in range(3)]
+ with self.assertRaises(ValueError):
+ e[2:0:-1] = []
+ self.assertEqual(self._subelem_tags(e), ['a0', 'a1', 'a2', 'a3'])
+
+ e = self._make_elem_with_children(4)
+ e[1::-sys.maxsize] = [self.etree.Element('b')]
+ self.assertEqual(self._subelem_tags(e), ['a0', 'b', 'a2', 'a3'])
+ e[1::-sys.maxsize-1] = [self.etree.Element('c')]
+ self.assertEqual(self._subelem_tags(e), ['a0', 'c', 'a2', 'a3'])
+ e[1::-sys.maxsize<<64] = [self.etree.Element('d')]
+ self.assertEqual(self._subelem_tags(e), ['a0', 'd', 'a2', 'a3'])
+
+
+class _XMLPullParserTest(unittest.TestCase):
+ etree = None
+
+ def _close_and_return_root(self, parser):
+ if 'ElementTree' in self.etree.__name__:
+ # ElementTree's API is a bit unwieldy in Py3.4
+ root = parser._close_and_return_root()
+ else:
+ root = parser.close()
+ return root
+
+ def _feed(self, parser, data, chunk_size=None):
+ if chunk_size is None:
+ parser.feed(data)
+ else:
+ for i in range(0, len(data), chunk_size):
+ parser.feed(data[i:i+chunk_size])
+
+ def assert_events(self, parser, expected, max_events=None):
+ self.assertEqual(
+ [(event, (elem.tag, elem.text))
+ for event, elem in islice(parser.read_events(), max_events)],
+ expected)
+
+ def assert_event_tuples(self, parser, expected, max_events=None):
+ self.assertEqual(
+ list(islice(parser.read_events(), max_events)),
+ expected)
+
+ def assert_event_tags(self, parser, expected, max_events=None):
+ events = islice(parser.read_events(), max_events)
+ self.assertEqual([(action, elem.tag) for action, elem in events],
+ expected)
+
+ def test_simple_xml(self):
+ for chunk_size in (None, 1, 5):
+ #with self.subTest(chunk_size=chunk_size):
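+            # subTest() is left out here, presumably because it is not
+            # available on all supported Python versions; the chunk sizes
+            # are simply looped over instead.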
+ parser = self.etree.XMLPullParser()
+ self.assert_event_tags(parser, [])
+ self._feed(parser, "<!-- comment -->\n", chunk_size)
+ self.assert_event_tags(parser, [])
+ self._feed(parser,
+ "<root>\n <element key='value'>text</element",
+ chunk_size)
+ self.assert_event_tags(parser, [])
+ self._feed(parser, ">\n", chunk_size)
+ self.assert_event_tags(parser, [('end', 'element')])
+ self._feed(parser, "<element>text</element>tail\n", chunk_size)
+ self._feed(parser, "<empty-element/>\n", chunk_size)
+ self.assert_event_tags(parser, [
+ ('end', 'element'),
+ ('end', 'empty-element'),
+ ])
+ self._feed(parser, "</root>\n", chunk_size)
+ self.assert_event_tags(parser, [('end', 'root')])
+ root = self._close_and_return_root(parser)
+ self.assertEqual(root.tag, 'root')
+
+ def test_feed_while_iterating(self):
+ parser = self.etree.XMLPullParser()
+ it = parser.read_events()
+ self._feed(parser, "<root>\n <element key='value'>text</element>\n")
+ action, elem = next(it)
+ self.assertEqual((action, elem.tag), ('end', 'element'))
+ self._feed(parser, "</root>\n")
+ action, elem = next(it)
+ self.assertEqual((action, elem.tag), ('end', 'root'))
+ with self.assertRaises(StopIteration):
+ next(it)
+
+ def test_simple_xml_with_ns(self):
+ parser = self.etree.XMLPullParser()
+ self.assert_event_tags(parser, [])
+ self._feed(parser, "<!-- comment -->\n")
+ self.assert_event_tags(parser, [])
+ self._feed(parser, "<root xmlns='namespace'>\n")
+ self.assert_event_tags(parser, [])
+ self._feed(parser, "<element key='value'>text</element")
+ self.assert_event_tags(parser, [])
+ self._feed(parser, ">\n")
+ self.assert_event_tags(parser, [('end', '{namespace}element')])
+ self._feed(parser, "<element>text</element>tail\n")
+ self._feed(parser, "<empty-element/>\n")
+ self.assert_event_tags(parser, [
+ ('end', '{namespace}element'),
+ ('end', '{namespace}empty-element'),
+ ])
+ self._feed(parser, "</root>\n")
+ self.assert_event_tags(parser, [('end', '{namespace}root')])
+ root = self._close_and_return_root(parser)
+ self.assertEqual(root.tag, '{namespace}root')
+
+ def test_ns_events(self):
+ parser = self.etree.XMLPullParser(events=('start-ns', 'end-ns'))
+ self._feed(parser, "<!-- comment -->\n")
+ self._feed(parser, "<root xmlns='namespace'>\n")
+ self.assertEqual(
+ list(parser.read_events()),
+ [('start-ns', ('', 'namespace'))])
+ self._feed(parser, "<element key='value'>text</element")
+ self._feed(parser, ">\n")
+ self._feed(parser, "<element>text</element>tail\n")
+ self._feed(parser, "<empty-element/>\n")
+ self._feed(parser, "</root>\n")
+ self.assertEqual(list(parser.read_events()), [('end-ns', None)])
+ parser.close()
+
+ def test_ns_events_end_ns_only(self):
+ parser = self.etree.XMLPullParser(events=['end-ns'])
+ self._feed(parser, "<!-- comment -->\n")
+ self._feed(parser, "<root xmlns='namespace' xmlns:a='abc' xmlns:b='xyz'>\n")
+ self.assertEqual(list(parser.read_events()), [])
+ self._feed(parser, "<a:element key='value'>text</a:element")
+ self._feed(parser, ">\n")
+ self._feed(parser, "<b:element>text</b:element>tail\n")
+ self._feed(parser, "<empty-element/>\n")
+ self.assertEqual(list(parser.read_events()), [])
+ self._feed(parser, "</root>\n")
+ self.assertEqual(list(parser.read_events()), [
+ ('end-ns', None),
+ ('end-ns', None),
+ ('end-ns', None),
+ ])
+ parser.close()
+
+ @et_needs_pyversion(3,8)
+ def test_ns_events_start(self):
+ parser = self.etree.XMLPullParser(events=('start-ns', 'start', 'end'))
+ self._feed(parser, "<tag xmlns='abc' xmlns:p='xyz'>\n")
+ self.assert_event_tuples(parser, [
+ ('start-ns', ('', 'abc')),
+ ('start-ns', ('p', 'xyz')),
+ ], max_events=2)
+ self.assert_event_tags(parser, [
+ ('start', '{abc}tag'),
+ ], max_events=1)
+
+ self._feed(parser, "<child />\n")
+ self.assert_event_tags(parser, [
+ ('start', '{abc}child'),
+ ('end', '{abc}child'),
+ ])
+
+ self._feed(parser, "</tag>\n")
+ parser.close()
+ self.assert_event_tags(parser, [
+ ('end', '{abc}tag'),
+ ])
+
+ @et_needs_pyversion(3,8)
+ def test_ns_events_start_end(self):
+ parser = self.etree.XMLPullParser(events=('start-ns', 'start', 'end', 'end-ns'))
+ self._feed(parser, "<tag xmlns='abc' xmlns:p='xyz'>\n")
+ self.assert_event_tuples(parser, [
+ ('start-ns', ('', 'abc')),
+ ('start-ns', ('p', 'xyz')),
+ ], max_events=2)
+ self.assert_event_tags(parser, [
+ ('start', '{abc}tag'),
+ ], max_events=1)
+
+ self._feed(parser, "<child />\n")
+ self.assert_event_tags(parser, [
+ ('start', '{abc}child'),
+ ('end', '{abc}child'),
+ ])
+
+ self._feed(parser, "</tag>\n")
+ parser.close()
+ self.assert_event_tags(parser, [
+ ('end', '{abc}tag'),
+ ], max_events=1)
+ self.assert_event_tuples(parser, [
+ ('end-ns', None),
+ ('end-ns', None),
+ ])
+
+ def test_events(self):
+ parser = self.etree.XMLPullParser(events=())
+ self._feed(parser, "<root/>\n")
+ self.assert_event_tags(parser, [])
+
+ parser = self.etree.XMLPullParser(events=('start', 'end'))
+ self._feed(parser, "<!-- text here -->\n")
+ self.assert_events(parser, [])
+
+ parser = self.etree.XMLPullParser(events=('start', 'end'))
+ self._feed(parser, "<root>\n")
+ self.assert_event_tags(parser, [('start', 'root')])
+ self._feed(parser, "<element key='value'>text</element")
+ self.assert_event_tags(parser, [('start', 'element')])
+ self._feed(parser, ">\n")
+ self.assert_event_tags(parser, [('end', 'element')])
+ self._feed(parser,
+ "<element xmlns='foo'>text<empty-element/></element>tail\n")
+ self.assert_event_tags(parser, [
+ ('start', '{foo}element'),
+ ('start', '{foo}empty-element'),
+ ('end', '{foo}empty-element'),
+ ('end', '{foo}element'),
+ ])
+ self._feed(parser, "</root>")
+ root = self._close_and_return_root(parser)
+ self.assert_event_tags(parser, [('end', 'root')])
+ self.assertEqual(root.tag, 'root')
+
+ parser = self.etree.XMLPullParser(events=('start',))
+ self._feed(parser, "<!-- comment -->\n")
+ self.assert_event_tags(parser, [])
+ self._feed(parser, "<root>\n")
+ self.assert_event_tags(parser, [('start', 'root')])
+ self._feed(parser, "<element key='value'>text</element")
+ self.assert_event_tags(parser, [('start', 'element')])
+ self._feed(parser, ">\n")
+ self.assert_event_tags(parser, [])
+ self._feed(parser,
+ "<element xmlns='foo'>text<empty-element/></element>tail\n")
+ self.assert_event_tags(parser, [
+ ('start', '{foo}element'),
+ ('start', '{foo}empty-element'),
+ ])
+ self._feed(parser, "</root>")
+ root = self._close_and_return_root(parser)
+ self.assertEqual(root.tag, 'root')
+
+ @et_needs_pyversion(3, 8, 0, 'alpha', 4)
+ def test_events_comment(self):
+ parser = self.etree.XMLPullParser(events=('start', 'comment', 'end'))
+ self._feed(parser, "<!-- text here -->\n")
+ self.assert_events(parser, [('comment', (self.etree.Comment, ' text here '))])
+ self._feed(parser, "<!-- more text here -->\n")
+ self.assert_events(parser, [('comment', (self.etree.Comment, ' more text here '))])
+ self._feed(parser, "<root-tag>text")
+ self.assert_event_tags(parser, [('start', 'root-tag')])
+ self._feed(parser, "<!-- inner comment-->\n")
+ self.assert_events(parser, [('comment', (self.etree.Comment, ' inner comment'))])
+ self._feed(parser, "</root-tag>\n")
+ self.assert_event_tags(parser, [('end', 'root-tag')])
+ self._feed(parser, "<!-- outer comment -->\n")
+ self.assert_events(parser, [('comment', (self.etree.Comment, ' outer comment '))])
+
+ parser = self.etree.XMLPullParser(events=('comment',))
+ self._feed(parser, "<!-- text here -->\n")
+ self.assert_events(parser, [('comment', (self.etree.Comment, ' text here '))])
+
+ @et_needs_pyversion(3, 8, 0, 'alpha', 4)
+ def test_events_pi(self):
+        # Note: lxml's PIs expose .target and .text separately; ET's PIs carry both in .text
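+        #   For example, "<?pitarget some text ?>" gives target='pitarget' with
+        #   the data in .text under lxml, while ET reports a single text value
+        #   like 'pitarget some text '.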
+ parser = self.etree.XMLPullParser(events=('start', 'pi', 'end'))
+ self._feed(parser, "<?pitarget?>\n")
+ self.assert_event_tags(parser, [('pi', self.etree.PI)])
+ parser = self.etree.XMLPullParser(events=('pi',))
+ self._feed(parser, "<?pitarget some text ?>\n")
+ self.assert_event_tags(parser, [('pi', self.etree.PI)])
+
+ def test_events_sequence(self):
+ # Test that events can be some sequence that's not just a tuple or list
+ eventset = {'end', 'start'}
+ parser = self.etree.XMLPullParser(events=eventset)
+ self._feed(parser, "<foo>bar</foo>")
+ self.assert_event_tags(parser, [('start', 'foo'), ('end', 'foo')])
+
+ class DummyIter(object):
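+            # Implements both __next__ (Python 3) and next (Python 2) so the
+            # same iterator works under either protocol.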
+ def __init__(self):
+ self.events = iter(['start', 'end', 'start-ns'])
+ def __iter__(self):
+ return self
+ def __next__(self):
+ return next(self.events)
+ def next(self):
+ return next(self.events)
+
+ parser = self.etree.XMLPullParser(events=DummyIter())
+ self._feed(parser, "<foo>bar</foo>")
+ self.assert_event_tags(parser, [('start', 'foo'), ('end', 'foo')])
+
+ def test_unknown_event(self):
+ with self.assertRaises(ValueError):
+ self.etree.XMLPullParser(events=('start', 'end', 'bogus'))
+
+
+class _C14NTest(unittest.TestCase):
+ etree = None
+ maxDiff = None
+
+ if not hasattr(unittest.TestCase, 'subTest'):
+ @contextmanager
+ def subTest(self, name, **kwargs):
+ try:
+ yield
+ except unittest.SkipTest:
+ raise
+ except Exception as e:
+ print("Subtest {} failed: {}".format(name, e))
+ raise
+
+ def _canonicalize(self, input_file, **options):
+ return self.etree.canonicalize(from_file=input_file, **options)
+
+ #
+ # simple roundtrip tests (from c14n.py)
+
+ def c14n_roundtrip(self, xml, **options):
+ return self.etree.canonicalize(xml, **options)
+
+ def test_simple_roundtrip(self):
+ c14n_roundtrip = self.c14n_roundtrip
+ # Basics
+ self.assertEqual(c14n_roundtrip("<doc/>"), '<doc></doc>')
+ self.assertEqual(c14n_roundtrip("<doc xmlns='uri'/>"), # FIXME
+ '<doc xmlns="uri"></doc>')
+ self.assertEqual(c14n_roundtrip("<prefix:doc xmlns:prefix='uri'/>"),
+ '<prefix:doc xmlns:prefix="uri"></prefix:doc>')
+ self.assertEqual(c14n_roundtrip("<doc xmlns:prefix='uri'><prefix:bar/></doc>"),
+ '<doc><prefix:bar xmlns:prefix="uri"></prefix:bar></doc>')
+ self.assertEqual(c14n_roundtrip("<elem xmlns:wsu='http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-wssecurity-utility-1.0.xsd' xmlns:SOAP-ENV='http://schemas.xmlsoap.org/soap/envelope/' />"),
+ '<elem></elem>')
+
+ # C14N spec
+ self.assertEqual(c14n_roundtrip("<doc>Hello, world!<!-- Comment 1 --></doc>"),
+ '<doc>Hello, world!</doc>')
+ self.assertEqual(c14n_roundtrip("<value>&#x32;</value>"),
+ '<value>2</value>')
+ self.assertEqual(c14n_roundtrip('<compute><![CDATA[value>"0" && value<"10" ?"valid":"error"]]></compute>'),
+ '<compute>value&gt;"0" &amp;&amp; value&lt;"10" ?"valid":"error"</compute>')
+ self.assertEqual(c14n_roundtrip('''<compute expr='value>"0" &amp;&amp; value&lt;"10" ?"valid":"error"'>valid</compute>'''),
+ '<compute expr="value>&quot;0&quot; &amp;&amp; value&lt;&quot;10&quot; ?&quot;valid&quot;:&quot;error&quot;">valid</compute>')
+ self.assertEqual(c14n_roundtrip("<norm attr=' &apos; &#x20;&#13;&#xa;&#9; &apos; '/>"),
+ '<norm attr=" \' &#xD;&#xA;&#x9; \' "></norm>')
+ self.assertEqual(c14n_roundtrip("<normNames attr=' A &#x20;&#13;&#xa;&#9; B '/>"),
+ '<normNames attr=" A &#xD;&#xA;&#x9; B "></normNames>')
+ self.assertEqual(c14n_roundtrip("<normId id=' &apos; &#x20;&#13;&#xa;&#9; &apos; '/>"),
+ '<normId id=" \' &#xD;&#xA;&#x9; \' "></normId>')
+
+ # fragments from PJ's tests
+ #self.assertEqual(c14n_roundtrip("<doc xmlns:x='http://example.com/x' xmlns='http://example.com/default'><b y:a1='1' xmlns='http://example.com/default' a3='3' xmlns:y='http://example.com/y' y:a2='2'/></doc>"),
+ #'<doc xmlns:x="http://example.com/x"><b xmlns:y="http://example.com/y" a3="3" y:a1="1" y:a2="2"></b></doc>')
+
+ @et_needs_pyversion(3, 8, 7)
+ @et_exclude_pyversion(3, 9, 0)
+ def test_c14n_namespaces(self):
+ c14n_roundtrip = self.c14n_roundtrip
+ # Namespace issues
+ # https://bugs.launchpad.net/lxml/+bug/1869455
+ xml = '<X xmlns="http://nps/a"><Y targets="abc,xyz"></Y></X>'
+ self.assertEqual(c14n_roundtrip(xml), xml)
+ xml = '<X xmlns="http://nps/a"><Y xmlns="http://nsp/b" targets="abc,xyz"></Y></X>'
+ self.assertEqual(c14n_roundtrip(xml), xml)
+ xml = '<X xmlns="http://nps/a"><Y xmlns:b="http://nsp/b" b:targets="abc,xyz"></Y></X>'
+ self.assertEqual(c14n_roundtrip(xml), xml)
+
+ def test_c14n_exclusion(self):
+ c14n_roundtrip = self.c14n_roundtrip
+ xml = textwrap.dedent("""\
+ <root xmlns:x="http://example.com/x">
+ <a x:attr="attrx">
+ <b>abtext</b>
+ </a>
+ <b>btext</b>
+ <c>
+ <x:d>dtext</x:d>
+ </c>
+ </root>
+ """)
+ self.assertEqual(
+ c14n_roundtrip(xml, strip_text=True),
+ '<root>'
+ '<a xmlns:x="http://example.com/x" x:attr="attrx"><b>abtext</b></a>'
+ '<b>btext</b>'
+ '<c><x:d xmlns:x="http://example.com/x">dtext</x:d></c>'
+ '</root>')
+ self.assertEqual(
+ c14n_roundtrip(xml, strip_text=True, exclude_attrs=['{http://example.com/x}attr']),
+ '<root>'
+ '<a><b>abtext</b></a>'
+ '<b>btext</b>'
+ '<c><x:d xmlns:x="http://example.com/x">dtext</x:d></c>'
+ '</root>')
+ self.assertEqual(
+ c14n_roundtrip(xml, strip_text=True, exclude_tags=['{http://example.com/x}d']),
+ '<root>'
+ '<a xmlns:x="http://example.com/x" x:attr="attrx"><b>abtext</b></a>'
+ '<b>btext</b>'
+ '<c></c>'
+ '</root>')
+ self.assertEqual(
+ c14n_roundtrip(xml, strip_text=True, exclude_attrs=['{http://example.com/x}attr'],
+ exclude_tags=['{http://example.com/x}d']),
+ '<root>'
+ '<a><b>abtext</b></a>'
+ '<b>btext</b>'
+ '<c></c>'
+ '</root>')
+ self.assertEqual(
+ c14n_roundtrip(xml, strip_text=True, exclude_tags=['a', 'b']),
+ '<root>'
+ '<c><x:d xmlns:x="http://example.com/x">dtext</x:d></c>'
+ '</root>')
+ self.assertEqual(
+ c14n_roundtrip(xml, exclude_tags=['a', 'b']),
+ '<root>\n'
+ ' \n'
+ ' \n'
+ ' <c>\n'
+ ' <x:d xmlns:x="http://example.com/x">dtext</x:d>\n'
+ ' </c>\n'
+ '</root>')
+ self.assertEqual(
+ c14n_roundtrip(xml, strip_text=True, exclude_tags=['{http://example.com/x}d', 'b']),
+ '<root>'
+ '<a xmlns:x="http://example.com/x" x:attr="attrx"></a>'
+ '<c></c>'
+ '</root>')
+ self.assertEqual(
+ c14n_roundtrip(xml, exclude_tags=['{http://example.com/x}d', 'b']),
+ '<root>\n'
+ ' <a xmlns:x="http://example.com/x" x:attr="attrx">\n'
+ ' \n'
+ ' </a>\n'
+ ' \n'
+ ' <c>\n'
+ ' \n'
+ ' </c>\n'
+ '</root>')
+
+ #
+    # Basic method=c14n tests from the C14N 2.0 specification, using the
+    # test files in the "c14n-20" directory next to this module.
+
+ # note that this uses generated C14N versions of the standard ET.write
+ # output, not roundtripped C14N (see above).
+
+ def test_xml_c14n2(self):
+ datadir = os.path.join(os.path.dirname(__file__), "c14n-20")
+ full_path = partial(os.path.join, datadir)
+
+ files = [filename[:-4] for filename in sorted(os.listdir(datadir))
+ if filename.endswith('.xml')]
+ input_files = [
+ filename for filename in files
+ if filename.startswith('in')
+ ]
+ configs = {
+ filename: {
+ # <c14n2:PrefixRewrite>sequential</c14n2:PrefixRewrite>
+ option.tag.split('}')[-1]: ((option.text or '').strip(), option)
+ for option in self.etree.parse(full_path(filename) + ".xml").getroot()
+ }
+ for filename in files
+ if filename.startswith('c14n')
+ }
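+        # Each c14n* config file becomes a dict of option name -> (text, element),
+        # e.g. {'TrimTextNodes': ('true', <option element>), ...}.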
+
+ tests = {
+ input_file: [
+ (filename, configs[filename.rsplit('_', 1)[-1]])
+ for filename in files
+ if filename.startswith('out_%s_' % input_file)
+ and filename.rsplit('_', 1)[-1] in configs
+ ]
+ for input_file in input_files
+ }
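+        # Maps each input file to its expected output files, matched via the
+        # config-name suffix of 'out_<input>_<config>'.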
+
+ # Make sure we found all test cases.
+ self.assertEqual(30, len([
+ output_file for output_files in tests.values()
+ for output_file in output_files]))
+
+ def get_option(config, option_name, default=None):
+ return config.get(option_name, (default, ()))[0]
+
+ for input_file, output_files in tests.items():
+ for output_file, config in output_files:
+ keep_comments = get_option(
+ config, 'IgnoreComments') == 'true' # no, it's right :)
+ strip_text = get_option(
+ config, 'TrimTextNodes') == 'true'
+ rewrite_prefixes = get_option(
+ config, 'PrefixRewrite') == 'sequential'
+ if 'QNameAware' in config:
+ qattrs = [
+ "{%s}%s" % (el.get('NS'), el.get('Name'))
+ for el in config['QNameAware'][1].findall(
+ '{http://www.w3.org/2010/xml-c14n2}QualifiedAttr')
+ ]
+ qtags = [
+ "{%s}%s" % (el.get('NS'), el.get('Name'))
+ for el in config['QNameAware'][1].findall(
+ '{http://www.w3.org/2010/xml-c14n2}Element')
+ ]
+ else:
+ qtags = qattrs = None
+
+ # Build subtest description from config.
+ config_descr = ','.join(
+ "%s=%s" % (name, value or ','.join(c.tag.split('}')[-1] for c in children))
+ for name, (value, children) in sorted(config.items())
+ )
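+                # e.g. "IgnoreComments=true,TrimTextNodes=true", or the child
+                # tag names for options like QNameAware.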
+
+ with self.subTest("{}({})".format(output_file, config_descr)):
+ if input_file == 'inNsRedecl' and not rewrite_prefixes:
+ self.skipTest(
+ "Redeclared namespace handling is not supported in {}".format(
+ output_file))
+ if input_file == 'inNsSuperfluous' and not rewrite_prefixes:
+ self.skipTest(
+ "Redeclared namespace handling is not supported in {}".format(
+ output_file))
+ if 'QNameAware' in config and config['QNameAware'][1].find(
+ '{http://www.w3.org/2010/xml-c14n2}XPathElement') is not None:
+ self.skipTest(
+ "QName rewriting in XPath text is not supported in {}".format(
+ output_file))
+
+ f = full_path(input_file + ".xml")
+ if input_file == 'inC14N5':
+ # Hack: avoid setting up external entity resolution in the parser.
+ with open(full_path('world.txt'), 'rb') as entity_file:
+ with open(f, 'rb') as f:
+ f = io.BytesIO(f.read().replace(b'&ent2;', entity_file.read().strip()))
+
+ text = self._canonicalize(
+ f,
+ with_comments=keep_comments,
+ strip_text=strip_text,
+ rewrite_prefixes=rewrite_prefixes,
+ qname_aware_tags=qtags, qname_aware_attrs=qattrs)
+
+ with io.open(full_path(output_file + ".xml"), 'r', encoding='utf8') as f:
+ expected = f.read()
+ if input_file == 'inC14N3' and self.etree is not etree:
+ # FIXME: cET resolves default attributes but ET does not!
+ expected = expected.replace(' attr="default"', '')
+ text = text.replace(' attr="default"', '')
+ self.assertEqual(expected, text)
+
+
+if etree:
+ class ETreeTestCase(_ETreeTestCaseBase):
+ etree = etree
+
+ class ETreePullTestCase(_XMLPullParserTest):
+ etree = etree
+
+ class ETreeElementSlicingTest(_ElementSlicingTest):
+ etree = etree
+
+ class ETreeC14NTest(_C14NTest):
+ etree = etree
+
+ class ETreeC14N2WriteTest(ETreeC14NTest):
+ def _canonicalize(self, input_file, with_comments=True, strip_text=False,
+ rewrite_prefixes=False, qname_aware_tags=None, qname_aware_attrs=None,
+ **options):
+ if rewrite_prefixes or qname_aware_attrs or qname_aware_tags:
+ self.skipTest("C14N 2.0 feature not supported with ElementTree.write()")
+
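+            # attribute_defaults=True lets the parser fill in DTD-declared
+            # default attributes, which the reference C14N output files expect
+            # (cf. the inC14N3 handling in _C14NTest.test_xml_c14n2 above).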
+ parser = self.etree.XMLParser(attribute_defaults=True, collect_ids=False)
+ tree = self.etree.parse(input_file, parser)
+ out = io.BytesIO()
+ tree.write(
+ out, method='c14n2',
+ with_comments=with_comments, strip_text=strip_text,
+ **options)
+ return out.getvalue().decode('utf8')
+
+ class ETreeC14N2TostringTest(ETreeC14NTest):
+ def _canonicalize(self, input_file, with_comments=True, strip_text=False,
+ rewrite_prefixes=False, qname_aware_tags=None, qname_aware_attrs=None,
+ **options):
+ if rewrite_prefixes or qname_aware_attrs or qname_aware_tags:
+ self.skipTest("C14N 2.0 feature not supported with ElementTree.tostring()")
+
+ parser = self.etree.XMLParser(attribute_defaults=True, collect_ids=False)
+ tree = self.etree.parse(input_file, parser)
+ return self.etree.tostring(
+ tree, method='c14n2',
+ with_comments=with_comments, strip_text=strip_text,
+ **options).decode('utf8')
+
+
+if ElementTree:
+ class ElementTreeTestCase(_ETreeTestCaseBase):
+ etree = ElementTree
+
+ @classmethod
+ def setUpClass(cls):
+ if sys.version_info >= (3, 9):
+ return
+ import warnings
+ # ElementTree warns about getiterator() in recent Pythons
+ warnings.filterwarnings(
+ 'ignore',
+ r'This method will be removed.*\.iter\(\).*instead',
+ PendingDeprecationWarning)
+
+ filter_by_version(
+ ElementTreeTestCase,
+ ElementTreeTestCase.required_versions_ET, ET_VERSION)
+
+ if hasattr(ElementTree, 'XMLPullParser'):
+ class ElementTreePullTestCase(_XMLPullParserTest):
+ etree = ElementTree
+ else:
+ ElementTreePullTestCase = None
+
+ if hasattr(ElementTree, 'canonicalize'):
+ class ElementTreeC14NTest(_C14NTest):
+ etree = ElementTree
+ else:
+ ElementTreeC14NTest = None
+
+ class ElementTreeElementSlicingTest(_ElementSlicingTest):
+ etree = ElementTree
+
+
+if cElementTree:
+ class CElementTreeTestCase(_ETreeTestCaseBase):
+ etree = cElementTree
+
+ filter_by_version(
+ CElementTreeTestCase,
+ CElementTreeTestCase.required_versions_cET, CET_VERSION)
+
+ class CElementTreeElementSlicingTest(_ElementSlicingTest):
+ etree = cElementTree
+
+
+def test_suite():
+ suite = unittest.TestSuite()
+ if etree:
+ suite.addTests([unittest.makeSuite(ETreeTestCase)])
+ suite.addTests([unittest.makeSuite(ETreePullTestCase)])
+ suite.addTests([unittest.makeSuite(ETreeElementSlicingTest)])
+ suite.addTests([unittest.makeSuite(ETreeC14NTest)])
+ suite.addTests([unittest.makeSuite(ETreeC14N2WriteTest)])
+ suite.addTests([unittest.makeSuite(ETreeC14N2TostringTest)])
+ if ElementTree:
+ suite.addTests([unittest.makeSuite(ElementTreeTestCase)])
+ if ElementTreePullTestCase:
+ suite.addTests([unittest.makeSuite(ElementTreePullTestCase)])
+ if ElementTreeC14NTest:
+ suite.addTests([unittest.makeSuite(ElementTreeC14NTest)])
+ suite.addTests([unittest.makeSuite(ElementTreeElementSlicingTest)])
+ if cElementTree:
+ suite.addTests([unittest.makeSuite(CElementTreeTestCase)])
+ suite.addTests([unittest.makeSuite(CElementTreeElementSlicingTest)])
+ return suite
+
+if __name__ == '__main__':
+ print('to test use test.py %s' % __file__)
diff --git a/src/lxml/tests/test_errors.py b/src/lxml/tests/test_errors.py
new file mode 100644
index 0000000..c0aee74
--- /dev/null
+++ b/src/lxml/tests/test_errors.py
@@ -0,0 +1,77 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+
+import unittest
+
+# These tests check that error handling in the Pyrex (Cython) code is
+# complete.  If error handling is incomplete, the code is likely to
+# crash instead of failing with a Python exception.
+
+import sys, gc, os.path
+from lxml import etree
+
+from .common_imports import HelperTestCase
+
+
+class ErrorTestCase(HelperTestCase):
+ etree = etree
+
+ def test_bad_element(self):
+ # attrib argument of Element() should be a dictionary, so if
+ # we pass a string we should get an error.
+ self.assertRaises(TypeError, self.etree.Element, 'a', 'b')
+
+ def test_empty_parse(self):
+ self.assertRaises(etree.XMLSyntaxError, etree.fromstring, '')
+
+ def test_element_cyclic_gc_none(self):
+ # test if cyclic reference can crash etree
+ Element = self.etree.Element
+ getrefcount = sys.getrefcount
+
+ # must disable tracing as it could change the refcounts
+ trace_func = sys.gettrace()
+ try:
+ sys.settrace(None)
+ gc.collect()
+
+ count = getrefcount(None)
+
+ l = [Element('name'), Element('name')]
+ l.append(l)
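+            # l now contains itself, forming a reference cycle that only the
+            # cyclic garbage collector can reclaim.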
+
+ del l
+ gc.collect()
+ count = getrefcount(None) - count
+
+ self.assertEqual(count, 0)
+ finally:
+ sys.settrace(trace_func)
+
+ def test_xmlsyntaxerror_has_info(self):
+ broken_xml_name = 'test_broken.xml'
+ broken_xml_path = os.path.join(os.path.dirname(__file__), broken_xml_name)
+ fail_msg = 'test_broken.xml should raise an etree.XMLSyntaxError'
+ try:
+ etree.parse(broken_xml_path)
+ except etree.XMLSyntaxError as e:
+ # invariant
+ self.assertEqual(e.position, (e.lineno, e.offset + 1), 'position and lineno/offset out of sync')
+ # SyntaxError info derived from file & contents
+ self.assertTrue(e.filename.endswith(broken_xml_name), 'filename must be preserved')
+ self.assertEqual(e.lineno, 1)
+ self.assertEqual(e.offset, 10)
+ except Exception as e:
+ self.fail('{0}, not {1}'.format(fail_msg, type(e)))
+ else:
+            self.fail(fail_msg)
+
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([unittest.makeSuite(ErrorTestCase)])
+ return suite
+
+if __name__ == '__main__':
+ print('to test use test.py %s' % __file__)
diff --git a/src/lxml/tests/test_etree.py b/src/lxml/tests/test_etree.py
new file mode 100644
index 0000000..9cf7060
--- /dev/null
+++ b/src/lxml/tests/test_etree.py
@@ -0,0 +1,5381 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests specific to the extended etree API
+
+Tests that apply to the general ElementTree API should go into
+test_elementtree.py instead.
+"""
+
+from __future__ import absolute_import
+
+from collections import OrderedDict
+import os.path
+import unittest
+import copy
+import sys
+import re
+import gc
+import operator
+import textwrap
+import zlib
+import gzip
+
+from .common_imports import etree, StringIO, BytesIO, HelperTestCase
+from .common_imports import fileInTestDir, fileUrlInTestDir, read_file, path2url, tmpfile
+from .common_imports import SillyFileLike, LargeFileLikeUnicode, doctest, make_doctest
+from .common_imports import canonicalize, _str, _bytes
+
+print("""
+TESTED VERSION: %s""" % etree.__version__ + """
+ Python: %r""" % (sys.version_info,) + """
+ lxml.etree: %r""" % (etree.LXML_VERSION,) + """
+ libxml used: %r""" % (etree.LIBXML_VERSION,) + """
+ libxml compiled: %r""" % (etree.LIBXML_COMPILED_VERSION,) + """
+ libxslt used: %r""" % (etree.LIBXSLT_VERSION,) + """
+ libxslt compiled: %r""" % (etree.LIBXSLT_COMPILED_VERSION,) + """
+ FS encoding: %s""" % (sys.getfilesystemencoding(),) + """
+ Default encoding: %s""" % (sys.getdefaultencoding(),) + """
+ Max Unicode: %s""" % (sys.maxunicode,) + """
+""")
+
+try:
+ _unicode = unicode
+except NameError:
+ # Python 3
+ _unicode = str
+
+
+class ETreeOnlyTestCase(HelperTestCase):
+ """Tests only for etree, not ElementTree"""
+ etree = etree
+
+ def test_version(self):
+ self.assertTrue(isinstance(etree.__version__, _unicode))
+ self.assertTrue(isinstance(etree.LXML_VERSION, tuple))
+ self.assertEqual(len(etree.LXML_VERSION), 4)
+ self.assertTrue(isinstance(etree.LXML_VERSION[0], int))
+ self.assertTrue(isinstance(etree.LXML_VERSION[1], int))
+ self.assertTrue(isinstance(etree.LXML_VERSION[2], int))
+ self.assertTrue(isinstance(etree.LXML_VERSION[3], int))
+ self.assertTrue(etree.__version__.startswith(
+ str(etree.LXML_VERSION[0])))
+
+ def test_c_api(self):
+ if hasattr(self.etree, '__pyx_capi__'):
+ # newer Pyrex compatible C-API
+ self.assertTrue(isinstance(self.etree.__pyx_capi__, dict))
+ self.assertTrue(len(self.etree.__pyx_capi__) > 0)
+ else:
+ # older C-API mechanism
+ self.assertTrue(hasattr(self.etree, '_import_c_api'))
+
+ def test_include_paths(self):
+ import lxml
+ includes = lxml.get_include()
+ self.assertTrue(includes)
+ self.assertTrue(len(includes) >= 2)
+ self.assertTrue(os.path.join(os.path.dirname(lxml.__file__), 'includes') in includes,
+ includes)
+
+ def test_element_names(self):
+ Element = self.etree.Element
+ el = Element('name')
+ self.assertEqual(el.tag, 'name')
+ el = Element('{}name')
+ self.assertEqual(el.tag, 'name')
+
+ def test_element_name_empty(self):
+ Element = self.etree.Element
+ el = Element('name')
+ self.assertRaises(ValueError, Element, '{}')
+ self.assertRaises(ValueError, setattr, el, 'tag', '{}')
+
+ self.assertRaises(ValueError, Element, '{test}')
+ self.assertRaises(ValueError, setattr, el, 'tag', '{test}')
+
+ def test_element_name_colon(self):
+ Element = self.etree.Element
+ self.assertRaises(ValueError, Element, 'p:name')
+ self.assertRaises(ValueError, Element, '{test}p:name')
+
+ el = Element('name')
+ self.assertRaises(ValueError, setattr, el, 'tag', 'p:name')
+
+ def test_element_name_quote(self):
+ Element = self.etree.Element
+ self.assertRaises(ValueError, Element, "p'name")
+ self.assertRaises(ValueError, Element, 'p"name')
+
+ self.assertRaises(ValueError, Element, "{test}p'name")
+ self.assertRaises(ValueError, Element, '{test}p"name')
+
+ el = Element('name')
+ self.assertRaises(ValueError, setattr, el, 'tag', "p'name")
+ self.assertRaises(ValueError, setattr, el, 'tag', 'p"name')
+
+ def test_element_name_space(self):
+ Element = self.etree.Element
+ self.assertRaises(ValueError, Element, ' name ')
+ self.assertRaises(ValueError, Element, 'na me')
+ self.assertRaises(ValueError, Element, '{test} name')
+
+ el = Element('name')
+ self.assertRaises(ValueError, setattr, el, 'tag', ' name ')
+
+ def test_subelement_name_empty(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ el = Element('name')
+ self.assertRaises(ValueError, SubElement, el, '{}')
+ self.assertRaises(ValueError, SubElement, el, '{test}')
+
+ def test_subelement_name_colon(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ el = Element('name')
+ self.assertRaises(ValueError, SubElement, el, 'p:name')
+ self.assertRaises(ValueError, SubElement, el, '{test}p:name')
+
+ def test_subelement_name_quote(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ el = Element('name')
+ self.assertRaises(ValueError, SubElement, el, "p'name")
+ self.assertRaises(ValueError, SubElement, el, "{test}p'name")
+
+ self.assertRaises(ValueError, SubElement, el, 'p"name')
+ self.assertRaises(ValueError, SubElement, el, '{test}p"name')
+
+ def test_subelement_name_space(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ el = Element('name')
+ self.assertRaises(ValueError, SubElement, el, ' name ')
+ self.assertRaises(ValueError, SubElement, el, 'na me')
+ self.assertRaises(ValueError, SubElement, el, '{test} name')
+
+ def test_subelement_attribute_invalid(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ el = Element('name')
+ self.assertRaises(ValueError, SubElement, el, 'name', {'a b c' : 'abc'})
+ self.assertRaises(ValueError, SubElement, el, 'name', {'a' : 'a\0\n'})
+ self.assertEqual(0, len(el))
+
+ def test_qname_empty(self):
+ QName = self.etree.QName
+ self.assertRaises(ValueError, QName, '')
+ self.assertRaises(ValueError, QName, None)
+ self.assertRaises(ValueError, QName, None, None)
+ self.assertRaises(ValueError, QName, 'test', '')
+
+ def test_qname_none(self):
+ QName = self.etree.QName
+ q = QName(None, 'TAG')
+ self.assertEqual('TAG', q)
+ self.assertEqual('TAG', q.localname)
+ self.assertEqual(None, q.namespace)
+
+ def test_qname_colon(self):
+ QName = self.etree.QName
+ self.assertRaises(ValueError, QName, 'p:name')
+ self.assertRaises(ValueError, QName, 'test', 'p:name')
+
+ def test_qname_space(self):
+ QName = self.etree.QName
+ self.assertRaises(ValueError, QName, ' name ')
+ self.assertRaises(ValueError, QName, 'na me')
+ self.assertRaises(ValueError, QName, 'test', ' name')
+
+ def test_qname_namespace_localname(self):
+ # ET doesn't have namespace/localname properties on QNames
+ QName = self.etree.QName
+ namespace, localname = 'http://myns', 'a'
+ qname = QName(namespace, localname)
+ self.assertEqual(namespace, qname.namespace)
+ self.assertEqual(localname, qname.localname)
+
+ def test_qname_element(self):
+ # ET doesn't have namespace/localname properties on QNames
+ QName = self.etree.QName
+ qname1 = QName('http://myns', 'a')
+ a = self.etree.Element(qname1, nsmap={'p' : 'http://myns'})
+
+ qname2 = QName(a)
+ self.assertEqual(a.tag, qname1.text)
+ self.assertEqual(a.tag, qname1)
+ self.assertEqual(qname1.text, qname2.text)
+ self.assertEqual(qname1, qname2.text)
+ self.assertEqual(qname1.text, qname2)
+ self.assertEqual(qname1, qname2)
+
+ def test_qname_text_resolve(self):
+        # ET doesn't resolve QNames as text values
+ etree = self.etree
+ qname = etree.QName('http://myns', 'a')
+ a = etree.Element(qname, nsmap={'p' : 'http://myns'})
+ a.text = qname
+
+ self.assertEqual("p:a", a.text)
+
+ def test_nsmap_prefix_invalid(self):
+ etree = self.etree
+ self.assertRaises(ValueError,
+ etree.Element, "root", nsmap={'"' : 'testns'})
+ self.assertRaises(ValueError,
+ etree.Element, "root", nsmap={'&' : 'testns'})
+ self.assertRaises(ValueError,
+ etree.Element, "root", nsmap={'a:b' : 'testns'})
+
+ def test_clear_keep_tail(self):
+ XML = self.etree.XML
+ tostring = self.etree.tostring
+ a = XML('<a aa="A"><b ba="B">B1</b>B2<c ca="C">C1</c>C2</a>')
+ a[0].clear(keep_tail=True)
+ self.assertEqual(_bytes('<a aa="A"><b/>B2<c ca="C">C1</c>C2</a>'), tostring(a))
+
+ def test_attribute_has_key(self):
+ # ET in Py 3.x has no "attrib.has_key()" method
+ XML = self.etree.XML
+
+ root = XML(_bytes('<foo bar="Bar" xmlns:ns="http://ns.codespeak.net/test" ns:baz="Baz" />'))
+ self.assertEqual(
+ True, root.attrib.has_key('bar'))
+ self.assertEqual(
+ False, root.attrib.has_key('baz'))
+ self.assertEqual(
+ False, root.attrib.has_key('hah'))
+ self.assertEqual(
+ True,
+ root.attrib.has_key('{http://ns.codespeak.net/test}baz'))
+
+ def test_attribute_set(self):
+ Element = self.etree.Element
+ root = Element("root")
+ root.set("attr", "TEST")
+ self.assertEqual("TEST", root.get("attr"))
+
+ def test_attribute_set_nonstring(self):
+ # ElementTree accepts arbitrary attribute values
+ # lxml.etree allows only strings
+ Element = self.etree.Element
+
+ root = Element("root")
+ root.set("attr", "TEST")
+ self.assertEqual("TEST", root.get("attr"))
+ self.assertRaises(TypeError, root.set, "newattr", 5)
+
+ def test_attrib_and_keywords(self):
+ Element = self.etree.Element
+
+ root = Element("root")
+ root.set("attr", "TEST")
+ self.assertEqual("TEST", root.attrib["attr"])
+
+ root2 = Element("root2", root.attrib, attr2='TOAST')
+ self.assertEqual("TEST", root2.attrib["attr"])
+ self.assertEqual("TOAST", root2.attrib["attr2"])
+ self.assertEqual(None, root.attrib.get("attr2"))
+
+ def test_attrib_order(self):
+ Element = self.etree.Element
+
+ keys = ["attr%d" % i for i in range(12, 4, -1)]
+ values = ["TEST-%d" % i for i in range(12, 4, -1)]
+ items = list(zip(keys, values))
+
+ root = Element("root")
+ for key, value in items:
+ root.set(key, value)
+ self.assertEqual(keys, root.attrib.keys())
+ self.assertEqual(values, root.attrib.values())
+
+ attr_order = [
+ ('attr_99', 'TOAST-1'),
+ ('attr_98', 'TOAST-2'),
+ ]
+ ordered_dict_types = [OrderedDict, lambda x:x]
+ if sys.version_info >= (3, 6):
+ ordered_dict_types.append(dict)
+ else:
+ # Keyword arguments are not ordered in Py<3.6, and thus get sorted.
+ attr_order.sort()
+ attr_order += items
+ expected_keys = [attr[0] for attr in attr_order]
+ expected_values = [attr[1] for attr in attr_order]
+ expected_items = list(zip(expected_keys, expected_values))
+
+ for dict_type in ordered_dict_types:
+ root2 = Element("root2", dict_type(root.attrib),
+ attr_99='TOAST-1', attr_98='TOAST-2')
+
+ try:
+ self.assertSequenceEqual(expected_keys, root2.attrib.keys())
+ self.assertSequenceEqual(expected_values, root2.attrib.values())
+ self.assertSequenceEqual(expected_items, root2.attrib.items())
+ except AssertionError as exc:
+ exc.args = ("Order of '%s': %s" % (dict_type.__name__, exc.args[0]),) + exc.args[1:]
+ raise
+
+ self.assertEqual(keys, root.attrib.keys())
+ self.assertEqual(values, root.attrib.values())
+
+ def test_attribute_set_invalid(self):
+ # ElementTree accepts arbitrary attribute values
+ # lxml.etree allows only strings, or None for (html5) boolean attributes
+ Element = self.etree.Element
+ root = Element("root")
+ self.assertRaises(TypeError, root.set, "newattr", 5)
+ self.assertRaises(TypeError, root.set, "newattr", object)
+ self.assertRaises(TypeError, root.set, "newattr", None)
+ self.assertRaises(TypeError, root.set, "newattr")
+
+ def test_strip_attributes(self):
+ XML = self.etree.XML
+ xml = _bytes('<test a="5" b="10" c="20"><x a="4" b="2"/></test>')
+
+ root = XML(xml)
+ self.etree.strip_attributes(root, 'a')
+ self.assertEqual(_bytes('<test b="10" c="20"><x b="2"></x></test>'),
+ self._writeElement(root))
+
+ root = XML(xml)
+ self.etree.strip_attributes(root, 'b', 'c')
+ self.assertEqual(_bytes('<test a="5"><x a="4"></x></test>'),
+ self._writeElement(root))
+
+ def test_strip_attributes_ns(self):
+ XML = self.etree.XML
+ xml = _bytes('<test xmlns:n="http://test/ns" a="6" b="10" c="20" n:a="5"><x a="4" n:b="2"/></test>')
+
+ root = XML(xml)
+ self.etree.strip_attributes(root, 'a')
+ self.assertEqual(
+ _bytes('<test xmlns:n="http://test/ns" b="10" c="20" n:a="5"><x n:b="2"></x></test>'),
+ self._writeElement(root))
+
+ root = XML(xml)
+ self.etree.strip_attributes(root, '{http://test/ns}a', 'c')
+ self.assertEqual(
+ _bytes('<test xmlns:n="http://test/ns" a="6" b="10"><x a="4" n:b="2"></x></test>'),
+ self._writeElement(root))
+
+ root = XML(xml)
+ self.etree.strip_attributes(root, '{http://test/ns}*')
+ self.assertEqual(
+ _bytes('<test xmlns:n="http://test/ns" a="6" b="10" c="20"><x a="4"></x></test>'),
+ self._writeElement(root))
+
+ def test_strip_elements(self):
+ XML = self.etree.XML
+ xml = _bytes('<test><a><b><c/></b></a><x><a><b/><c/></a></x></test>')
+
+ root = XML(xml)
+ self.etree.strip_elements(root, 'a')
+ self.assertEqual(_bytes('<test><x></x></test>'),
+ self._writeElement(root))
+
+ root = XML(xml)
+ self.etree.strip_elements(root, 'b', 'c', 'X', 'Y', 'Z')
+ self.assertEqual(_bytes('<test><a></a><x><a></a></x></test>'),
+ self._writeElement(root))
+
+ root = XML(xml)
+ self.etree.strip_elements(root, 'c')
+ self.assertEqual(_bytes('<test><a><b></b></a><x><a><b></b></a></x></test>'),
+ self._writeElement(root))
+
+ def test_strip_elements_ns(self):
+ XML = self.etree.XML
+ xml = _bytes('<test>TEST<n:a xmlns:n="urn:a">A<b>B<c xmlns="urn:c"/>C</b>BT</n:a>AT<x>X<a>A<b xmlns="urn:a"/>BT<c xmlns="urn:x"/>CT</a>AT</x>XT</test>')
+
+ root = XML(xml)
+ self.etree.strip_elements(root, 'a')
+ self.assertEqual(_bytes('<test>TEST<n:a xmlns:n="urn:a">A<b>B<c xmlns="urn:c"></c>C</b>BT</n:a>AT<x>X</x>XT</test>'),
+ self._writeElement(root))
+
+ root = XML(xml)
+ self.etree.strip_elements(root, '{urn:a}b', 'c')
+ self.assertEqual(_bytes('<test>TEST<n:a xmlns:n="urn:a">A<b>B<c xmlns="urn:c"></c>C</b>BT</n:a>AT<x>X<a>A<c xmlns="urn:x"></c>CT</a>AT</x>XT</test>'),
+ self._writeElement(root))
+
+ root = XML(xml)
+ self.etree.strip_elements(root, '{urn:a}*', 'c')
+ self.assertEqual(_bytes('<test>TEST<x>X<a>A<c xmlns="urn:x"></c>CT</a>AT</x>XT</test>'),
+ self._writeElement(root))
+
+ root = XML(xml)
+ self.etree.strip_elements(root, '{urn:a}*', 'c', with_tail=False)
+ self.assertEqual(_bytes('<test>TESTAT<x>X<a>ABT<c xmlns="urn:x"></c>CT</a>AT</x>XT</test>'),
+ self._writeElement(root))
+
+ def test_strip_tags(self):
+ XML = self.etree.XML
+ xml = _bytes('<test>TEST<a>A<b>B<c/>CT</b>BT</a>AT<x>X<a>A<b/>BT<c/>CT</a>AT</x>XT</test>')
+
+ root = XML(xml)
+ self.etree.strip_tags(root, 'a')
+ self.assertEqual(_bytes('<test>TESTA<b>B<c></c>CT</b>BTAT<x>XA<b></b>BT<c></c>CTAT</x>XT</test>'),
+ self._writeElement(root))
+
+ root = XML(xml)
+ self.etree.strip_tags(root, 'b', 'c', 'X', 'Y', 'Z')
+ self.assertEqual(_bytes('<test>TEST<a>ABCTBT</a>AT<x>X<a>ABTCT</a>AT</x>XT</test>'),
+ self._writeElement(root))
+
+ root = XML(xml)
+ self.etree.strip_tags(root, 'c')
+ self.assertEqual(_bytes('<test>TEST<a>A<b>BCT</b>BT</a>AT<x>X<a>A<b></b>BTCT</a>AT</x>XT</test>'),
+ self._writeElement(root))
+
+ def test_strip_tags_pi_comment(self):
+ XML = self.etree.XML
+ PI = self.etree.ProcessingInstruction
+ Comment = self.etree.Comment
+ xml = _bytes('<!--comment1-->\n<?PI1?>\n<test>TEST<!--comment2-->XT<?PI2?></test>\n<!--comment3-->\n<?PI1?>')
+
+ root = XML(xml)
+ self.etree.strip_tags(root, PI)
+ self.assertEqual(_bytes('<!--comment1-->\n<?PI1?>\n<test>TEST<!--comment2-->XT</test>\n<!--comment3-->\n<?PI1?>'),
+ self._writeElement(root))
+
+ root = XML(xml)
+ self.etree.strip_tags(root, Comment)
+ self.assertEqual(_bytes('<!--comment1-->\n<?PI1?>\n<test>TESTXT<?PI2?></test>\n<!--comment3-->\n<?PI1?>'),
+ self._writeElement(root))
+
+ root = XML(xml)
+ self.etree.strip_tags(root, PI, Comment)
+ self.assertEqual(_bytes('<!--comment1-->\n<?PI1?>\n<test>TESTXT</test>\n<!--comment3-->\n<?PI1?>'),
+ self._writeElement(root))
+
+ root = XML(xml)
+ self.etree.strip_tags(root, Comment, PI)
+ self.assertEqual(_bytes('<!--comment1-->\n<?PI1?>\n<test>TESTXT</test>\n<!--comment3-->\n<?PI1?>'),
+ self._writeElement(root))
+
+ def test_strip_tags_pi_comment_all(self):
+ XML = self.etree.XML
+ ElementTree = self.etree.ElementTree
+ PI = self.etree.ProcessingInstruction
+ Comment = self.etree.Comment
+ xml = _bytes('<!--comment1-->\n<?PI1?>\n<test>TEST<!--comment2-->XT<?PI2?></test>\n<!--comment3-->\n<?PI1?>')
+
+ root = XML(xml)
+ self.etree.strip_tags(ElementTree(root), PI)
+ self.assertEqual(_bytes('<!--comment1-->\n<test>TEST<!--comment2-->XT</test>\n<!--comment3-->'),
+ self._writeElement(root))
+
+ root = XML(xml)
+ self.etree.strip_tags(ElementTree(root), Comment)
+ self.assertEqual(_bytes('<?PI1?>\n<test>TESTXT<?PI2?></test>\n<?PI1?>'),
+ self._writeElement(root))
+
+ root = XML(xml)
+ self.etree.strip_tags(ElementTree(root), PI, Comment)
+ self.assertEqual(_bytes('<test>TESTXT</test>'),
+ self._writeElement(root))
+
+ root = XML(xml)
+ self.etree.strip_tags(ElementTree(root), Comment, PI)
+ self.assertEqual(_bytes('<test>TESTXT</test>'),
+ self._writeElement(root))
+
+ def test_strip_tags_doc_style(self):
+ XML = self.etree.XML
+ xml = _bytes('''
+ <div>
+ <div>
+ I like <strong>sheep</strong>.
+ <br/>
+ I like lots of <strong>sheep</strong>.
+ <br/>
+ Click <a href="http://www.sheep.com">here</a>
+ for <a href="http://www.sheep.com">those</a> sheep.
+ <br/>
+ </div>
+ </div>
+ '''.strip())
+
+ root = XML(xml)
+ self.etree.strip_tags(root, 'a')
+ self.assertEqual(re.sub(_bytes('</?a[^>]*>'), _bytes(''), xml).replace(_bytes('<br/>'), _bytes('<br></br>')),
+ self._writeElement(root))
+
+ root = XML(xml)
+ self.etree.strip_tags(root, 'a', 'br')
+ self.assertEqual(re.sub(_bytes('</?a[^>]*>'), _bytes(''),
+ re.sub(_bytes('<br[^>]*>'), _bytes(''), xml)),
+ self._writeElement(root))
+
+ def test_strip_tags_ns(self):
+ XML = self.etree.XML
+ xml = _bytes('<test>TEST<n:a xmlns:n="urn:a">A<b>B<c xmlns="urn:c"/>CT</b>BT</n:a>AT<x>X<a>A<b xmlns="urn:a"/>BT<c xmlns="urn:x"/>CT</a>AT</x>XT</test>')
+
+ root = XML(xml)
+ self.etree.strip_tags(root, 'a')
+ self.assertEqual(_bytes('<test>TEST<n:a xmlns:n="urn:a">A<b>B<c xmlns="urn:c"></c>CT</b>BT</n:a>AT<x>XA<b xmlns="urn:a"></b>BT<c xmlns="urn:x"></c>CTAT</x>XT</test>'),
+ self._writeElement(root))
+
+ root = XML(xml)
+ self.etree.strip_tags(root, '{urn:a}b', 'c')
+ self.assertEqual(_bytes('<test>TEST<n:a xmlns:n="urn:a">A<b>B<c xmlns="urn:c"></c>CT</b>BT</n:a>AT<x>X<a>ABT<c xmlns="urn:x"></c>CT</a>AT</x>XT</test>'),
+ self._writeElement(root))
+
+ root = XML(xml)
+ self.etree.strip_tags(root, '{urn:a}*', 'c')
+ self.assertEqual(_bytes('<test>TESTA<b>B<c xmlns="urn:c"></c>CT</b>BTAT<x>X<a>ABT<c xmlns="urn:x"></c>CT</a>AT</x>XT</test>'),
+ self._writeElement(root))
+
+ def test_strip_tags_and_remove(self):
+ # previously crashed
+ HTML = self.etree.HTML
+ root = HTML(_bytes('<div><h1>title</h1> <b>foo</b> <p>boo</p></div>'))[0][0]
+ self.assertEqual(_bytes('<div><h1>title</h1> <b>foo</b> <p>boo</p></div>'),
+ self.etree.tostring(root))
+ self.etree.strip_tags(root, 'b')
+ self.assertEqual(_bytes('<div><h1>title</h1> foo <p>boo</p></div>'),
+ self.etree.tostring(root))
+ root.remove(root[0])
+ self.assertEqual(_bytes('<div><p>boo</p></div>'),
+ self.etree.tostring(root))
+
+ def test_pi(self):
+ # lxml.etree separates target and text
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ ProcessingInstruction = self.etree.ProcessingInstruction
+
+ a = Element('a')
+ a.append(ProcessingInstruction('foo', 'some more text'))
+ self.assertEqual(a[0].target, 'foo')
+ self.assertEqual(a[0].text, 'some more text')
+
+ def test_pi_parse(self):
+ XML = self.etree.XML
+ root = XML(_bytes("<test><?mypi my test ?></test>"))
+ self.assertEqual(root[0].target, "mypi")
+ self.assertEqual(root[0].text, "my test ")
+
+ def test_pi_pseudo_attributes_get(self):
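+ # PI pseudo-attributes are parsed from the PI text; entries without a value return None from get()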
+ XML = self.etree.XML
+ root = XML(_bytes("<test><?mypi my='1' test=\" abc \" quotes=\"' '\" only names ?></test>"))
+ self.assertEqual(root[0].target, "mypi")
+ self.assertEqual(root[0].get('my'), "1")
+ self.assertEqual(root[0].get('test'), " abc ")
+ self.assertEqual(root[0].get('quotes'), "' '")
+ self.assertEqual(root[0].get('only'), None)
+ self.assertEqual(root[0].get('names'), None)
+ self.assertEqual(root[0].get('nope'), None)
+
+ def test_pi_pseudo_attributes_attrib(self):
+ XML = self.etree.XML
+ root = XML(_bytes("<test><?mypi my='1' test=\" abc \" quotes=\"' '\" only names ?></test>"))
+ self.assertEqual(root[0].target, "mypi")
+ self.assertEqual(root[0].attrib['my'], "1")
+ self.assertEqual(root[0].attrib['test'], " abc ")
+ self.assertEqual(root[0].attrib['quotes'], "' '")
+ self.assertRaises(KeyError, root[0].attrib.__getitem__, 'only')
+ self.assertRaises(KeyError, root[0].attrib.__getitem__, 'names')
+ self.assertRaises(KeyError, root[0].attrib.__getitem__, 'nope')
+
+ def test_deepcopy_pi(self):
+ # previously caused a crash
+ ProcessingInstruction = self.etree.ProcessingInstruction
+
+ a = ProcessingInstruction("PI", "ONE")
+ b = copy.deepcopy(a)
+ b.text = "ANOTHER"
+
+ self.assertEqual('ONE', a.text)
+ self.assertEqual('ANOTHER', b.text)
+
+ def test_deepcopy_elementtree_pi(self):
+ XML = self.etree.XML
+ tostring = self.etree.tostring
+ root = XML(_bytes("<?mypi my test ?><test/><!--comment -->"))
+ tree1 = self.etree.ElementTree(root)
+ self.assertEqual(_bytes("<?mypi my test ?><test/><!--comment -->"),
+ tostring(tree1))
+
+ tree2 = copy.deepcopy(tree1)
+ self.assertEqual(_bytes("<?mypi my test ?><test/><!--comment -->"),
+ tostring(tree2))
+
+ root2 = copy.deepcopy(tree1.getroot())
+ self.assertEqual(_bytes("<test/>"),
+ tostring(root2))
+
+ def test_deepcopy_elementtree_dtd(self):
+ XML = self.etree.XML
+ tostring = self.etree.tostring
+ xml = _bytes('<!DOCTYPE test [\n<!ENTITY entity "tasty">\n]>\n<test/>')
+ root = XML(xml)
+ tree1 = self.etree.ElementTree(root)
+ self.assertEqual(xml, tostring(tree1))
+
+ tree2 = copy.deepcopy(tree1)
+ self.assertEqual(xml, tostring(tree2))
+
+ root2 = copy.deepcopy(tree1.getroot())
+ self.assertEqual(_bytes("<test/>"),
+ tostring(root2))
+
+ def test_deepcopy_pi_dtd(self):
+ XML = self.etree.XML
+ tostring = self.etree.tostring
+ xml = _bytes('<!-- comment --><!DOCTYPE test [\n<!ENTITY entity "tasty">\n]>\n<test/>')
+ root = XML(xml)
+ tree1 = self.etree.ElementTree(root)
+ self.assertEqual(xml, tostring(tree1))
+
+ tree2 = copy.deepcopy(tree1)
+ self.assertEqual(xml, tostring(tree2))
+
+ def test_parse_remove_comments(self):
+ fromstring = self.etree.fromstring
+ tostring = self.etree.tostring
+ XMLParser = self.etree.XMLParser
+
+ xml = _bytes('<a><!--A--><b><!-- B --><c/></b><!--C--></a>')
+ parser = XMLParser(remove_comments=True)
+ root = fromstring(xml, parser)
+ self.assertEqual(
+ _bytes('<a><b><c/></b></a>'),
+ tostring(root))
+
+ def test_parse_remove_pis(self):
+ parse = self.etree.parse
+ tostring = self.etree.tostring
+ XMLParser = self.etree.XMLParser
+
+ xml = _bytes('<?test?><a><?A?><b><?B?><c/></b><?C?></a><?tail?>')
+
+ f = BytesIO(xml)
+ tree = parse(f)
+ self.assertEqual(
+ xml,
+ tostring(tree))
+
+ parser = XMLParser(remove_pis=True)
+ tree = parse(f, parser)
+ self.assertEqual(
+ _bytes('<a><b><c/></b></a>'),
+ tostring(tree))
+
+ def test_parse_parser_type_error(self):
+ # ET raises IOError only
+ parse = self.etree.parse
+ self.assertRaises(TypeError, parse, 'notthere.xml', object())
+
+ def test_iterparse_getiterator(self):
+ iterparse = self.etree.iterparse
+ f = BytesIO('<a><b><d/></b><c/></a>')
+
+ counts = []
+ for event, elem in iterparse(f):
+ counts.append(len(list(elem.getiterator())))
+ self.assertEqual(
+ [1,2,1,4],
+ counts)
+
+ def test_iterparse_tree_comments(self):
+ # ET removes comments
+ iterparse = self.etree.iterparse
+ tostring = self.etree.tostring
+
+ f = BytesIO('<a><!--A--><b><!-- B --><c/></b><!--C--></a>')
+ events = list(iterparse(f))
+ root = events[-1][1]
+ self.assertEqual(3, len(events))
+ self.assertEqual(
+ _bytes('<a><!--A--><b><!-- B --><c/></b><!--C--></a>'),
+ tostring(root))
+
+ def test_iterparse_comments(self):
+ # ET removes comments
+ iterparse = self.etree.iterparse
+ tostring = self.etree.tostring
+
+ def name(event, el):
+ if event == 'comment':
+ return el.text
+ else:
+ return el.tag
+
+ f = BytesIO('<a><!--A--><b><!-- B --><c/></b><!--C--></a>')
+ events = list(iterparse(f, events=('end', 'comment')))
+ root = events[-1][1]
+ self.assertEqual(6, len(events))
+ self.assertEqual(['A', ' B ', 'c', 'b', 'C', 'a'],
+ [ name(*item) for item in events ])
+ self.assertEqual(
+ _bytes('<a><!--A--><b><!-- B --><c/></b><!--C--></a>'),
+ tostring(root))
+
+ def test_iterparse_pis(self):
+ # ET removes PIs
+ iterparse = self.etree.iterparse
+ tostring = self.etree.tostring
+ ElementTree = self.etree.ElementTree
+
+ def name(event, el):
+ if event == 'pi':
+ return el.target, el.text
+ else:
+ return el.tag
+
+ f = BytesIO('<?pia a?><a><?pib b?><b><?pic c?><c/></b><?pid d?></a><?pie e?>')
+ events = list(iterparse(f, events=('end', 'pi')))
+ root = events[-2][1]
+ self.assertEqual(8, len(events))
+ self.assertEqual([('pia','a'), ('pib','b'), ('pic','c'), 'c', 'b',
+ ('pid','d'), 'a', ('pie','e')],
+ [ name(*item) for item in events ])
+ self.assertEqual(
+ _bytes('<?pia a?><a><?pib b?><b><?pic c?><c/></b><?pid d?></a><?pie e?>'),
+ tostring(ElementTree(root)))
+
+ def test_iterparse_remove_comments(self):
+ iterparse = self.etree.iterparse
+ tostring = self.etree.tostring
+
+ f = BytesIO('<a><!--A--><b><!-- B --><c/></b><!--C--></a>')
+ events = list(iterparse(f, remove_comments=True,
+ events=('end', 'comment')))
+ root = events[-1][1]
+ self.assertEqual(3, len(events))
+ self.assertEqual(['c', 'b', 'a'],
+ [ el.tag for (event, el) in events ])
+ self.assertEqual(
+ _bytes('<a><b><c/></b></a>'),
+ tostring(root))
+
+ def test_iterparse_broken(self):
+ iterparse = self.etree.iterparse
+ f = BytesIO('<a><b><c/></a>')
+ # ET raises ExpatError, lxml raises XMLSyntaxError
+ self.assertRaises(self.etree.XMLSyntaxError, list, iterparse(f))
+
+ def test_iterparse_broken_recover(self):
+ iterparse = self.etree.iterparse
+ f = BytesIO('<a><b><c/></a>')
+ it = iterparse(f, events=('start', 'end'), recover=True)
+ events = [(ev, el.tag) for ev, el in it]
+ root = it.root
+ self.assertTrue(root is not None)
+
+ self.assertEqual(1, events.count(('start', 'a')))
+ self.assertEqual(1, events.count(('end', 'a')))
+
+ self.assertEqual(1, events.count(('start', 'b')))
+ self.assertEqual(1, events.count(('end', 'b')))
+
+ self.assertEqual(1, events.count(('start', 'c')))
+ self.assertEqual(1, events.count(('end', 'c')))
+
+ def test_iterparse_broken_multi_recover(self):
+ iterparse = self.etree.iterparse
+ f = BytesIO('<a><b><c/></d><b><c/></a></b>')
+ it = iterparse(f, events=('start', 'end'), recover=True)
+ events = [(ev, el.tag) for ev, el in it]
+ root = it.root
+ self.assertTrue(root is not None)
+
+ self.assertEqual(1, events.count(('start', 'a')))
+ self.assertEqual(1, events.count(('end', 'a')))
+
+ self.assertEqual(2, events.count(('start', 'b')))
+ self.assertEqual(2, events.count(('end', 'b')))
+
+ self.assertEqual(2, events.count(('start', 'c')))
+ self.assertEqual(2, events.count(('end', 'c')))
+
+ def test_iterparse_strip(self):
+ iterparse = self.etree.iterparse
+ f = BytesIO("""
+ <a> \n \n <b> b test </b> \n
+
+ \n\t <c> \n </c> </a> \n """)
+ iterator = iterparse(f, remove_blank_text=True)
+ text = [ (element.text, element.tail)
+ for event, element in iterator ]
+ self.assertEqual(
+ [(" b test ", None), (" \n ", None), (None, None)],
+ text)
+
+ def test_iterparse_tag(self):
+ iterparse = self.etree.iterparse
+ f = BytesIO('<a><b><d/></b><c/></a>')
+
+ iterator = iterparse(f, tag="b", events=('start', 'end'))
+ events = list(iterator)
+ root = iterator.root
+ self.assertEqual(
+ [('start', root[0]), ('end', root[0])],
+ events)
+
+ def test_iterparse_tag_all(self):
+ iterparse = self.etree.iterparse
+ f = BytesIO('<a><b><d/></b><c/></a>')
+
+ iterator = iterparse(f, tag="*", events=('start', 'end'))
+ events = list(iterator)
+ self.assertEqual(
+ 8,
+ len(events))
+
+ def test_iterparse_tag_ns(self):
+ iterparse = self.etree.iterparse
+ f = BytesIO('<a xmlns="urn:test:1"><b><d/></b><c/></a>')
+
+ iterator = iterparse(f, tag="{urn:test:1}b", events=('start', 'end'))
+ events = list(iterator)
+ root = iterator.root
+ self.assertEqual(
+ [('start', root[0]), ('end', root[0])],
+ events)
+
+ def test_iterparse_tag_ns_empty(self):
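+ # '{}tag' explicitly selects elements that have no namespace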
+ iterparse = self.etree.iterparse
+ f = BytesIO('<a><b><d/></b><c/></a>')
+ iterator = iterparse(f, tag="{}b", events=('start', 'end'))
+ events = list(iterator)
+ root = iterator.root
+ self.assertEqual(
+ [('start', root[0]), ('end', root[0])],
+ events)
+
+ f = BytesIO('<a xmlns="urn:test:1"><b><d/></b><c/></a>')
+ iterator = iterparse(f, tag="{}b", events=('start', 'end'))
+ events = list(iterator)
+ root = iterator.root
+ self.assertEqual([], events)
+
+ def test_iterparse_tag_ns_all(self):
+ iterparse = self.etree.iterparse
+ f = BytesIO('<a xmlns="urn:test:1"><b><d/></b><c/></a>')
+ iterator = iterparse(f, tag="{urn:test:1}*", events=('start', 'end'))
+ events = list(iterator)
+ self.assertEqual(8, len(events))
+
+ def test_iterparse_tag_ns_empty_all(self):
+ iterparse = self.etree.iterparse
+ f = BytesIO('<a xmlns="urn:test:1"><b><d/></b><c/></a>')
+ iterator = iterparse(f, tag="{}*", events=('start', 'end'))
+ events = list(iterator)
+ self.assertEqual([], events)
+
+ f = BytesIO('<a><b><d/></b><c/></a>')
+ iterator = iterparse(f, tag="{}*", events=('start', 'end'))
+ events = list(iterator)
+ self.assertEqual(8, len(events))
+
+ def test_iterparse_encoding_error(self):
+ text = _str('Søk på nettet')
+ wrong_declaration = "<?xml version='1.0' encoding='UTF-8'?>"
+ xml_latin1 = (_str('%s<a>%s</a>') % (wrong_declaration, text)
+ ).encode('iso-8859-1')
+
+ self.assertRaises(self.etree.ParseError,
+ list, self.etree.iterparse(BytesIO(xml_latin1)))
+
+ def test_iterparse_encoding_8bit_override(self):
+ text = _str('Søk på nettet', encoding="UTF-8")
+ wrong_declaration = "<?xml version='1.0' encoding='UTF-8'?>"
+ xml_latin1 = (_str('%s<a>%s</a>') % (wrong_declaration, text)
+ ).encode('iso-8859-1')
+
+ iterator = self.etree.iterparse(BytesIO(xml_latin1),
+ encoding="iso-8859-1")
+ self.assertEqual(1, len(list(iterator)))
+
+ a = iterator.root
+ self.assertEqual(a.text, text)
+
+ def test_iterparse_keep_cdata(self):
+ tostring = self.etree.tostring
+ f = BytesIO('<root><![CDATA[test]]></root>')
+ context = self.etree.iterparse(f, strip_cdata=False)
+ content = [ el.text for event,el in context ]
+
+ self.assertEqual(['test'], content)
+ self.assertEqual(_bytes('<root><![CDATA[test]]></root>'),
+ tostring(context.root))
+
+ def test_parser_encoding_unknown(self):
+ self.assertRaises(
+ LookupError, self.etree.XMLParser, encoding="hopefully unknown")
+
+ def test_parser_encoding(self):
+ self.etree.XMLParser(encoding="ascii")
+ self.etree.XMLParser(encoding="utf-8")
+ self.etree.XMLParser(encoding="iso-8859-1")
+
+ def test_feed_parser_recover(self):
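+ # feed() accepts arbitrary chunk boundaries; recover=True silently closes the unclosed <a> tag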
+ parser = self.etree.XMLParser(recover=True)
+
+ parser.feed('<?xml version=')
+ parser.feed('"1.0"?><ro')
+ parser.feed('ot><')
+ parser.feed('a test="works"')
+ parser.feed('><othertag/></root') # <a> not closed!
+ parser.feed('>')
+
+ root = parser.close()
+
+ self.assertEqual(root.tag, "root")
+ self.assertEqual(len(root), 1)
+ self.assertEqual(root[0].tag, "a")
+ self.assertEqual(root[0].get("test"), "works")
+ self.assertEqual(len(root[0]), 1)
+ self.assertEqual(root[0][0].tag, "othertag")
+ # FIXME: would be nice to get some errors logged ...
+ #self.assertTrue(len(parser.error_log) > 0, "error log is empty")
+
+ def test_feed_parser_recover_no_id_dict(self):
+ # test that recover mode plays nicely with the no-id-dict setup
+ parser = self.etree.XMLParser(recover=True, collect_ids=False)
+
+ parser.feed('<?xml version=')
+ parser.feed('"1.0"?><ro')
+ parser.feed('ot xml:id="123"><')
+ parser.feed('a test="works" xml:id=')
+ parser.feed('"321"><othertag/></root') # <a> not closed!
+ parser.feed('>')
+
+ root = parser.close()
+
+ self.assertEqual(root.tag, "root")
+ self.assertEqual(len(root), 1)
+ self.assertEqual(root[0].tag, "a")
+ self.assertEqual(root[0].get("test"), "works")
+ self.assertEqual(root[0].attrib, {
+ 'test': 'works',
+ '{http://www.w3.org/XML/1998/namespace}id': '321'})
+ self.assertEqual(len(root[0]), 1)
+ self.assertEqual(root[0][0].tag, "othertag")
+ # FIXME: would be nice to get some errors logged ...
+ #self.assertTrue(len(parser.error_log) > 0, "error log is empty")
+
+ def test_elementtree_parser_target_type_error(self):
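+ # a parser target whose close() returns something other than an Element makes tree.parse() raise TypeError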
+ assertEqual = self.assertEqual
+ assertFalse = self.assertFalse
+
+ events = []
+ class Target(object):
+ def start(self, tag, attrib):
+ events.append("start")
+ assertFalse(attrib)
+ assertEqual("TAG", tag)
+ def end(self, tag):
+ events.append("end")
+ assertEqual("TAG", tag)
+ def close(self):
+ return "DONE" # no Element!
+
+ parser = self.etree.XMLParser(target=Target())
+ tree = self.etree.ElementTree()
+
+ self.assertRaises(TypeError,
+ tree.parse, BytesIO("<TAG/>"), parser=parser)
+ self.assertEqual(["start", "end"], events)
+
+ def test_parser_target_feed_exception(self):
+ # ET doesn't call .close() on errors
+ events = []
+ class Target(object):
+ def start(self, tag, attrib):
+ events.append("start-" + tag)
+ def end(self, tag):
+ events.append("end-" + tag)
+ if tag == 'a':
+ raise ValueError("dead and gone")
+ def data(self, data):
+ events.append("data-" + data)
+ def close(self):
+ events.append("close")
+ return "DONE"
+
+ parser = self.etree.XMLParser(target=Target())
+
+ try:
+ parser.feed(_bytes('<root>A<a>ca</a>B</root>'))
+ done = parser.close()
+ self.fail("error expected, but parsing succeeded")
+ except ValueError:
+ done = 'value error received as expected'
+
+ self.assertEqual(["start-root", "data-A", "start-a",
+ "data-ca", "end-a", "close"],
+ events)
+
+ def test_parser_target_fromstring_exception(self):
+ # ET doesn't call .close() on errors
+ events = []
+ class Target(object):
+ def start(self, tag, attrib):
+ events.append("start-" + tag)
+ def end(self, tag):
+ events.append("end-" + tag)
+ if tag == 'a':
+ raise ValueError("dead and gone")
+ def data(self, data):
+ events.append("data-" + data)
+ def close(self):
+ events.append("close")
+ return "DONE"
+
+ parser = self.etree.XMLParser(target=Target())
+
+ try:
+ done = self.etree.fromstring(_bytes('<root>A<a>ca</a>B</root>'),
+ parser=parser)
+ self.fail("error expected, but parsing succeeded")
+ except ValueError:
+ done = 'value error received as expected'
+
+ self.assertEqual(["start-root", "data-A", "start-a",
+ "data-ca", "end-a", "close"],
+ events)
+
+ def test_parser_target_feed_no_id_dict(self):
+ # test that target parsing works nicely with the no-id-dict setup
+ events = []
+ class Target(object):
+ def start(self, tag, attrib):
+ events.append("start-" + tag)
+ def end(self, tag):
+ events.append("end-" + tag)
+ def data(self, data):
+ events.append("data-" + data)
+ def comment(self, text):
+ events.append("comment-" + text)
+ def close(self):
+ return "DONE"
+
+ parser = self.etree.XMLParser(target=Target(), collect_ids=False)
+
+ parser.feed(_bytes('<!--a--><root xml:id="123">A<!--b-->'))
+ parser.feed(_bytes('<sub xml:id="321"/>B</root>'))
+ done = parser.close()
+
+ self.assertEqual("DONE", done)
+ self.assertEqual(["comment-a", "start-root", "data-A", "comment-b",
+ "start-sub", "end-sub", "data-B", "end-root"],
+ events)
+
+ def test_parser_target_comment(self):
+ events = []
+ class Target(object):
+ def start(self, tag, attrib):
+ events.append("start-" + tag)
+ def end(self, tag):
+ events.append("end-" + tag)
+ def data(self, data):
+ events.append("data-" + data)
+ def comment(self, text):
+ events.append("comment-" + text)
+ def close(self):
+ return "DONE"
+
+ parser = self.etree.XMLParser(target=Target())
+
+ parser.feed(_bytes('<!--a--><root>A<!--b--><sub/><!--c-->B</root><!--d-->'))
+ done = parser.close()
+
+ self.assertEqual("DONE", done)
+ self.assertEqual(["comment-a", "start-root", "data-A", "comment-b",
+ "start-sub", "end-sub", "comment-c", "data-B",
+ "end-root", "comment-d"],
+ events)
+
+ def test_parser_target_pi(self):
+ events = []
+ class Target(object):
+ def start(self, tag, attrib):
+ events.append("start-" + tag)
+ def end(self, tag):
+ events.append("end-" + tag)
+ def data(self, data):
+ events.append("data-" + data)
+ def pi(self, target, data):
+ events.append("pi-" + target + "-" + data)
+ def close(self):
+ return "DONE"
+
+ parser = self.etree.XMLParser(target=Target())
+
+ parser.feed(_bytes('<?test a?><root>A<?test b?>B</root><?test c?>'))
+ done = parser.close()
+
+ self.assertEqual("DONE", done)
+ self.assertEqual(["pi-test-a", "start-root", "data-A", "pi-test-b",
+ "data-B", "end-root", "pi-test-c"],
+ events)
+
+ def test_parser_target_cdata(self):
+ events = []
+ class Target(object):
+ def start(self, tag, attrib):
+ events.append("start-" + tag)
+ def end(self, tag):
+ events.append("end-" + tag)
+ def data(self, data):
+ events.append("data-" + data)
+ def close(self):
+ return "DONE"
+
+ parser = self.etree.XMLParser(target=Target(),
+ strip_cdata=False)
+
+ parser.feed(_bytes('<root>A<a><![CDATA[ca]]></a>B</root>'))
+ done = parser.close()
+
+ self.assertEqual("DONE", done)
+ self.assertEqual(["start-root", "data-A", "start-a",
+ "data-ca", "end-a", "data-B", "end-root"],
+ events)
+
+ def test_parser_target_recover(self):
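+ # in recover mode the target still receives a matching end event and close() despite the bogus closing tag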
+ events = []
+ class Target(object):
+ def start(self, tag, attrib):
+ events.append("start-" + tag)
+ def end(self, tag):
+ events.append("end-" + tag)
+ def data(self, data):
+ events.append("data-" + data)
+ def close(self):
+ events.append("close")
+ return "DONE"
+
+ parser = self.etree.XMLParser(target=Target(),
+ recover=True)
+
+ parser.feed(_bytes('<root>A<a>ca</a>B</not-root>'))
+ done = parser.close()
+
+ self.assertEqual("DONE", done)
+ self.assertEqual(["start-root", "data-A", "start-a",
+ "data-ca", "end-a", "data-B",
+ "end-root", "close"],
+ events)
+
+ def test_iterwalk_tag(self):
+ iterwalk = self.etree.iterwalk
+ root = self.etree.XML(_bytes('<a><b><d/></b><c/></a>'))
+
+ iterator = iterwalk(root, tag="b", events=('start', 'end'))
+ events = list(iterator)
+ self.assertEqual(
+ [('start', root[0]), ('end', root[0])],
+ events)
+
+ def test_iterwalk_tag_all(self):
+ iterwalk = self.etree.iterwalk
+ root = self.etree.XML(_bytes('<a><b><d/></b><c/></a>'))
+
+ iterator = iterwalk(root, tag="*", events=('start', 'end'))
+ events = list(iterator)
+ self.assertEqual(
+ 8,
+ len(events))
+
+ def test_iterwalk(self):
+ iterwalk = self.etree.iterwalk
+ root = self.etree.XML(_bytes('<a><b></b><c/></a>'))
+
+ events = list(iterwalk(root))
+ self.assertEqual(
+ [('end', root[0]), ('end', root[1]), ('end', root)],
+ events)
+
+ def test_iterwalk_comments_root_element(self):
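+ # iterating a bare root element skips comments that are siblings of the root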
+ iterwalk = self.etree.iterwalk
+ root = self.etree.XML(
+ b'<!--C0--><a><!--Ca--><b><!--Cb--></b><!--Cc--><c/></a><!--C99-->')
+
+ iterator = iterwalk(root, events=('start', 'end', 'comment'))
+ events = list(iterator)
+ self.assertEqual(
+ [('start', root), ('comment', root[0]),
+ ('start', root[1]), ('comment', root[1][0]), ('end', root[1]),
+ ('comment', root[2]), ('start', root[3]), ('end', root[3]),
+ ('end', root),
+ ],
+ events)
+
+ def test_iterwalk_comments_tree(self):
+ iterwalk = self.etree.iterwalk
+ root = self.etree.XML(
+ b'<!--C0--><a><!--Ca--><b><!--Cb--></b><!--Cc--><c/></a><!--C99-->')
+
+ iterator = iterwalk(self.etree.ElementTree(root), events=('start', 'end', 'comment'))
+ events = list(iterator)
+ self.assertEqual(
+ [('comment', root.getprevious()),
+ ('start', root), ('comment', root[0]), # <a>
+ ('start', root[1]), ('comment', root[1][0]), ('end', root[1]), # <b>
+ ('comment', root[2]), ('start', root[3]), ('end', root[3]), # <c>
+ ('end', root), ('comment', root.getnext()),
+ ],
+ events)
+
+ def test_iterwalk_pis_root_element(self):
+ iterwalk = self.etree.iterwalk
+ root = self.etree.XML(
+ b'<?C0?><a><?Ca?><b><?Cb?></b><?Cc?><c/></a><?C99?>')
+
+ iterator = iterwalk(root, events=('start', 'end', 'pi'))
+ events = list(iterator)
+ self.assertEqual(
+ [('start', root), ('pi', root[0]),
+ ('start', root[1]), ('pi', root[1][0]), ('end', root[1]),
+ ('pi', root[2]), ('start', root[3]), ('end', root[3]),
+ ('end', root),
+ ],
+ events)
+
+ def test_iterwalk_pis_tree(self):
+ iterwalk = self.etree.iterwalk
+ root = self.etree.XML(
+ b'<?C0?><a><?Ca?><b><?Cb?></b><?Cc?><c/></a><?C99?>')
+
+ iterator = iterwalk(self.etree.ElementTree(root), events=('start', 'end', 'pi'))
+ events = list(iterator)
+ self.assertEqual(
+ [('pi', root.getprevious()),
+ ('start', root), ('pi', root[0]), # <a>
+ ('start', root[1]), ('pi', root[1][0]), ('end', root[1]), # <b>
+ ('pi', root[2]), ('start', root[3]), ('end', root[3]), # <c>
+ ('end', root), ('pi', root.getnext()),
+ ],
+ events)
+
+ def test_iterwalk_pis_comments_tree(self):
+ iterwalk = self.etree.iterwalk
+ root = self.etree.XML(
+ b'<!--C0--><?C0?><!--C1--><a><?Ca?><b><!--Cb--></b><?Cc?><c/></a><!--C99--><?C99?>')
+
+ iterator = iterwalk(self.etree.ElementTree(root), events=('start', 'end', 'pi', 'comment'))
+ events = list(iterator)
+ self.assertEqual(
+ [('comment', root.getprevious().getprevious().getprevious()),
+ ('pi', root.getprevious().getprevious()),
+ ('comment', root.getprevious()),
+ ('start', root), ('pi', root[0]), # <a>
+ ('start', root[1]), ('comment', root[1][0]), ('end', root[1]), # <b>
+ ('pi', root[2]), ('start', root[3]), ('end', root[3]), # <c>
+ ('end', root), ('comment', root.getnext()), ('pi', root.getnext().getnext()),
+ ],
+ events)
+
+ def test_iterwalk_pis_comments_tree_no_events(self):
+ iterwalk = self.etree.iterwalk
+ root = self.etree.XML(
+ b'<!--C0--><?C0?><!--C1--><a><?Ca?><b><!--Cb--></b><?Cc?><c/></a><!--C99--><?C99?>')
+
+ iterator = iterwalk(self.etree.ElementTree(root), events=('start', 'end'))
+ events = list(iterator)
+ self.assertEqual(
+ [('start', root), # <a>
+ ('start', root[1]), ('end', root[1]), # <b>
+ ('start', root[3]), ('end', root[3]), # <c>
+ ('end', root),
+ ],
+ events)
+
+ def test_iterwalk_start(self):
+ iterwalk = self.etree.iterwalk
+ root = self.etree.XML(_bytes('<a><b></b><c/></a>'))
+
+ iterator = iterwalk(root, events=('start',))
+ events = list(iterator)
+ self.assertEqual(
+ [('start', root), ('start', root[0]), ('start', root[1])],
+ events)
+
+ def test_iterwalk_start_end(self):
+ iterwalk = self.etree.iterwalk
+ root = self.etree.XML(_bytes('<a><b></b><c/></a>'))
+
+ iterator = iterwalk(root, events=('start','end'))
+ events = list(iterator)
+ self.assertEqual(
+ [('start', root), ('start', root[0]), ('end', root[0]),
+ ('start', root[1]), ('end', root[1]), ('end', root)],
+ events)
+
+ def test_iterwalk_start_tags(self):
+ iterwalk = self.etree.iterwalk
+ root = self.etree.XML(_bytes('<a><b></b><c/><b><d/></b></a>'))
+
+ iterator = iterwalk(root, events=('start',), tag='b')
+ events = list(iterator)
+ self.assertEqual(
+ [('start', root[0]), ('start', root[2])],
+ events)
+
+ def test_iterwalk_start_end_tags(self):
+ iterwalk = self.etree.iterwalk
+ root = self.etree.XML(_bytes('<a><b></b><c/><b><d/></b></a>'))
+
+ iterator = iterwalk(root, events=('start', 'end'), tag='b')
+ events = list(iterator)
+ self.assertEqual(
+ [('start', root[0]), ('end', root[0]), ('start', root[2]), ('end', root[2])],
+ events)
+
+ def test_iterwalk_start_end_tags_with_root(self):
+ iterwalk = self.etree.iterwalk
+ root = self.etree.XML(_bytes('<a><b></b><c/><b><d/></b></a>'))
+
+ iterator = iterwalk(root, events=('start', 'end'), tag=('b', 'a'))
+ events = list(iterator)
+ self.assertEqual(
+ [('start', root),
+ ('start', root[0]), ('end', root[0]),
+ ('start', root[2]), ('end', root[2]),
+ ('end', root),
+ ],
+ events)
+
+ def test_iterwalk_clear(self):
+ iterwalk = self.etree.iterwalk
+ root = self.etree.XML(_bytes('<a><b></b><c/></a>'))
+
+ iterator = iterwalk(root)
+ for event, elem in iterator:
+ elem.clear()
+
+ self.assertEqual(0,
+ len(root))
+
+ def test_iterwalk_attrib_ns(self):
+ iterwalk = self.etree.iterwalk
+ root = self.etree.XML(_bytes('<a xmlns="ns1"><b><c xmlns="ns2"/></b></a>'))
+
+ attr_name = '{testns}bla'
+ events = []
+ iterator = iterwalk(root, events=('start','end','start-ns','end-ns'))
+ for event, elem in iterator:
+ events.append(event)
+ if event == 'start':
+ if elem.tag != '{ns1}a':
+ elem.set(attr_name, 'value')
+
+ self.assertEqual(
+ ['start-ns', 'start', 'start', 'start-ns', 'start',
+ 'end', 'end-ns', 'end', 'end', 'end-ns'],
+ events)
+
+ self.assertEqual(
+ None,
+ root.get(attr_name))
+ self.assertEqual(
+ 'value',
+ root[0].get(attr_name))
+
+ def test_iterwalk_end_skip(self):
+ iterwalk = self.etree.iterwalk
+ root = self.etree.XML(_bytes('<a><b><c/></b><d><e/></d></a>'))
+
+ iterator = iterwalk(root)
+ tags = []
+ for event, elem in iterator:
+ tags.append(elem.tag)
+ # requesting a skip after an 'end' event should never have an effect
+ iterator.skip_subtree()
+
+ self.assertEqual(['c', 'b', 'e', 'd', 'a'], tags)
+
+ def test_iterwalk_start_end_skip(self):
+ iterwalk = self.etree.iterwalk
+ root = self.etree.XML(_bytes('<a><b><c/></b><d><e/></d></a>'))
+
+ iterator = iterwalk(root, events=('start', 'end'))
+ tags = []
+ for event, elem in iterator:
+ tags.append((event, elem.tag))
+ if elem.tag in ('b', 'e'):
+ # skipping should only have an effect on 'start', not on 'end'
+ iterator.skip_subtree()
+
+ self.assertEqual(
+ [('start', 'a'),
+ ('start', 'b'), ('end', 'b'), # ignored child 'c'
+ ('start', 'd'),
+ ('start', 'e'), ('end', 'e'),
+ ('end', 'd'),
+ ('end', 'a')],
+ tags)
+
+ def test_iterwalk_ns_skip(self):
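+ # skip_subtree() after 'start-ns' still reports the element's own 'start' but skips its descendants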
+ iterwalk = self.etree.iterwalk
+ root = self.etree.XML(_bytes(
+ '<a xmlns="ns1"><b xmlns="nsb"><c xmlns="ns2"/></b><d xmlns="ns2"><e/></d></a>'))
+
+ events = []
+ iterator = iterwalk(root, events=('start','start-ns','end-ns'))
+ for event, elem in iterator:
+ if event in ('start-ns', 'end-ns'):
+ events.append((event, elem))
+ if event == 'start-ns' and elem == ('', 'nsb'):
+ events.append('skip')
+ iterator.skip_subtree()
+ else:
+ events.append((event, elem.tag))
+
+ self.assertEqual(
+ [('start-ns', ('', 'ns1')),
+ ('start', '{ns1}a'),
+ ('start-ns', ('', 'nsb')),
+ 'skip',
+ ('start', '{nsb}b'),
+ ('end-ns', None),
+ ('start-ns', ('', 'ns2')),
+ ('start', '{ns2}d'),
+ ('start', '{ns2}e'),
+ ('end-ns', None),
+ ('end-ns', None)
+ ],
+ events)
+
+ def test_iterwalk_getiterator(self):
+ iterwalk = self.etree.iterwalk
+ root = self.etree.XML(_bytes('<a><b><d/></b><c/></a>'))
+
+ counts = []
+ for event, elem in iterwalk(root):
+ counts.append(len(list(elem.getiterator())))
+ self.assertEqual(
+ [1,2,1,4],
+ counts)
+
+ def test_itertext_comment_pi(self):
+ # https://bugs.launchpad.net/lxml/+bug/1844674
+ XML = self.etree.XML
+ root = XML(_bytes(
+ "<root>RTEXT<a></a>ATAIL<b/><!-- COMMENT -->CTAIL<?PI PITEXT?> PITAIL </root>"
+ ))
+
+ text = list(root.itertext())
+ self.assertEqual(["RTEXT", "ATAIL", "CTAIL", " PITAIL "],
+ text)
+
+ def test_resolve_string_dtd(self):
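+ # a custom Resolver can satisfy a DTD request from an in-memory string via resolve_string()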
+ parse = self.etree.parse
+ parser = self.etree.XMLParser(dtd_validation=True)
+ assertEqual = self.assertEqual
+ test_url = _str("__nosuch.dtd")
+
+ class MyResolver(self.etree.Resolver):
+ def resolve(self, url, id, context):
+ assertEqual(url, test_url)
+ return self.resolve_string(
+ _str('''<!ENTITY myentity "%s">
+ <!ELEMENT doc ANY>''') % url, context)
+
+ parser.resolvers.add(MyResolver())
+
+ xml = _str('<!DOCTYPE doc SYSTEM "%s"><doc>&myentity;</doc>') % test_url
+ tree = parse(StringIO(xml), parser)
+ root = tree.getroot()
+ self.assertEqual(root.text, test_url)
+
+ def test_resolve_bytes_dtd(self):
+ parse = self.etree.parse
+ parser = self.etree.XMLParser(dtd_validation=True)
+ assertEqual = self.assertEqual
+ test_url = _str("__nosuch.dtd")
+
+ class MyResolver(self.etree.Resolver):
+ def resolve(self, url, id, context):
+ assertEqual(url, test_url)
+ return self.resolve_string(
+ (_str('''<!ENTITY myentity "%s">
+ <!ELEMENT doc ANY>''') % url).encode('utf-8'),
+ context)
+
+ parser.resolvers.add(MyResolver())
+
+ xml = _str('<!DOCTYPE doc SYSTEM "%s"><doc>&myentity;</doc>') % test_url
+ tree = parse(StringIO(xml), parser)
+ root = tree.getroot()
+ self.assertEqual(root.text, test_url)
+
+ def test_resolve_filelike_dtd(self):
+ parse = self.etree.parse
+ parser = self.etree.XMLParser(dtd_validation=True)
+ assertEqual = self.assertEqual
+ test_url = _str("__nosuch.dtd")
+
+ class MyResolver(self.etree.Resolver):
+ def resolve(self, url, id, context):
+ assertEqual(url, test_url)
+ return self.resolve_file(
+ SillyFileLike(
+ _str('''<!ENTITY myentity "%s">
+ <!ELEMENT doc ANY>''') % url), context)
+
+ parser.resolvers.add(MyResolver())
+
+ xml = _str('<!DOCTYPE doc SYSTEM "%s"><doc>&myentity;</doc>') % test_url
+ tree = parse(StringIO(xml), parser)
+ root = tree.getroot()
+ self.assertEqual(root.text, test_url)
+
+ def test_resolve_filename_dtd(self):
+ parse = self.etree.parse
+ parser = self.etree.XMLParser(attribute_defaults=True)
+ assertEqual = self.assertEqual
+ test_url = _str("__nosuch.dtd")
+
+ class MyResolver(self.etree.Resolver):
+ def resolve(self, url, id, context):
+ assertEqual(url, test_url)
+ return self.resolve_filename(
+ fileInTestDir('test.dtd'), context)
+
+ parser.resolvers.add(MyResolver())
+
+ xml = _str('<!DOCTYPE a SYSTEM "%s"><a><b/></a>') % test_url
+ tree = parse(StringIO(xml), parser)
+ root = tree.getroot()
+ self.assertEqual(
+ root.attrib, {'default': 'valueA'})
+ self.assertEqual(
+ root[0].attrib, {'default': 'valueB'})
+
+ def test_resolve_filename_dtd_relative(self):
+ parse = self.etree.parse
+ parser = self.etree.XMLParser(attribute_defaults=True)
+ assertEqual = self.assertEqual
+ test_url = _str("__nosuch.dtd")
+
+ class MyResolver(self.etree.Resolver):
+ def resolve(self, url, id, context):
+ expected = fileUrlInTestDir(test_url)
+ url = url.replace('file://', 'file:') # depends on libxml2 version
+ expected = expected.replace('file://', 'file:')
+ assertEqual(url, expected)
+ return self.resolve_filename(
+ fileUrlInTestDir('test.dtd'), context)
+
+ parser.resolvers.add(MyResolver())
+
+ xml = _str('<!DOCTYPE a SYSTEM "%s"><a><b/></a>') % test_url
+ tree = parse(StringIO(xml), parser,
+ base_url=fileUrlInTestDir('__test.xml'))
+ root = tree.getroot()
+ self.assertEqual(
+ root.attrib, {'default': 'valueA'})
+ self.assertEqual(
+ root[0].attrib, {'default': 'valueB'})
+
+ def test_resolve_file_dtd(self):
+ parse = self.etree.parse
+ parser = self.etree.XMLParser(attribute_defaults=True)
+ assertEqual = self.assertEqual
+ test_url = _str("__nosuch.dtd")
+
+ class MyResolver(self.etree.Resolver):
+ def resolve(self, url, id, context):
+ assertEqual(url, test_url)
+ return self.resolve_file(
+ open(fileInTestDir('test.dtd'), 'rb'), context)
+
+ parser.resolvers.add(MyResolver())
+
+ xml = _str('<!DOCTYPE a SYSTEM "%s"><a><b/></a>') % test_url
+ tree = parse(StringIO(xml), parser)
+ root = tree.getroot()
+ self.assertEqual(
+ root.attrib, {'default': 'valueA'})
+ self.assertEqual(
+ root[0].attrib, {'default': 'valueB'})
+
+ def test_resolve_empty(self):
+ parse = self.etree.parse
+ parser = self.etree.XMLParser(load_dtd=True)
+ assertEqual = self.assertEqual
+ test_url = _str("__nosuch.dtd")
+
+ class check(object):
+ resolved = False
+
+ class MyResolver(self.etree.Resolver):
+ def resolve(self, url, id, context):
+ assertEqual(url, test_url)
+ check.resolved = True
+ return self.resolve_empty(context)
+
+ parser.resolvers.add(MyResolver())
+
+ xml = _str('<!DOCTYPE doc SYSTEM "%s"><doc>&myentity;</doc>') % test_url
+ self.assertRaises(etree.XMLSyntaxError, parse, StringIO(xml), parser)
+ self.assertTrue(check.resolved)
+
+ def test_resolve_error(self):
+ parse = self.etree.parse
+ parser = self.etree.XMLParser(dtd_validation=True)
+
+ class _LocalException(Exception):
+ pass
+
+ class MyResolver(self.etree.Resolver):
+ def resolve(self, url, id, context):
+ raise _LocalException
+
+ parser.resolvers.add(MyResolver())
+
+ xml = '<!DOCTYPE doc SYSTEM "test"><doc>&myentity;</doc>'
+ self.assertRaises(_LocalException, parse, BytesIO(xml), parser)
+
+ def test_entity_parse(self):
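+ # with resolve_entities=False, unresolved entity references stay in the tree as Entity nodes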
+ parse = self.etree.parse
+ tostring = self.etree.tostring
+ parser = self.etree.XMLParser(resolve_entities=False)
+ Entity = self.etree.Entity
+
+ xml = _bytes('<!DOCTYPE doc SYSTEM "test"><doc>&myentity;</doc>')
+ tree = parse(BytesIO(xml), parser)
+ root = tree.getroot()
+ self.assertEqual(root[0].tag, Entity)
+ self.assertEqual(root[0].text, "&myentity;")
+ self.assertEqual(root[0].tail, None)
+ self.assertEqual(root[0].name, "myentity")
+
+ self.assertEqual(_bytes('<doc>&myentity;</doc>'),
+ tostring(root))
+
+ def test_entity_restructure(self):
+ xml = _bytes('''<!DOCTYPE root [ <!ENTITY nbsp "&#160;"> ]>
+ <root>
+ <child1/>
+ <child2/>
+ <child3>&nbsp;</child3>
+ </root>''')
+
+ parser = self.etree.XMLParser(resolve_entities=False)
+ root = etree.fromstring(xml, parser)
+ self.assertEqual([ el.tag for el in root ],
+ ['child1', 'child2', 'child3'])
+
+ root[0] = root[-1]
+ self.assertEqual([ el.tag for el in root ],
+ ['child3', 'child2'])
+ self.assertEqual(root[0][0].text, '&nbsp;')
+ self.assertEqual(root[0][0].name, 'nbsp')
+
+ def test_entity_append(self):
+ Entity = self.etree.Entity
+ Element = self.etree.Element
+ tostring = self.etree.tostring
+
+ root = Element("root")
+ root.append( Entity("test") )
+
+ self.assertEqual(root[0].tag, Entity)
+ self.assertEqual(root[0].text, "&test;")
+ self.assertEqual(root[0].tail, None)
+ self.assertEqual(root[0].name, "test")
+
+ self.assertEqual(_bytes('<root>&test;</root>'),
+ tostring(root))
+
+ def test_entity_append_parsed(self):
+ Entity = self.etree.Entity
+ Element = self.etree.Element
+ parser = self.etree.XMLParser(resolve_entities=False)
+ entity = self.etree.XML('''<!DOCTYPE data [
+ <!ENTITY a "a">
+ <!ENTITY b "&a;">
+ ]>
+ <data>&b;</data>
+ ''', parser)
+
+ el = Element('test')
+ el.append(entity)
+ self.assertEqual(el.tag, 'test')
+ self.assertEqual(el[0].tag, 'data')
+ self.assertEqual(el[0][0].tag, Entity)
+ self.assertEqual(el[0][0].name, 'b')
+
+ def test_entity_values(self):
+ Entity = self.etree.Entity
+ self.assertEqual(Entity("test").text, '&test;')
+ self.assertEqual(Entity("#17683").text, '&#17683;')
+ self.assertEqual(Entity("#x1768").text, '&#x1768;')
+ self.assertEqual(Entity("#x98AF").text, '&#x98AF;')
+
+ def test_entity_error(self):
+ Entity = self.etree.Entity
+ self.assertRaises(ValueError, Entity, 'a b c')
+ self.assertRaises(ValueError, Entity, 'a,b')
+ self.assertRaises(ValueError, Entity, 'a\0b')
+ self.assertRaises(ValueError, Entity, '#abc')
+ self.assertRaises(ValueError, Entity, '#xxyz')
+
+ def test_cdata(self):
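+ # CDATA() values read back as plain text but serialise as a CDATA section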
+ CDATA = self.etree.CDATA
+ Element = self.etree.Element
+ tostring = self.etree.tostring
+
+ root = Element("root")
+ root.text = CDATA('test')
+
+ self.assertEqual('test',
+ root.text)
+ self.assertEqual(_bytes('<root><![CDATA[test]]></root>'),
+ tostring(root))
+
+ def test_cdata_tail(self):
+ CDATA = self.etree.CDATA
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ tostring = self.etree.tostring
+
+ root = Element("root")
+ child = SubElement(root, 'child')
+ child.tail = CDATA('test')
+
+ self.assertEqual('test', child.tail)
+ self.assertEqual(_bytes('<root><child/><![CDATA[test]]></root>'),
+ tostring(root))
+
+ root = Element("root")
+ root.tail = CDATA('test')
+
+ self.assertEqual('test', root.tail)
+ self.assertEqual(_bytes('<root/><![CDATA[test]]>'),
+ tostring(root))
+
+ def test_cdata_type(self):
+ CDATA = self.etree.CDATA
+ Element = self.etree.Element
+ root = Element("root")
+
+ root.text = CDATA("test")
+ self.assertEqual('test', root.text)
+
+ root.text = CDATA(_str("test"))
+ self.assertEqual('test', root.text)
+
+ self.assertRaises(TypeError, CDATA, 1)
+
+ def test_cdata_errors(self):
+ CDATA = self.etree.CDATA
+ Element = self.etree.Element
+
+ root = Element("root")
+ cdata = CDATA('test')
+
+ self.assertRaises(TypeError,
+ root.set, 'attr', cdata)
+ self.assertRaises(TypeError,
+ operator.setitem, root.attrib, 'attr', cdata)
+
+ def test_cdata_parser(self):
+ tostring = self.etree.tostring
+ parser = self.etree.XMLParser(strip_cdata=False)
+ root = self.etree.XML(_bytes('<root><![CDATA[test]]></root>'), parser)
+
+ self.assertEqual('test', root.text)
+ self.assertEqual(_bytes('<root><![CDATA[test]]></root>'),
+ tostring(root))
+
+ def test_cdata_xpath(self):
+ tostring = self.etree.tostring
+ parser = self.etree.XMLParser(strip_cdata=False)
+ root = self.etree.XML(_bytes('<root><![CDATA[test]]></root>'), parser)
+ self.assertEqual(_bytes('<root><![CDATA[test]]></root>'),
+ tostring(root))
+
+ self.assertEqual(['test'], root.xpath('//text()'))
+
+ # TypeError in etree, AssertionError in ElementTree
+ def test_setitem_assert(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+
+ self.assertRaises(TypeError,
+ a.__setitem__, 0, 'foo')
+
+ def test_append_error(self):
+ Element = self.etree.Element
+ root = Element('root')
+ # raises AssertionError in ElementTree
+ self.assertRaises(TypeError, root.append, None)
+ self.assertRaises(TypeError, root.extend, [None])
+ self.assertRaises(TypeError, root.extend, [Element('one'), None])
+ self.assertEqual('one', root[0].tag)
+
+ def test_append_recursive_error(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ root = Element('root')
+ self.assertRaises(ValueError, root.append, root)
+ child = SubElement(root, 'child')
+ self.assertRaises(ValueError, child.append, root)
+ child2 = SubElement(child, 'child2')
+ self.assertRaises(ValueError, child2.append, root)
+ self.assertRaises(ValueError, child2.append, child)
+ self.assertEqual('child2', root[0][0].tag)
+
+ def test_addnext(self):
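+ # addnext() moves an existing element rather than copying it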
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ root = Element('root')
+ SubElement(root, 'a')
+ SubElement(root, 'b')
+
+ self.assertEqual(['a', 'b'],
+ [c.tag for c in root])
+ root[1].addnext(root[0])
+ self.assertEqual(['b', 'a'],
+ [c.tag for c in root])
+
+ def test_addprevious(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ root = Element('root')
+ SubElement(root, 'a')
+ SubElement(root, 'b')
+
+ self.assertEqual(['a', 'b'],
+ [c.tag for c in root])
+ root[0].addprevious(root[1])
+ self.assertEqual(['b', 'a'],
+ [c.tag for c in root])
+
+ def test_addnext_cycle(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ root = Element('root')
+ a = SubElement(root, 'a')
+ b = SubElement(a, 'b')
+ # appending parent as sibling is forbidden
+ self.assertRaises(ValueError, b.addnext, a)
+ self.assertEqual(['a'], [c.tag for c in root])
+ self.assertEqual(['b'], [c.tag for c in a])
+
+ def test_addprevious_cycle(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ root = Element('root')
+ a = SubElement(root, 'a')
+ b = SubElement(a, 'b')
+ # appending parent as sibling is forbidden
+ self.assertRaises(ValueError, b.addprevious, a)
+ self.assertEqual(['a'], [c.tag for c in root])
+ self.assertEqual(['b'], [c.tag for c in a])
+
+ def test_addnext_cycle_long(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ root = Element('root')
+ a = SubElement(root, 'a')
+ b = SubElement(a, 'b')
+ c = SubElement(b, 'c')
+ # adding an ancestor as a sibling is forbidden
+ self.assertRaises(ValueError, c.addnext, a)
+
+ def test_addprevious_cycle_long(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ root = Element('root')
+ a = SubElement(root, 'a')
+ b = SubElement(a, 'b')
+ c = SubElement(b, 'c')
+ # adding an ancestor as a sibling is forbidden
+ self.assertRaises(ValueError, c.addprevious, a)
+
+ def test_addprevious_noops(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ root = Element('root')
+ a = SubElement(root, 'a')
+ b = SubElement(root, 'b')
+ a.addprevious(a)
+ self.assertEqual('a', root[0].tag)
+ self.assertEqual('b', root[1].tag)
+ b.addprevious(b)
+ self.assertEqual('a', root[0].tag)
+ self.assertEqual('b', root[1].tag)
+ b.addprevious(a)
+ self.assertEqual('a', root[0].tag)
+ self.assertEqual('b', root[1].tag)
+
+ def test_addnext_noops(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ root = Element('root')
+ a = SubElement(root, 'a')
+ b = SubElement(root, 'b')
+ a.addnext(a)
+ self.assertEqual('a', root[0].tag)
+ self.assertEqual('b', root[1].tag)
+ b.addnext(b)
+ self.assertEqual('a', root[0].tag)
+ self.assertEqual('b', root[1].tag)
+ a.addnext(b)
+ self.assertEqual('a', root[0].tag)
+ self.assertEqual('b', root[1].tag)
+
+ def test_addnext_root(self):
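+ # adding a plain Element as a sibling of the root is not allowed (only PIs and comments are)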
+ Element = self.etree.Element
+ a = Element('a')
+ b = Element('b')
+ self.assertRaises(TypeError, a.addnext, b)
+
+ def test_addprevious_pi(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ PI = self.etree.PI
+ root = Element('root')
+ SubElement(root, 'a')
+ pi = PI('TARGET', 'TEXT')
+ pi.tail = "TAIL"
+
+ self.assertEqual(_bytes('<root><a></a></root>'),
+ self._writeElement(root))
+ root[0].addprevious(pi)
+ self.assertEqual(_bytes('<root><?TARGET TEXT?>TAIL<a></a></root>'),
+ self._writeElement(root))
+
+ def test_addprevious_root_pi(self):
+ Element = self.etree.Element
+ PI = self.etree.PI
+ root = Element('root')
+ pi = PI('TARGET', 'TEXT')
+ pi.tail = "TAIL"
+
+ self.assertEqual(_bytes('<root></root>'),
+ self._writeElement(root))
+ root.addprevious(pi)
+ self.assertEqual(_bytes('<?TARGET TEXT?>\n<root></root>'),
+ self._writeElement(root))
+
+ def test_addnext_pi(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ PI = self.etree.PI
+ root = Element('root')
+ SubElement(root, 'a')
+ pi = PI('TARGET', 'TEXT')
+ pi.tail = "TAIL"
+
+ self.assertEqual(_bytes('<root><a></a></root>'),
+ self._writeElement(root))
+ root[0].addnext(pi)
+ self.assertEqual(_bytes('<root><a></a><?TARGET TEXT?>TAIL</root>'),
+ self._writeElement(root))
+
+ def test_addnext_root_pi(self):
+ Element = self.etree.Element
+ PI = self.etree.PI
+ root = Element('root')
+ pi = PI('TARGET', 'TEXT')
+ pi.tail = "TAIL"
+
+ self.assertEqual(_bytes('<root></root>'),
+ self._writeElement(root))
+ root.addnext(pi)
+ self.assertEqual(_bytes('<root></root>\n<?TARGET TEXT?>'),
+ self._writeElement(root))
+
+ def test_addnext_comment(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ Comment = self.etree.Comment
+ root = Element('root')
+ SubElement(root, 'a')
+ comment = Comment('TEXT ')
+ comment.tail = "TAIL"
+
+ self.assertEqual(_bytes('<root><a></a></root>'),
+ self._writeElement(root))
+ root[0].addnext(comment)
+ self.assertEqual(_bytes('<root><a></a><!--TEXT -->TAIL</root>'),
+ self._writeElement(root))
+
+ def test_addnext_root_comment(self):
+ Element = self.etree.Element
+ Comment = self.etree.Comment
+ root = Element('root')
+ comment = Comment('TEXT ')
+ comment.tail = "TAIL"
+
+ self.assertEqual(_bytes('<root></root>'),
+ self._writeElement(root))
+ root.addnext(comment)
+ self.assertEqual(_bytes('<root></root>\n<!--TEXT -->'),
+ self._writeElement(root))
+
+ def test_addprevious_comment(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ Comment = self.etree.Comment
+ root = Element('root')
+ SubElement(root, 'a')
+ comment = Comment('TEXT ')
+ comment.tail = "TAIL"
+
+ self.assertEqual(_bytes('<root><a></a></root>'),
+ self._writeElement(root))
+ root[0].addprevious(comment)
+ self.assertEqual(_bytes('<root><!--TEXT -->TAIL<a></a></root>'),
+ self._writeElement(root))
+
+ def test_addprevious_root_comment(self):
+ Element = self.etree.Element
+ Comment = self.etree.Comment
+ root = Element('root')
+ comment = Comment('TEXT ')
+ comment.tail = "TAIL"
+
+ self.assertEqual(_bytes('<root></root>'),
+ self._writeElement(root))
+ root.addprevious(comment)
+ self.assertEqual(_bytes('<!--TEXT -->\n<root></root>'),
+ self._writeElement(root))
+
+ # ET's Elements have items() and keys(), but not values()
+ def test_attribute_values(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<doc alpha="Alpha" beta="Beta" gamma="Gamma"/>'))
+ values = root.values()
+ values.sort()
+ self.assertEqual(['Alpha', 'Beta', 'Gamma'], values)
+
+ # gives error in ElementTree
+ def test_comment_empty(self):
+ Element = self.etree.Element
+ Comment = self.etree.Comment
+
+ a = Element('a')
+ a.append(Comment())
+ self.assertEqual(
+ _bytes('<a><!----></a>'),
+ self._writeElement(a))
+
+ # ElementTree ignores comments
+ def test_comment_parse_empty(self):
+ ElementTree = self.etree.ElementTree
+ tostring = self.etree.tostring
+
+ xml = _bytes('<a><b/><!----><c/></a>')
+ f = BytesIO(xml)
+ doc = ElementTree(file=f)
+ a = doc.getroot()
+ self.assertEqual(
+ '',
+ a[1].text)
+ self.assertEqual(
+ xml,
+ tostring(a))
+
+ # ElementTree ignores comments
+ def test_comment_no_proxy_yet(self):
+ ElementTree = self.etree.ElementTree
+
+ f = BytesIO('<a><b></b><!-- hoi --><c></c></a>')
+ doc = ElementTree(file=f)
+ a = doc.getroot()
+ self.assertEqual(
+ ' hoi ',
+ a[1].text)
+
+ # does not raise an exception in ElementTree
+ def test_comment_immutable(self):
+ Element = self.etree.Element
+ Comment = self.etree.Comment
+
+ c = Comment()
+ el = Element('myel')
+
+ self.assertRaises(TypeError, c.append, el)
+ self.assertRaises(TypeError, c.insert, 0, el)
+ self.assertRaises(TypeError, c.set, "myattr", "test")
+
+ def test_comment_immutable_attrib(self):
+ c = self.etree.Comment()
+ self.assertEqual(0, len(c.attrib))
+
+ self.assertFalse(c.attrib.__contains__('nope'))
+ self.assertFalse('nope' in c.attrib)
+ self.assertFalse('nope' in c.attrib.keys())
+ self.assertFalse('nope' in c.attrib.values())
+ self.assertFalse(('nope', 'huhu') in c.attrib.items())
+
+ self.assertEqual([], list(c.attrib))
+ self.assertEqual([], list(c.attrib.keys()))
+ self.assertEqual([], list(c.attrib.items()))
+ self.assertEqual([], list(c.attrib.values()))
+ self.assertEqual([], list(c.attrib.iterkeys()))
+ self.assertEqual([], list(c.attrib.iteritems()))
+ self.assertEqual([], list(c.attrib.itervalues()))
+
+ self.assertEqual('HUHU', c.attrib.pop('nope', 'HUHU'))
+ self.assertRaises(KeyError, c.attrib.pop, 'nope')
+
+ self.assertRaises(KeyError, c.attrib.__getitem__, 'only')
+ self.assertRaises(KeyError, c.attrib.__getitem__, 'names')
+ self.assertRaises(KeyError, c.attrib.__getitem__, 'nope')
+ self.assertRaises(KeyError, c.attrib.__setitem__, 'nope', 'yep')
+ self.assertRaises(KeyError, c.attrib.__delitem__, 'nope')
+
+ # test passing 'None' to dump()
+ def test_dump_none(self):
+ self.assertRaises(TypeError, self.etree.dump, None)
+
+ def test_prefix(self):
+ ElementTree = self.etree.ElementTree
+
+ f = BytesIO('<a xmlns:foo="http://www.infrae.com/ns/1"><foo:b/></a>')
+ doc = ElementTree(file=f)
+ a = doc.getroot()
+ self.assertEqual(
+ None,
+ a.prefix)
+ self.assertEqual(
+ 'foo',
+ a[0].prefix)
+
+ def test_prefix_default_ns(self):
+ ElementTree = self.etree.ElementTree
+
+ f = BytesIO('<a xmlns="http://www.infrae.com/ns/1"><b/></a>')
+ doc = ElementTree(file=f)
+ a = doc.getroot()
+ self.assertEqual(
+ None,
+ a.prefix)
+ self.assertEqual(
+ None,
+ a[0].prefix)
+
+ def test_getparent(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(b, 'd')
+ self.assertEqual(
+ None,
+ a.getparent())
+ self.assertEqual(
+ a,
+ b.getparent())
+ self.assertEqual(
+ b.getparent(),
+ c.getparent())
+ self.assertEqual(
+ b,
+ d.getparent())
+
+ def test_iterchildren(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<doc><one/><two>Two</two>Hm<three/></doc>'))
+ result = []
+ for el in root.iterchildren():
+ result.append(el.tag)
+ self.assertEqual(['one', 'two', 'three'], result)
+
+ def test_iterchildren_reversed(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<doc><one/><two>Two</two>Hm<three/></doc>'))
+ result = []
+ for el in root.iterchildren(reversed=True):
+ result.append(el.tag)
+ self.assertEqual(['three', 'two', 'one'], result)
+
+ def test_iterchildren_tag(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<doc><one/><two>Two</two>Hm<two>Bla</two></doc>'))
+ result = []
+ for el in root.iterchildren(tag='two'):
+ result.append(el.text)
+ self.assertEqual(['Two', 'Bla'], result)
+
+ def test_iterchildren_tag_posarg(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<doc><one/><two>Two</two>Hm<two>Bla</two></doc>'))
+ result = []
+ for el in root.iterchildren('two'):
+ result.append(el.text)
+ self.assertEqual(['Two', 'Bla'], result)
+
+ def test_iterchildren_tag_reversed(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<doc><one/><two>Two</two>Hm<two>Bla</two></doc>'))
+ result = []
+ for el in root.iterchildren(reversed=True, tag='two'):
+ result.append(el.text)
+ self.assertEqual(['Bla', 'Two'], result)
+
+ def test_iterchildren_tag_multiple(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<doc><one/><two>Two</two>Hm<two>Bla</two><three/></doc>'))
+ result = []
+ for el in root.iterchildren(tag=['two', 'three']):
+ result.append(el.text)
+ self.assertEqual(['Two', 'Bla', None], result)
+
+ def test_iterchildren_tag_multiple_posarg(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<doc><one/><two>Two</two>Hm<two>Bla</two><three/></doc>'))
+ result = []
+ for el in root.iterchildren('two', 'three'):
+ result.append(el.text)
+ self.assertEqual(['Two', 'Bla', None], result)
+
+ def test_iterchildren_tag_multiple_reversed(self):
+ XML = self.etree.XML
+
+ root = XML(_bytes('<doc><one/><two>Two</two>Hm<two>Bla</two><three/></doc>'))
+ result = []
+ for el in root.iterchildren(reversed=True, tag=['two', 'three']):
+ result.append(el.text)
+ self.assertEqual([None, 'Bla', 'Two'], result)
+
+ def test_iterancestors(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(b, 'd')
+ self.assertEqual(
+ [],
+ list(a.iterancestors()))
+ self.assertEqual(
+ [a],
+ list(b.iterancestors()))
+ self.assertEqual(
+ [a],
+ list(c.iterancestors()))
+ self.assertEqual(
+ [b, a],
+ list(d.iterancestors()))
+
+ def test_iterancestors_tag(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(b, 'd')
+ self.assertEqual(
+ [a],
+ list(d.iterancestors('a')))
+ self.assertEqual(
+ [a],
+ list(d.iterancestors(tag='a')))
+
+ self.assertEqual(
+ [b, a],
+ list(d.iterancestors('*')))
+ self.assertEqual(
+ [b, a],
+ list(d.iterancestors(tag='*')))
+
+ def test_iterancestors_tag_multiple(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(b, 'd')
+ self.assertEqual(
+ [b, a],
+ list(d.iterancestors(tag=('a', 'b'))))
+ self.assertEqual(
+ [b, a],
+ list(d.iterancestors('a', 'b')))
+
+ self.assertEqual(
+ [],
+ list(d.iterancestors(tag=('w', 'x', 'y', 'z'))))
+ self.assertEqual(
+ [],
+ list(d.iterancestors('w', 'x', 'y', 'z')))
+
+ self.assertEqual(
+ [],
+ list(d.iterancestors(tag=('d', 'x'))))
+ self.assertEqual(
+ [],
+ list(d.iterancestors('d', 'x')))
+
+ self.assertEqual(
+ [b, a],
+ list(d.iterancestors(tag=('b', '*'))))
+ self.assertEqual(
+ [b, a],
+ list(d.iterancestors('b', '*')))
+
+ self.assertEqual(
+ [b],
+ list(d.iterancestors(tag=('b', 'c'))))
+ self.assertEqual(
+ [b],
+ list(d.iterancestors('b', 'c')))
+
+ def test_iterdescendants(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(b, 'd')
+ e = SubElement(c, 'e')
+
+ self.assertEqual(
+ [b, d, c, e],
+ list(a.iterdescendants()))
+ self.assertEqual(
+ [],
+ list(d.iterdescendants()))
+
+ def test_iterdescendants_tag(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(b, 'd')
+ e = SubElement(c, 'e')
+
+ self.assertEqual(
+ [],
+ list(a.iterdescendants('a')))
+ self.assertEqual(
+ [],
+ list(a.iterdescendants(tag='a')))
+
+ a2 = SubElement(e, 'a')
+ self.assertEqual(
+ [a2],
+ list(a.iterdescendants('a')))
+
+ self.assertEqual(
+ [a2],
+ list(c.iterdescendants('a')))
+ self.assertEqual(
+ [a2],
+ list(c.iterdescendants(tag='a')))
+
+ def test_iterdescendants_tag_multiple(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(b, 'd')
+ e = SubElement(c, 'e')
+
+ self.assertEqual(
+ [b, e],
+ list(a.iterdescendants(tag=('a', 'b', 'e'))))
+ self.assertEqual(
+ [b, e],
+ list(a.iterdescendants('a', 'b', 'e')))
+
+ a2 = SubElement(e, 'a')
+ self.assertEqual(
+ [b, a2],
+ list(a.iterdescendants(tag=('a', 'b'))))
+ self.assertEqual(
+ [b, a2],
+ list(a.iterdescendants('a', 'b')))
+
+ self.assertEqual(
+ [],
+ list(c.iterdescendants(tag=('x', 'y', 'z'))))
+ self.assertEqual(
+ [],
+ list(c.iterdescendants('x', 'y', 'z')))
+
+ self.assertEqual(
+ [b, d, c, e, a2],
+ list(a.iterdescendants(tag=('x', 'y', 'z', '*'))))
+ self.assertEqual(
+ [b, d, c, e, a2],
+ list(a.iterdescendants('x', 'y', 'z', '*')))
+
+ def test_getroottree(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(b, 'd')
+ self.assertEqual(
+ a,
+ a.getroottree().getroot())
+ self.assertEqual(
+ a,
+ b.getroottree().getroot())
+ self.assertEqual(
+ a,
+ d.getroottree().getroot())
+
+ def test_getnext(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ self.assertEqual(
+ None,
+ a.getnext())
+ self.assertEqual(
+ c,
+ b.getnext())
+ self.assertEqual(
+ None,
+ c.getnext())
+
+ def test_getprevious(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(b, 'd')
+ self.assertEqual(
+ None,
+ a.getprevious())
+ self.assertEqual(
+ b,
+ c.getprevious())
+ self.assertEqual(
+ None,
+ b.getprevious())
+
+ def test_itersiblings(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(b, 'd')
+ self.assertEqual(
+ [],
+ list(a.itersiblings()))
+ self.assertEqual(
+ [c],
+ list(b.itersiblings()))
+ self.assertEqual(
+ [],
+ list(c.itersiblings()))
+ self.assertEqual(
+ [b],
+ list(c.itersiblings(preceding=True)))
+ self.assertEqual(
+ [],
+ list(b.itersiblings(preceding=True)))
+
+ def test_itersiblings_tag(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(b, 'd')
+ self.assertEqual(
+ [],
+ list(a.itersiblings(tag='XXX')))
+ self.assertEqual(
+ [c],
+ list(b.itersiblings(tag='c')))
+ self.assertEqual(
+ [c],
+ list(b.itersiblings(tag='*')))
+ self.assertEqual(
+ [b],
+ list(c.itersiblings(preceding=True, tag='b')))
+ self.assertEqual(
+ [],
+ list(c.itersiblings(preceding=True, tag='c')))
+
+ def test_itersiblings_tag_multiple(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(b, 'd')
+ e = SubElement(a, 'e')
+ self.assertEqual(
+ [],
+ list(a.itersiblings(tag=('XXX', 'YYY'))))
+ self.assertEqual(
+ [c, e],
+ list(b.itersiblings(tag=('c', 'd', 'e'))))
+ self.assertEqual(
+ [b],
+ list(c.itersiblings(preceding=True, tag=('b', 'b', 'c', 'd'))))
+ self.assertEqual(
+ [c, b],
+ list(e.itersiblings(preceding=True, tag=('c', '*'))))
+
+ def test_parseid(self):
+ parseid = self.etree.parseid
+ XML = self.etree.XML
+ xml_text = _bytes('''
+ <!DOCTYPE document [
+ <!ELEMENT document (h1,p)*>
+ <!ELEMENT h1 (#PCDATA)>
+ <!ATTLIST h1 myid ID #REQUIRED>
+ <!ELEMENT p (#PCDATA)>
+ <!ATTLIST p someid ID #REQUIRED>
+ ]>
+ <document>
+ <h1 myid="chapter1">...</h1>
+ <p id="note1" class="note">...</p>
+ <p>Regular paragraph.</p>
+ <p xml:id="xmlid">XML:ID paragraph.</p>
+ <p someid="warn1" class="warning">...</p>
+ </document>
+ ''')
+
+ tree, dic = parseid(BytesIO(xml_text))
+ root = tree.getroot()
+ root2 = XML(xml_text)
+ self.assertEqual(self._writeElement(root),
+ self._writeElement(root2))
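+ # Only DTD-declared ID attributes and xml:id entries end up in the ID dict;
+ # the plain 'id' attribute on the first paragraph is not collected.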
+ expected = {
+ "chapter1" : root[0],
+ "xmlid" : root[3],
+ "warn1" : root[4]
+ }
+ self.assertTrue("chapter1" in dic)
+ self.assertTrue("warn1" in dic)
+ self.assertTrue("xmlid" in dic)
+ self._checkIDDict(dic, expected)
+
+ def test_XMLDTDID(self):
+ XMLDTDID = self.etree.XMLDTDID
+ XML = self.etree.XML
+ xml_text = _bytes('''
+ <!DOCTYPE document [
+ <!ELEMENT document (h1,p)*>
+ <!ELEMENT h1 (#PCDATA)>
+ <!ATTLIST h1 myid ID #REQUIRED>
+ <!ELEMENT p (#PCDATA)>
+ <!ATTLIST p someid ID #REQUIRED>
+ ]>
+ <document>
+ <h1 myid="chapter1">...</h1>
+ <p id="note1" class="note">...</p>
+ <p>Regular paragraph.</p>
+ <p xml:id="xmlid">XML:ID paragraph.</p>
+ <p someid="warn1" class="warning">...</p>
+ </document>
+ ''')
+
+ root, dic = XMLDTDID(xml_text)
+ root2 = XML(xml_text)
+ self.assertEqual(self._writeElement(root),
+ self._writeElement(root2))
+ expected = {
+ "chapter1" : root[0],
+ "xmlid" : root[3],
+ "warn1" : root[4]
+ }
+ self.assertTrue("chapter1" in dic)
+ self.assertTrue("warn1" in dic)
+ self.assertTrue("xmlid" in dic)
+ self._checkIDDict(dic, expected)
+
+ def test_XMLDTDID_empty(self):
+ XMLDTDID = self.etree.XMLDTDID
+ XML = self.etree.XML
+ xml_text = _bytes('''
+ <document>
+ <h1 myid="chapter1">...</h1>
+ <p id="note1" class="note">...</p>
+ <p>Regular paragraph.</p>
+ <p someid="warn1" class="warning">...</p>
+ </document>
+ ''')
+
+ root, dic = XMLDTDID(xml_text)
+ root2 = XML(xml_text)
+ self.assertEqual(self._writeElement(root),
+ self._writeElement(root2))
+ expected = {}
+ self._checkIDDict(dic, expected)
+
+ def test_XMLDTDID_no_id_dict(self):
+ XMLDTDID = self.etree.XMLDTDID
+ XML = self.etree.XML
+ xml_text = _bytes('''
+ <!DOCTYPE document [
+ <!ELEMENT document (h1,p)*>
+ <!ELEMENT h1 (#PCDATA)>
+ <!ATTLIST h1 myid ID #REQUIRED>
+ <!ELEMENT p (#PCDATA)>
+ <!ATTLIST p someid ID #REQUIRED>
+ ]>
+ <document>
+ <h1 myid="chapter1">...</h1>
+ <p id="note1" class="note">...</p>
+ <p>Regular paragraph.</p>
+ <p xml:id="xmlid">XML:ID paragraph.</p>
+ <p someid="warn1" class="warning">...</p>
+ </document>
+ ''')
+
+ parser = etree.XMLParser(collect_ids=False)
+ root, dic = XMLDTDID(xml_text, parser=parser)
+ root2 = XML(xml_text)
+ self.assertEqual(self._writeElement(root),
+ self._writeElement(root2))
+ self.assertFalse(dic)
+ self._checkIDDict(dic, {})
+
+ def _checkIDDict(self, dic, expected):
+ self.assertEqual(len(dic),
+ len(expected))
+ self.assertEqual(sorted(dic.items()),
+ sorted(expected.items()))
+ if sys.version_info < (3,):
+ self.assertEqual(sorted(dic.iteritems()),
+ sorted(expected.iteritems()))
+ self.assertEqual(sorted(dic.keys()),
+ sorted(expected.keys()))
+ if sys.version_info < (3,):
+ self.assertEqual(sorted(dic.iterkeys()),
+ sorted(expected.iterkeys()))
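+ # Sorting the Element values only works on Python 2, where arbitrary objects are orderable.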
+ if sys.version_info < (3,):
+ self.assertEqual(sorted(dic.values()),
+ sorted(expected.values()))
+ self.assertEqual(sorted(dic.itervalues()),
+ sorted(expected.itervalues()))
+
+ def test_register_namespace_xml(self):
+ self.assertRaises(ValueError, self.etree.register_namespace,
+ "XML", "http://www.w3.org/XML/1998/namespace")
+ self.assertRaises(ValueError, self.etree.register_namespace,
+ "xml", "http://www.w3.org/XML/2345")
+ self.etree.register_namespace("xml", "http://www.w3.org/XML/1998/namespace") # ok
+
+ def test_namespaces(self):
+ etree = self.etree
+
+ r = {'foo': 'http://ns.infrae.com/foo'}
+ e = etree.Element('{http://ns.infrae.com/foo}bar', nsmap=r)
+ self.assertEqual(
+ 'foo',
+ e.prefix)
+ self.assertEqual(
+ _bytes('<foo:bar xmlns:foo="http://ns.infrae.com/foo"></foo:bar>'),
+ self._writeElement(e))
+
+ def test_namespaces_default(self):
+ etree = self.etree
+
+ r = {None: 'http://ns.infrae.com/foo'}
+ e = etree.Element('{http://ns.infrae.com/foo}bar', nsmap=r)
+ self.assertEqual(
+ None,
+ e.prefix)
+ self.assertEqual(
+ '{http://ns.infrae.com/foo}bar',
+ e.tag)
+ self.assertEqual(
+ _bytes('<bar xmlns="http://ns.infrae.com/foo"></bar>'),
+ self._writeElement(e))
+
+ def test_namespaces_default_and_other(self):
+ etree = self.etree
+
+ r = {None: 'http://ns.infrae.com/foo', 'p': 'http://test/'}
+ e = etree.Element('{http://ns.infrae.com/foo}bar', nsmap=r)
+ self.assertEqual(None, e.prefix)
+ self.assertEqual('{http://ns.infrae.com/foo}bar', e.tag)
+ self.assertEqual(
+ _bytes('<bar xmlns="http://ns.infrae.com/foo" xmlns:p="http://test/"></bar>'),
+ self._writeElement(e))
+
+ def test_namespaces_default_and_attr(self):
+ etree = self.etree
+
+ r = {None: 'http://ns.infrae.com/foo',
+ 'hoi': 'http://ns.infrae.com/hoi'}
+ e = etree.Element('{http://ns.infrae.com/foo}bar', nsmap=r)
+ e.set('{http://ns.infrae.com/hoi}test', 'value')
+ self.assertEqual(
+ _bytes('<bar xmlns="http://ns.infrae.com/foo" xmlns:hoi="http://ns.infrae.com/hoi" hoi:test="value"></bar>'),
+ self._writeElement(e))
+
+ def test_attribute_keeps_namespace_prefix_on_merge(self):
+ etree = self.etree
+
+ root = etree.Element('{http://test/ns}root',
+ nsmap={None: 'http://test/ns'})
+ sub = etree.Element('{http://test/ns}sub',
+ nsmap={'test': 'http://test/ns'})
+
+ sub.attrib['{http://test/ns}attr'] = 'value'
+ self.assertEqual(sub.attrib['{http://test/ns}attr'], 'value')
+ self.assertEqual(
+ _bytes('<test:sub xmlns:test="http://test/ns" test:attr="value"/>'),
+ etree.tostring(sub))
+
+ root.append(sub)
+ self.assertEqual(
+ _bytes('<root xmlns="http://test/ns">'
+ '<sub xmlns:test="http://test/ns" test:attr="value"/>'
+ '</root>'),
+ etree.tostring(root))
+
+ def test_attribute_keeps_namespace_prefix_on_merge_with_nons(self):
+ etree = self.etree
+
+ root = etree.Element('root')
+ sub = etree.Element('{http://test/ns}sub',
+ nsmap={'test': 'http://test/ns'})
+
+ sub.attrib['{http://test/ns}attr'] = 'value'
+ self.assertEqual(sub.attrib['{http://test/ns}attr'], 'value')
+ self.assertEqual(
+ _bytes('<test:sub xmlns:test="http://test/ns" test:attr="value"/>'),
+ etree.tostring(sub))
+
+ root.append(sub)
+ self.assertEqual(
+ _bytes('<root>'
+ '<test:sub xmlns:test="http://test/ns" test:attr="value"/>'
+ '</root>'),
+ etree.tostring(root))
+
+ def test_attribute_gets_namespace_prefix_on_merge_with_nons(self):
+ etree = self.etree
+
+ root = etree.Element('root')
+ sub = etree.Element('{http://test/ns}sub',
+ nsmap={None: 'http://test/ns'})
+
+ sub.attrib['{http://test/ns}attr'] = 'value'
+ self.assertEqual(sub.attrib['{http://test/ns}attr'], 'value')
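+ # Attributes never use the default (unprefixed) namespace, so a prefix ('ns0') is generated.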
+ self.assertEqual(
+ _bytes('<sub xmlns="http://test/ns" '
+ 'xmlns:ns0="http://test/ns" ns0:attr="value"/>'),
+ etree.tostring(sub))
+
+ root.append(sub)
+ self.assertEqual(
+ _bytes('<root>'
+ '<sub xmlns="http://test/ns"'
+ ' xmlns:ns0="http://test/ns" ns0:attr="value"/>'
+ '</root>'),
+ etree.tostring(root))
+
+ def test_attribute_gets_namespace_prefix_on_merge(self):
+ etree = self.etree
+
+ root = etree.Element('{http://test/ns}root',
+ nsmap={'test': 'http://test/ns',
+ None: 'http://test/ns'})
+ sub = etree.Element('{http://test/ns}sub',
+ nsmap={None: 'http://test/ns'})
+
+ sub.attrib['{http://test/ns}attr'] = 'value'
+ self.assertEqual(sub.attrib['{http://test/ns}attr'], 'value')
+ self.assertEqual(
+ _bytes('<sub xmlns="http://test/ns" '
+ 'xmlns:ns0="http://test/ns" ns0:attr="value"/>'),
+ etree.tostring(sub))
+
+ root.append(sub)
+ self.assertEqual(
+ _bytes('<test:root xmlns:test="http://test/ns" xmlns="http://test/ns">'
+ '<test:sub test:attr="value"/>'
+ '</test:root>'),
+ etree.tostring(root))
+
+ def test_namespaces_elementtree(self):
+ etree = self.etree
+ r = {None: 'http://ns.infrae.com/foo',
+ 'hoi': 'http://ns.infrae.com/hoi'}
+ e = etree.Element('{http://ns.infrae.com/foo}z', nsmap=r)
+ tree = etree.ElementTree(element=e)
+ etree.SubElement(e, '{http://ns.infrae.com/hoi}x')
+ self.assertEqual(
+ _bytes('<z xmlns="http://ns.infrae.com/foo" xmlns:hoi="http://ns.infrae.com/hoi"><hoi:x></hoi:x></z>'),
+ self._writeElement(e))
+
+ def test_namespaces_default_copy_element(self):
+ etree = self.etree
+
+ r = {None: 'http://ns.infrae.com/foo'}
+ e1 = etree.Element('{http://ns.infrae.com/foo}bar', nsmap=r)
+ e2 = etree.Element('{http://ns.infrae.com/foo}bar', nsmap=r)
+
+ e1.append(e2)
+
+ self.assertEqual(
+ None,
+ e1.prefix)
+ self.assertEqual(
+ None,
+ e1[0].prefix)
+ self.assertEqual(
+ '{http://ns.infrae.com/foo}bar',
+ e1.tag)
+ self.assertEqual(
+ '{http://ns.infrae.com/foo}bar',
+ e1[0].tag)
+
+ def test_namespaces_copy_element(self):
+ etree = self.etree
+
+ r = {None: 'http://ns.infrae.com/BAR'}
+ e1 = etree.Element('{http://ns.infrae.com/BAR}bar', nsmap=r)
+ e2 = etree.Element('{http://ns.infrae.com/foo}bar', nsmap=r)
+
+ e1.append(e2)
+
+ self.assertEqual(
+ None,
+ e1.prefix)
+ self.assertNotEqual(
+ None,
+ e2.prefix)
+ self.assertEqual(
+ '{http://ns.infrae.com/BAR}bar',
+ e1.tag)
+ self.assertEqual(
+ '{http://ns.infrae.com/foo}bar',
+ e2.tag)
+
+ def test_namespaces_reuse_after_move(self):
+ ns_href = "http://a.b.c"
+ one = self.etree.fromstring(
+ _bytes('<foo><bar xmlns:ns="%s"><ns:baz/></bar></foo>' % ns_href))
+ baz = one[0][0]
+
+ two = self.etree.fromstring(
+ _bytes('<root xmlns:ns="%s"/>' % ns_href))
+ two.append(baz)
+ del one # make sure the source document is deallocated
+
+ self.assertEqual('{%s}baz' % ns_href, baz.tag)
+ self.assertEqual(
+ _bytes('<root xmlns:ns="%s"><ns:baz/></root>' % ns_href),
+ self.etree.tostring(two))
+
+ def test_namespace_cleanup(self):
+ xml = _bytes(
+ '<foo xmlns="F" xmlns:x="x">'
+ '<bar xmlns:ns="NS" xmlns:b="b" xmlns="B">'
+ '<ns:baz/>'
+ '</bar></foo>'
+ )
+ root = self.etree.fromstring(xml)
+ self.assertEqual(xml, self.etree.tostring(root))
+ self.etree.cleanup_namespaces(root)
+ self.assertEqual(
+ _bytes('<foo xmlns="F"><bar xmlns:ns="NS" xmlns="B"><ns:baz/></bar></foo>'),
+ self.etree.tostring(root))
+
+ def test_namespace_cleanup_attributes(self):
+ xml = _bytes(
+ '<foo xmlns="F" xmlns:x="X" xmlns:a="A">'
+ '<bar xmlns:ns="NS" xmlns:b="b" xmlns="B">'
+ '<ns:baz a:test="attr"/>'
+ '</bar></foo>'
+ )
+ root = self.etree.fromstring(xml)
+ self.assertEqual(xml, self.etree.tostring(root))
+ self.etree.cleanup_namespaces(root)
+ self.assertEqual(
+ _bytes('<foo xmlns="F" xmlns:a="A">'
+ '<bar xmlns:ns="NS" xmlns="B">'
+ '<ns:baz a:test="attr"/>'
+ '</bar></foo>'),
+ self.etree.tostring(root))
+
+ def test_namespace_cleanup_many(self):
+ xml = ('<n12:foo ' +
+ ' '.join('xmlns:n{n}="NS{n}"'.format(n=i) for i in range(100)) +
+ '><n68:a/></n12:foo>').encode('utf8')
+ root = self.etree.fromstring(xml)
+ self.assertEqual(xml, self.etree.tostring(root))
+ self.etree.cleanup_namespaces(root)
+ self.assertEqual(
+ b'<n12:foo xmlns:n12="NS12" xmlns:n68="NS68"><n68:a/></n12:foo>',
+ self.etree.tostring(root))
+
+ def test_namespace_cleanup_deep(self):
+ xml = ('<root>' +
+ ''.join('<a xmlns:n{n}="NS{n}">'.format(n=i) for i in range(100)) +
+ '<n64:x/>' + '</a>'*100 + '</root>').encode('utf8')
+ root = self.etree.fromstring(xml)
+ self.assertEqual(xml, self.etree.tostring(root))
+ self.etree.cleanup_namespaces(root)
+ self.assertEqual(
+ b'<root>' + b'<a>'*64 + b'<a xmlns:n64="NS64">' + b'<a>'*35 +
+ b'<n64:x/>' + b'</a>'*100 + b'</root>',
+ self.etree.tostring(root))
+
+ def test_namespace_cleanup_deep_to_top(self):
+ xml = ('<root>' +
+ ''.join('<a xmlns:n{n}="NS{n}">'.format(n=i) for i in range(100)) +
+ '<n64:x xmlns:a="A" a:attr="X"/>' +
+ '</a>'*100 +
+ '</root>').encode('utf8')
+ root = self.etree.fromstring(xml)
+ self.assertEqual(xml, self.etree.tostring(root))
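+ # top_nsmap moves the still-used 'n64' prefix declaration up to the root element.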
+ self.etree.cleanup_namespaces(root, top_nsmap={'n64': 'NS64'})
+ self.assertEqual(
+ b'<root xmlns:n64="NS64">' + b'<a>'*100 +
+ b'<n64:x xmlns:a="A" a:attr="X"/>' + b'</a>'*100 + b'</root>',
+ self.etree.tostring(root))
+
+ def test_namespace_cleanup_keep_prefixes(self):
+ xml = ('<root xmlns:n64="NS64" xmlns:foo="FOO" xmlns:unused1="UNUSED" xmlns:no="NO">'
+ '<a xmlns:unused2="UNUSED"><n64:x xmlns:a="A" a:attr="X"/></a>'
+ '<foo>foo:bar</foo>'
+ '</root>').encode('utf8')
+ root = self.etree.fromstring(xml)
+ self.assertEqual(xml, self.etree.tostring(root))
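+ # keep_ns_prefixes retains the 'foo' declaration even though nothing uses it as a prefix.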
+ self.etree.cleanup_namespaces(root, keep_ns_prefixes=['foo'])
+ self.assertEqual(
+ b'<root xmlns:n64="NS64" xmlns:foo="FOO">'
+ b'<a><n64:x xmlns:a="A" a:attr="X"/></a>'
+ b'<foo>foo:bar</foo>'
+ b'</root>',
+ self.etree.tostring(root))
+
+ def test_namespace_cleanup_keep_prefixes_top(self):
+ xml = ('<root xmlns:n64="NS64" xmlns:unused1="UNUSED" xmlns:no="NO">'
+ '<sub xmlns:foo="FOO">'
+ '<a xmlns:unused2="UNUSED"><n64:x xmlns:a="A" a:attr="X"/></a>'
+ '<foo>foo:bar</foo>'
+ '</sub>'
+ '</root>').encode('utf8')
+ root = self.etree.fromstring(xml)
+ self.assertEqual(xml, self.etree.tostring(root))
+ self.etree.cleanup_namespaces(
+ root,
+ top_nsmap={'foo': 'FOO', 'unused1': 'UNUSED'},
+ keep_ns_prefixes=['foo'])
+ self.assertEqual(
+ b'<root xmlns:n64="NS64" xmlns:foo="FOO">'
+ b'<sub>'
+ b'<a><n64:x xmlns:a="A" a:attr="X"/></a>'
+ b'<foo>foo:bar</foo>'
+ b'</sub>'
+ b'</root>',
+ self.etree.tostring(root))
+
+ def test_element_nsmap(self):
+ etree = self.etree
+
+ r = {None: 'http://ns.infrae.com/foo',
+ 'hoi': 'http://ns.infrae.com/hoi'}
+ e = etree.Element('{http://ns.infrae.com/foo}bar', nsmap=r)
+ self.assertEqual(
+ r,
+ e.nsmap)
+
+ def test_subelement_nsmap(self):
+ etree = self.etree
+
+ re = {None: 'http://ns.infrae.com/foo',
+ 'hoi': 'http://ns.infrae.com/hoi'}
+ e = etree.Element('{http://ns.infrae.com/foo}bar', nsmap=re)
+
+ rs = {None: 'http://ns.infrae.com/honk',
+ 'top': 'http://ns.infrae.com/top'}
+ s = etree.SubElement(e, '{http://ns.infrae.com/honk}bar', nsmap=rs)
+
+ r = re.copy()
+ r.update(rs)
+ self.assertEqual(re, e.nsmap)
+ self.assertEqual(r, s.nsmap)
+
+ def test_html_prefix_nsmap(self):
+ etree = self.etree
+ el = etree.HTML('<hha:page-description>aa</hha:page-description>').find('.//page-description')
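+ # The unknown 'hha' prefix survives HTML parsing but is mapped to no namespace URI.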
+ self.assertEqual({'hha': None}, el.nsmap)
+
+ def test_getchildren(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(b, 'd')
+ e = SubElement(c, 'e')
+ self.assertEqual(
+ _bytes('<a><b><d></d></b><c><e></e></c></a>'),
+ self.etree.tostring(a, method="c14n"))
+ self.assertEqual(
+ [b, c],
+ a.getchildren())
+ self.assertEqual(
+ [d],
+ b.getchildren())
+ self.assertEqual(
+ [],
+ d.getchildren())
+
+ def test_getiterator(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(b, 'd')
+ e = SubElement(c, 'e')
+
+ self.assertEqual(
+ [a, b, d, c, e],
+ list(a.getiterator()))
+ self.assertEqual(
+ [d],
+ list(d.getiterator()))
+
+ def test_getiterator_empty(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(b, 'd')
+ e = SubElement(c, 'e')
+
+ self.assertEqual(
+ [],
+ list(a.getiterator('none')))
+ self.assertEqual(
+ [],
+ list(e.getiterator('none')))
+ self.assertEqual(
+ [e],
+ list(e.getiterator()))
+
+ def test_getiterator_filter(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(b, 'd')
+ e = SubElement(c, 'e')
+
+ self.assertEqual(
+ [a],
+ list(a.getiterator('a')))
+ a2 = SubElement(e, 'a')
+ self.assertEqual(
+ [a, a2],
+ list(a.getiterator('a')))
+ self.assertEqual(
+ [a2],
+ list(c.getiterator('a')))
+
+ def test_getiterator_filter_all(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(b, 'd')
+ e = SubElement(c, 'e')
+
+ self.assertEqual(
+ [a, b, d, c, e],
+ list(a.getiterator('*')))
+
+ def test_getiterator_filter_comment(self):
+ Element = self.etree.Element
+ Comment = self.etree.Comment
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ comment_b = Comment("TEST-b")
+ b.append(comment_b)
+
+ self.assertEqual(
+ [comment_b],
+ list(a.getiterator(Comment)))
+
+ comment_a = Comment("TEST-a")
+ a.append(comment_a)
+
+ self.assertEqual(
+ [comment_b, comment_a],
+ list(a.getiterator(Comment)))
+
+ self.assertEqual(
+ [comment_b],
+ list(b.getiterator(Comment)))
+
+ def test_getiterator_filter_pi(self):
+ Element = self.etree.Element
+ PI = self.etree.ProcessingInstruction
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ pi_b = PI("TEST-b")
+ b.append(pi_b)
+
+ self.assertEqual(
+ [pi_b],
+ list(a.getiterator(PI)))
+
+ pi_a = PI("TEST-a")
+ a.append(pi_a)
+
+ self.assertEqual(
+ [pi_b, pi_a],
+ list(a.getiterator(PI)))
+
+ self.assertEqual(
+ [pi_b],
+ list(b.getiterator(PI)))
+
+ def test_getiterator_with_text(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ a.text = 'a'
+ b = SubElement(a, 'b')
+ b.text = 'b'
+ b.tail = 'b1'
+ c = SubElement(a, 'c')
+ c.text = 'c'
+ c.tail = 'c1'
+ d = SubElement(b, 'd')
+ d.text = 'd'
+ d.tail = 'd1'
+ e = SubElement(c, 'e')
+ e.text = 'e'
+ e.tail = 'e1'
+
+ self.assertEqual(
+ [a, b, d, c, e],
+ list(a.getiterator()))
+ #self.assertEqual(
+ # [d],
+ # list(d.getiterator()))
+
+ def test_getiterator_filter_with_text(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ a.text = 'a'
+ b = SubElement(a, 'b')
+ b.text = 'b'
+ b.tail = 'b1'
+ c = SubElement(a, 'c')
+ c.text = 'c'
+ c.tail = 'c1'
+ d = SubElement(b, 'd')
+ d.text = 'd'
+ d.tail = 'd1'
+ e = SubElement(c, 'e')
+ e.text = 'e'
+ e.tail = 'e1'
+
+ self.assertEqual(
+ [a],
+ list(a.getiterator('a')))
+ a2 = SubElement(e, 'a')
+ self.assertEqual(
+ [a, a2],
+ list(a.getiterator('a')))
+ self.assertEqual(
+ [a2],
+ list(e.getiterator('a')))
+
+ def test_getiterator_filter_multiple(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(b, 'd')
+ e = SubElement(c, 'e')
+ f = SubElement(c, 'f')
+
+ self.assertEqual(
+ [a, b],
+ list(a.getiterator('a', 'b')))
+ self.assertEqual(
+ [],
+ list(a.getiterator('x', 'y')))
+ self.assertEqual(
+ [a, f],
+ list(a.getiterator('f', 'a')))
+ self.assertEqual(
+ [c, e, f],
+ list(c.getiterator('c', '*', 'a')))
+ self.assertEqual(
+ [],
+ list(a.getiterator( (), () )))
+
+ def test_getiterator_filter_multiple_tuple(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(b, 'd')
+ e = SubElement(c, 'e')
+ f = SubElement(c, 'f')
+
+ self.assertEqual(
+ [a, b],
+ list(a.getiterator( ('a', 'b') )))
+ self.assertEqual(
+ [],
+ list(a.getiterator( ('x', 'y') )))
+ self.assertEqual(
+ [a, f],
+ list(a.getiterator( ('f', 'a') )))
+ self.assertEqual(
+ [c, e, f],
+ list(c.getiterator( ('c', '*', 'a') )))
+ self.assertEqual(
+ [],
+ list(a.getiterator( () )))
+
+ def test_getiterator_filter_namespace(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('{a}a')
+ b = SubElement(a, '{a}b')
+ c = SubElement(a, '{a}c')
+ d = SubElement(b, '{b}d')
+ e = SubElement(c, '{a}e')
+ f = SubElement(c, '{b}f')
+ g = SubElement(c, 'g')
+
+ self.assertEqual(
+ [a],
+ list(a.getiterator('{a}a')))
+ self.assertEqual(
+ [],
+ list(a.getiterator('{b}a')))
+ self.assertEqual(
+ [],
+ list(a.getiterator('a')))
+ self.assertEqual(
+ [a,b,d,c,e,f,g],
+ list(a.getiterator('*')))
+ self.assertEqual(
+ [f],
+ list(c.getiterator('{b}*')))
+ self.assertEqual(
+ [d, f],
+ list(a.getiterator('{b}*')))
+ self.assertEqual(
+ [g],
+ list(a.getiterator('g')))
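+ # '{}tag' and '{}*' explicitly select tags that have no namespace.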
+ self.assertEqual(
+ [g],
+ list(a.getiterator('{}g')))
+ self.assertEqual(
+ [g],
+ list(a.getiterator('{}*')))
+
+ def test_getiterator_filter_local_name(self):
+ Element = self.etree.Element
+ Comment = self.etree.Comment
+ SubElement = self.etree.SubElement
+
+ a = Element('{a}a')
+ b = SubElement(a, '{nsA}b')
+ c = SubElement(b, '{nsB}b')
+ d = SubElement(a, 'b')
+ e = SubElement(a, '{nsA}e')
+ f = SubElement(e, '{nsB}e')
+ g = SubElement(e, 'e')
+ a.append(Comment('test'))
+
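+ # '{*}name' matches the local name in any namespace, including no namespace at all.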
+ self.assertEqual(
+ [b, c, d],
+ list(a.getiterator('{*}b')))
+ self.assertEqual(
+ [e, f, g],
+ list(a.getiterator('{*}e')))
+ self.assertEqual(
+ [a, b, c, d, e, f, g],
+ list(a.getiterator('{*}*')))
+
+ def test_getiterator_filter_entities(self):
+ Element = self.etree.Element
+ Entity = self.etree.Entity
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ entity_b = Entity("TEST-b")
+ b.append(entity_b)
+
+ self.assertEqual(
+ [entity_b],
+ list(a.getiterator(Entity)))
+
+ entity_a = Entity("TEST-a")
+ a.append(entity_a)
+
+ self.assertEqual(
+ [entity_b, entity_a],
+ list(a.getiterator(Entity)))
+
+ self.assertEqual(
+ [entity_b],
+ list(b.getiterator(Entity)))
+
+ def test_getiterator_filter_element(self):
+ Element = self.etree.Element
+ Comment = self.etree.Comment
+ PI = self.etree.PI
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ a.append(Comment("test"))
+ a.append(PI("pi", "content"))
+ c = SubElement(a, 'c')
+
+ self.assertEqual(
+ [a, b, c],
+ list(a.getiterator(Element)))
+
+ def test_getiterator_filter_all_comment_pi(self):
+ # ElementTree iterates over everything here
+ Element = self.etree.Element
+ Comment = self.etree.Comment
+ PI = self.etree.PI
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ a.append(Comment("test"))
+ a.append(PI("pi", "content"))
+ c = SubElement(a, 'c')
+
+ self.assertEqual(
+ [a, b, c],
+ list(a.getiterator('*')))
+
+ def test_elementtree_getiterator(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ ElementTree = self.etree.ElementTree
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(b, 'd')
+ e = SubElement(c, 'e')
+ t = ElementTree(element=a)
+
+ self.assertEqual(
+ [a, b, d, c, e],
+ list(t.getiterator()))
+
+ def test_elementtree_getiterator_filter(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ ElementTree = self.etree.ElementTree
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(b, 'd')
+ e = SubElement(c, 'e')
+ t = ElementTree(element=a)
+
+ self.assertEqual(
+ [a],
+ list(t.getiterator('a')))
+ a2 = SubElement(e, 'a')
+ self.assertEqual(
+ [a, a2],
+ list(t.getiterator('a')))
+
+ def test_elementtree_getelementpath(self):
+ a = etree.Element("a")
+ b = etree.SubElement(a, "b")
+ c = etree.SubElement(a, "c")
+ d1 = etree.SubElement(c, "d")
+ d2 = etree.SubElement(c, "d")
+ c.text = d1.text = 'TEXT'
+
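+ # getelementpath() returns a structural ElementPath that find() resolves back to the same element.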
+ tree = etree.ElementTree(a)
+ self.assertEqual('.', tree.getelementpath(a))
+ self.assertEqual('c/d[1]', tree.getelementpath(d1))
+ self.assertEqual('c/d[2]', tree.getelementpath(d2))
+
+ self.assertEqual(d1, tree.find(tree.getelementpath(d1)))
+ self.assertEqual(d2, tree.find(tree.getelementpath(d2)))
+
+ tree = etree.ElementTree(c)
+ self.assertEqual('.', tree.getelementpath(c))
+ self.assertEqual('d[2]', tree.getelementpath(d2))
+ self.assertEqual(d2, tree.find(tree.getelementpath(d2)))
+
+ tree = etree.ElementTree(b) # not a parent of a/c/d1/d2
+ self.assertEqual('.', tree.getelementpath(b))
+ self.assertRaises(ValueError, tree.getelementpath, a)
+ self.assertRaises(ValueError, tree.getelementpath, c)
+ self.assertRaises(ValueError, tree.getelementpath, d2)
+
+ def test_elementtree_getelementpath_ns(self):
+ a = etree.Element("{http://ns1/}a")
+ b = etree.SubElement(a, "{http://ns1/}b")
+ c = etree.SubElement(a, "{http://ns1/}c")
+ d1 = etree.SubElement(c, "{http://ns1/}d")
+ d2 = etree.SubElement(c, "{http://ns2/}d")
+ d3 = etree.SubElement(c, "{http://ns1/}d")
+
+ tree = etree.ElementTree(a)
+ self.assertEqual('.', tree.getelementpath(a))
+ self.assertEqual('{http://ns1/}c/{http://ns1/}d[1]',
+ tree.getelementpath(d1))
+ self.assertEqual('{http://ns1/}c/{http://ns2/}d',
+ tree.getelementpath(d2))
+ self.assertEqual('{http://ns1/}c/{http://ns1/}d[2]',
+ tree.getelementpath(d3))
+
+ self.assertEqual(a, tree.find(tree.getelementpath(a)))
+ self.assertEqual(b, tree.find(tree.getelementpath(b)))
+ self.assertEqual(c, tree.find(tree.getelementpath(c)))
+ self.assertEqual(d1, tree.find(tree.getelementpath(d1)))
+ self.assertEqual(d2, tree.find(tree.getelementpath(d2)))
+ self.assertEqual(d3, tree.find(tree.getelementpath(d3)))
+
+ tree = etree.ElementTree(c)
+ self.assertEqual('{http://ns1/}d[1]', tree.getelementpath(d1))
+ self.assertEqual('{http://ns2/}d', tree.getelementpath(d2))
+ self.assertEqual('{http://ns1/}d[2]', tree.getelementpath(d3))
+ self.assertEqual(d1, tree.find(tree.getelementpath(d1)))
+ self.assertEqual(d2, tree.find(tree.getelementpath(d2)))
+ self.assertEqual(d3, tree.find(tree.getelementpath(d3)))
+
+ tree = etree.ElementTree(b) # not a parent of d1/d2
+ self.assertRaises(ValueError, tree.getelementpath, d1)
+ self.assertRaises(ValueError, tree.getelementpath, d2)
+
+ def test_elementtree_iter_qname(self):
+ XML = self.etree.XML
+ ElementTree = self.etree.ElementTree
+ QName = self.etree.QName
+ tree = ElementTree(XML(
+ _bytes('<a xmlns:x="X" xmlns:y="Y"><x:b><c/></x:b><b/><c><x:b/><b/></c><b/></a>')))
+ self.assertEqual(
+ list(tree.iter(QName("b"))),
+ list(tree.iter("b")),
+ )
+ self.assertEqual(
+ list(tree.iter(QName("X", "b"))),
+ list(tree.iter("{X}b")),
+ )
+
+ self.assertEqual(
+ [e.tag for e in tree.iter(QName("X", "b"), QName("b"))],
+ ['{X}b', 'b', '{X}b', 'b', 'b']
+ )
+ self.assertEqual(
+ list(tree.iter(QName("X", "b"), QName("b"))),
+ list(tree.iter("{X}b", "b"))
+ )
+
+ def test_elementtree_find_qname(self):
+ XML = self.etree.XML
+ ElementTree = self.etree.ElementTree
+ QName = self.etree.QName
+ tree = ElementTree(XML(_bytes('<a><b><c/></b><b/><c><b/></c></a>')))
+ self.assertEqual(tree.find(QName("c")), tree.getroot()[2])
+
+ def test_elementtree_findall_qname(self):
+ XML = self.etree.XML
+ ElementTree = self.etree.ElementTree
+ QName = self.etree.QName
+ tree = ElementTree(XML(_bytes('<a><b><c/></b><b/><c><b/></c></a>')))
+ self.assertEqual(len(list(tree.findall(QName("c")))), 1)
+
+ def test_elementtree_findall_ns_qname(self):
+ XML = self.etree.XML
+ ElementTree = self.etree.ElementTree
+ QName = self.etree.QName
+ tree = ElementTree(XML(
+ _bytes('<a xmlns:x="X" xmlns:y="Y"><x:b><c/></x:b><b/><c><x:b/><b/></c><b/></a>')))
+ self.assertEqual(len(list(tree.findall(QName("b")))), 2)
+ self.assertEqual(len(list(tree.findall(QName("X", "b")))), 1)
+
+ def test_findall_ns(self):
+ XML = self.etree.XML
+ root = XML(_bytes('<a xmlns:x="X" xmlns:y="Y"><x:b><c/></x:b><b/><c><x:b/><b/></c><b/></a>'))
+ self.assertEqual(len(root.findall(".//{X}b")), 2)
+ self.assertEqual(len(root.findall(".//{X}*")), 2)
+ self.assertEqual(len(root.findall(".//b")), 3)
+
+ def test_findall_different_nsmaps(self):
+ XML = self.etree.XML
+ root = XML(_bytes('<a xmlns:x="X" xmlns:y="Y"><x:b><c/></x:b><b/><c><x:b/><b/></c><y:b/></a>'))
+ nsmap = {'xx': 'X'}
+ self.assertEqual(len(root.findall(".//xx:b", namespaces=nsmap)), 2)
+ self.assertEqual(len(root.findall(".//xx:*", namespaces=nsmap)), 2)
+ self.assertEqual(len(root.findall(".//b", namespaces=nsmap)), 2)
+ nsmap = {'xx': 'Y'}
+ self.assertEqual(len(root.findall(".//xx:b", namespaces=nsmap)), 1)
+ self.assertEqual(len(root.findall(".//xx:*", namespaces=nsmap)), 1)
+ self.assertEqual(len(root.findall(".//b", namespaces=nsmap)), 2)
+
+ def test_findall_empty_prefix(self):
+ XML = self.etree.XML
+ root = XML(_bytes('<a xmlns:x="X" xmlns:y="Y"><x:b><c/></x:b><b/><c><x:b/><b/></c><y:b/></a>'))
+ nsmap = {'xx': 'X'}
+ self.assertEqual(len(root.findall(".//xx:b", namespaces=nsmap)), 2)
+ nsmap = {'xx': 'X', None: 'Y'}
+ self.assertEqual(len(root.findall(".//b", namespaces=nsmap)), 1)
+ nsmap = {'xx': 'X', '': 'Y'}
+ self.assertEqual(len(root.findall(".//b", namespaces=nsmap)), 1)
+
+ def test_findall_syntax_error(self):
+ XML = self.etree.XML
+ root = XML(_bytes('<a><b><c/></b><b/><c><b/><b/></c><b/></a>'))
+ self.assertRaises(SyntaxError, root.findall, '')
+ self.assertRaises(SyntaxError, root.findall, '//') # absolute path on Element
+ self.assertRaises(SyntaxError, root.findall, './//')
+
+ def test_index(self):
+ etree = self.etree
+ e = etree.Element('foo')
+ for i in range(10):
+ etree.SubElement(e, 'a%s' % i)
+ for i in range(10):
+ self.assertEqual(
+ i,
+ e.index(e[i]))
+ self.assertEqual(
+ 3, e.index(e[3], 3))
+ self.assertRaises(
+ ValueError, e.index, e[3], 4)
+ self.assertRaises(
+ ValueError, e.index, e[3], 0, 2)
+ self.assertRaises(
+ ValueError, e.index, e[8], 0, -3)
+ self.assertRaises(
+ ValueError, e.index, e[8], -5, -3)
+ self.assertEqual(
+ 8, e.index(e[8], 0, -1))
+ self.assertEqual(
+ 8, e.index(e[8], -12, -1))
+ self.assertEqual(
+ 0, e.index(e[0], -12, -1))
+
+ def test_replace(self):
+ etree = self.etree
+ e = etree.Element('foo')
+ for i in range(10):
+ el = etree.SubElement(e, 'a%s' % i)
+ el.text = "text%d" % i
+ el.tail = "tail%d" % i
+
+ child0 = e[0]
+ child1 = e[1]
+ child2 = e[2]
+
+ e.replace(e[0], e[1])
+ self.assertEqual(
+ 9, len(e))
+ self.assertEqual(
+ child1, e[0])
+ self.assertEqual(
+ child1.text, "text1")
+ self.assertEqual(
+ child1.tail, "tail1")
+ self.assertEqual(
+ child0.tail, "tail0")
+ self.assertEqual(
+ child2, e[1])
+
+ e.replace(e[-1], e[0])
+ self.assertEqual(
+ child1, e[-1])
+ self.assertEqual(
+ child1.text, "text1")
+ self.assertEqual(
+ child1.tail, "tail1")
+ self.assertEqual(
+ child2, e[0])
+
+ def test_replace_new(self):
+ etree = self.etree
+ e = etree.Element('foo')
+ for i in range(10):
+ etree.SubElement(e, 'a%s' % i)
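+ # collect_ids=False disables ID collection during parsing, so the returned ID dict stays empty.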
+
+ new_element = etree.Element("test")
+ new_element.text = "TESTTEXT"
+ new_element.tail = "TESTTAIL"
+ child1 = e[1]
+ e.replace(e[0], new_element)
+ self.assertEqual(
+ new_element, e[0])
+ self.assertEqual(
+ "TESTTEXT",
+ e[0].text)
+ self.assertEqual(
+ "TESTTAIL",
+ e[0].tail)
+ self.assertEqual(
+ child1, e[1])
+
+ def test_setslice_all_reversed(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+
+ e = Element('e')
+ f = Element('f')
+ g = Element('g')
+
+ a[:] = [e, f, g]
+ self.assertEqual(
+ [e, f, g],
+ list(a))
+
+ a[::-1] = [e, f, g]
+ self.assertEqual(
+ [g, f, e],
+ list(a))
+
+ def test_setslice_step(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(a, 'd')
+ e = SubElement(a, 'e')
+
+ x = Element('x')
+ y = Element('y')
+
+ a[1::2] = [x, y]
+ self.assertEqual(
+ [b, x, d, y],
+ list(a))
+
+ def test_setslice_step_negative(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(a, 'd')
+ e = SubElement(a, 'e')
+
+ x = Element('x')
+ y = Element('y')
+
+ a[1::-1] = [x, y]
+ self.assertEqual(
+ [y, x, d, e],
+ list(a))
+
+ def test_setslice_step_negative2(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(a, 'd')
+ e = SubElement(a, 'e')
+
+ x = Element('x')
+ y = Element('y')
+
+ a[::-2] = [x, y]
+ self.assertEqual(
+ [b, y, d, x],
+ list(a))
+
+ def test_setslice_step_overrun(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ try:
+ slice
+ except NameError:
+ print("slice() not found")
+ return
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(a, 'd')
+ e = SubElement(a, 'e')
+
+ x = Element('x')
+ y = Element('y')
+ z = Element('z')
+
+ self.assertRaises(
+ ValueError,
+ operator.setitem, a, slice(1,None,2), [x, y, z])
+
+ self.assertEqual(
+ [b, c, d, e],
+ list(a))
+
+ def test_sourceline_XML(self):
+ XML = self.etree.XML
+ root = XML(_bytes('''<?xml version="1.0"?>
+ <root><test>
+
+ <bla/></test>
+ </root>
+ '''))
+
+ self.assertEqual(
+ [2, 2, 4],
+ [ el.sourceline for el in root.getiterator() ])
+
+ def test_large_sourceline_XML(self):
+ XML = self.etree.XML
+ root = XML(_bytes(
+ '<?xml version="1.0"?>\n'
+ '<root>' + '\n' * 65536 +
+ '<p>' + '\n' * 65536 + '</p>\n' +
+ '<br/>\n'
+ '</root>'))
+
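+ # libxml2 versions before 2.9 cannot report source lines above 65535 and saturate there.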
+ if self.etree.LIBXML_VERSION >= (2, 9):
+ expected = [2, 131074, 131076]
+ else:
+ expected = [2, 65535, 65535]
+
+ self.assertEqual(expected, [el.sourceline for el in root.iter()])
+
+ def test_sourceline_parse(self):
+ parse = self.etree.parse
+ tree = parse(fileInTestDir('include/test_xinclude.xml'))
+
+ self.assertEqual(
+ [1, 2, 3],
+ [ el.sourceline for el in tree.getiterator() ])
+
+ def test_sourceline_iterparse_end(self):
+ iterparse = self.etree.iterparse
+ lines = [ el.sourceline for (event, el) in
+ iterparse(fileInTestDir('include/test_xinclude.xml')) ]
+
+ self.assertEqual(
+ [2, 3, 1],
+ lines)
+
+ def test_sourceline_iterparse_start(self):
+ iterparse = self.etree.iterparse
+ lines = [ el.sourceline for (event, el) in
+ iterparse(fileInTestDir('include/test_xinclude.xml'),
+ events=("start",)) ]
+
+ self.assertEqual(
+ [1, 2, 3],
+ lines)
+
+ def test_sourceline_element(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ el = Element("test")
+ self.assertEqual(None, el.sourceline)
+
+ child = SubElement(el, "test")
+ self.assertEqual(None, el.sourceline)
+ self.assertEqual(None, child.sourceline)
+
+ def test_XML_base_url_docinfo(self):
+ etree = self.etree
+ root = etree.XML(_bytes("<root/>"), base_url="http://no/such/url")
+ docinfo = root.getroottree().docinfo
+ self.assertEqual(docinfo.URL, "http://no/such/url")
+
+ def test_XML_set_base_url_docinfo(self):
+ etree = self.etree
+ root = etree.XML(_bytes("<root/>"), base_url="http://no/such/url")
+ docinfo = root.getroottree().docinfo
+ self.assertEqual(docinfo.URL, "http://no/such/url")
+ docinfo.URL = "https://secret/url"
+ self.assertEqual(docinfo.URL, "https://secret/url")
+
+ def test_parse_stringio_base_url(self):
+ etree = self.etree
+ tree = etree.parse(BytesIO(_bytes("<root/>")), base_url="http://no/such/url")
+ docinfo = tree.docinfo
+ self.assertEqual(docinfo.URL, "http://no/such/url")
+
+ def test_parse_base_url_docinfo(self):
+ etree = self.etree
+ tree = etree.parse(fileInTestDir('include/test_xinclude.xml'),
+ base_url="http://no/such/url")
+ docinfo = tree.docinfo
+ self.assertEqual(docinfo.URL, "http://no/such/url")
+
+ def test_HTML_base_url_docinfo(self):
+ etree = self.etree
+ root = etree.HTML(_bytes("<html/>"), base_url="http://no/such/url")
+ docinfo = root.getroottree().docinfo
+ self.assertEqual(docinfo.URL, "http://no/such/url")
+
+ def test_docinfo_public(self):
+ etree = self.etree
+ xml_header = '<?xml version="1.0" encoding="ascii"?>'
+ pub_id = "-//W3C//DTD XHTML 1.0 Transitional//EN"
+ sys_id = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"
+ doctype_string = '<!DOCTYPE html PUBLIC "%s" "%s">' % (pub_id, sys_id)
+
+ xml = _bytes(xml_header + doctype_string + '<html><body></body></html>')
+
+ tree = etree.parse(BytesIO(xml))
+ docinfo = tree.docinfo
+ self.assertEqual(docinfo.encoding, "ascii")
+ self.assertEqual(docinfo.xml_version, "1.0")
+ self.assertEqual(docinfo.public_id, pub_id)
+ self.assertEqual(docinfo.system_url, sys_id)
+ self.assertEqual(docinfo.root_name, 'html')
+ self.assertEqual(docinfo.doctype, doctype_string)
+
+ def test_docinfo_system(self):
+ etree = self.etree
+ xml_header = '<?xml version="1.0" encoding="UTF-8"?>'
+ sys_id = "some.dtd"
+ doctype_string = '<!DOCTYPE html SYSTEM "%s">' % sys_id
+ xml = _bytes(xml_header + doctype_string + '<html><body></body></html>')
+
+ tree = etree.parse(BytesIO(xml))
+ docinfo = tree.docinfo
+ self.assertEqual(docinfo.encoding, "UTF-8")
+ self.assertEqual(docinfo.xml_version, "1.0")
+ self.assertEqual(docinfo.public_id, None)
+ self.assertEqual(docinfo.system_url, sys_id)
+ self.assertEqual(docinfo.root_name, 'html')
+ self.assertEqual(docinfo.doctype, doctype_string)
+
+ def test_docinfo_empty(self):
+ etree = self.etree
+ xml = _bytes('<html><body></body></html>')
+ tree = etree.parse(BytesIO(xml))
+ docinfo = tree.docinfo
+ self.assertEqual(docinfo.encoding, "UTF-8")
+ self.assertEqual(docinfo.xml_version, "1.0")
+ self.assertEqual(docinfo.public_id, None)
+ self.assertEqual(docinfo.system_url, None)
+ self.assertEqual(docinfo.root_name, 'html')
+ self.assertEqual(docinfo.doctype, '')
+
+ def test_docinfo_name_only(self):
+ etree = self.etree
+ xml = _bytes('<!DOCTYPE root><root></root>')
+ tree = etree.parse(BytesIO(xml))
+ docinfo = tree.docinfo
+ self.assertEqual(docinfo.encoding, "UTF-8")
+ self.assertEqual(docinfo.xml_version, "1.0")
+ self.assertEqual(docinfo.public_id, None)
+ self.assertEqual(docinfo.system_url, None)
+ self.assertEqual(docinfo.root_name, 'root')
+ self.assertEqual(docinfo.doctype, '<!DOCTYPE root>')
+
+ def test_doctype_name_only_roundtrip(self):
+ etree = self.etree
+ xml = _bytes('<!DOCTYPE root>\n<root/>')
+ tree = etree.parse(BytesIO(xml))
+ self.assertEqual(xml, etree.tostring(tree))
+
+ def test_doctype_output_override(self):
+ etree = self.etree
+ pub_id = "-//W3C//DTD XHTML 1.0 Transitional//EN"
+ sys_id = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"
+ doctype_string = _bytes('<!DOCTYPE html PUBLIC "%s" "%s">' % (pub_id, sys_id))
+
+ xml = _bytes('<!DOCTYPE root>\n<root/>')
+ tree = etree.parse(BytesIO(xml))
+ self.assertEqual(xml.replace(_bytes('<!DOCTYPE root>'), doctype_string),
+ etree.tostring(tree, doctype=doctype_string))
+
+ def test_xml_base(self):
+ etree = self.etree
+ root = etree.XML(_bytes("<root/>"), base_url="http://no/such/url")
+ self.assertEqual(root.base, "http://no/such/url")
+ self.assertEqual(
+ root.get('{http://www.w3.org/XML/1998/namespace}base'), None)
+ root.base = "https://secret/url"
+ self.assertEqual(root.base, "https://secret/url")
+ self.assertEqual(
+ root.get('{http://www.w3.org/XML/1998/namespace}base'),
+ "https://secret/url")
+
+ def test_xml_base_attribute(self):
+ etree = self.etree
+ root = etree.XML(_bytes("<root/>"), base_url="http://no/such/url")
+ self.assertEqual(root.base, "http://no/such/url")
+ self.assertEqual(
+ root.get('{http://www.w3.org/XML/1998/namespace}base'), None)
+ root.set('{http://www.w3.org/XML/1998/namespace}base',
+ "https://secret/url")
+ self.assertEqual(root.base, "https://secret/url")
+ self.assertEqual(
+ root.get('{http://www.w3.org/XML/1998/namespace}base'),
+ "https://secret/url")
+
+ def test_html_base(self):
+ etree = self.etree
+ root = etree.HTML(_bytes("<html><body></body></html>"),
+ base_url="http://no/such/url")
+ self.assertEqual(root.base, "http://no/such/url")
+
+ def test_html_base_tag(self):
+ etree = self.etree
+ root = etree.HTML(_bytes('<html><head><base href="http://no/such/url"></head></html>'))
+ self.assertEqual(root.base, "http://no/such/url")
+
+ def test_indent(self):
+ ET = self.etree
+ elem = ET.XML("<root></root>")
+ ET.indent(elem)
+ self.assertEqual(ET.tostring(elem), b'<root/>')
+
+ elem = ET.XML("<html><body>text</body></html>")
+ ET.indent(elem)
+ self.assertEqual(ET.tostring(elem), b'<html>\n  <body>text</body>\n</html>')
+
+ elem = ET.XML("<html> <body>text</body> </html>")
+ ET.indent(elem)
+ self.assertEqual(ET.tostring(elem), b'<html>\n  <body>text</body>\n</html>')
+
+ elem = ET.XML("<html> <body>text</body> </html>")
+ ET.indent(elem)
+ self.assertEqual(ET.tostring(elem), b'<html>\n  <body>text</body>\n</html>')
+
+ elem = ET.XML("<html><body>text</body>tail</html>")
+ ET.indent(elem)
+ self.assertEqual(ET.tostring(elem), b'<html>\n  <body>text</body>tail</html>')
+
+ elem = ET.XML("<html><body><p>par</p>\n<p>text</p>\t<p><br/></p></body></html>")
+ ET.indent(elem)
+ self.assertEqual(
+ ET.tostring(elem),
+ b'<html>\n'
+ b'  <body>\n'
+ b'    <p>par</p>\n'
+ b'    <p>text</p>\n'
+ b'    <p>\n'
+ b'      <br/>\n'
+ b'    </p>\n'
+ b'  </body>\n'
+ b'</html>'
+ )
+
+ elem = ET.XML("<html><body><p>pre<br/>post</p><p>text</p></body></html>")
+ ET.indent(elem)
+ self.assertEqual(
+ ET.tostring(elem),
+ b'<html>\n'
+ b'  <body>\n'
+ b'    <p>pre<br/>post</p>\n'
+ b'    <p>text</p>\n'
+ b'  </body>\n'
+ b'</html>'
+ )
+
+ def test_indent_space(self):
+ ET = self.etree
+ elem = ET.XML("<html><body><p>pre<br/>post</p><p>text</p></body></html>")
+ ET.indent(elem, space='\t')
+ self.assertEqual(
+ ET.tostring(elem),
+ b'<html>\n'
+ b'\t<body>\n'
+ b'\t\t<p>pre<br/>post</p>\n'
+ b'\t\t<p>text</p>\n'
+ b'\t</body>\n'
+ b'</html>'
+ )
+
+ elem = ET.XML("<html><body><p>pre<br/>post</p><p>text</p></body></html>")
+ ET.indent(elem, space='')
+ self.assertEqual(
+ ET.tostring(elem),
+ b'<html>\n'
+ b'<body>\n'
+ b'<p>pre<br/>post</p>\n'
+ b'<p>text</p>\n'
+ b'</body>\n'
+ b'</html>'
+ )
+
+ def test_indent_space_caching(self):
+ ET = self.etree
+ elem = ET.XML("<html><body><p>par</p><p>text</p><p><br/></p><p /></body></html>")
+ ET.indent(elem)
+ self.assertEqual(
+ {el.tail for el in elem.iter()},
+ {None, "\n", "\n  ", "\n    "}
+ )
+ self.assertEqual(
+ {el.text for el in elem.iter()},
+ {None, "\n  ", "\n    ", "\n      ", "par", "text"}
+ )
+ # NOTE: lxml does not reuse Python text strings across elements.
+ #self.assertEqual(
+ # len({el.tail for el in elem.iter()}),
+ # len({id(el.tail) for el in elem.iter()}),
+ #)
+
+ def test_indent_level(self):
+ ET = self.etree
+ elem = ET.XML("<html><body><p>pre<br/>post</p><p>text</p></body></html>")
+ try:
+ ET.indent(elem, level=-1)
+ except ValueError:
+ pass
+ else:
+ self.assertTrue(False, "ValueError not raised")
+ self.assertEqual(
+ ET.tostring(elem),
+ b"<html><body><p>pre<br/>post</p><p>text</p></body></html>"
+ )
+
+ ET.indent(elem, level=2)
+ self.assertEqual(
+ ET.tostring(elem),
+ b'<html>\n'
+ b'      <body>\n'
+ b'        <p>pre<br/>post</p>\n'
+ b'        <p>text</p>\n'
+ b'      </body>\n'
+ b'    </html>'
+ )
+
+ elem = ET.XML("<html><body><p>pre<br/>post</p><p>text</p></body></html>")
+ ET.indent(elem, level=1, space=' ')
+ self.assertEqual(
+ ET.tostring(elem),
+ b'<html>\n'
+ b'  <body>\n'
+ b'   <p>pre<br/>post</p>\n'
+ b'   <p>text</p>\n'
+ b'  </body>\n'
+ b' </html>'
+ )
+
+ def test_parse_fileobject_unicode(self):
+ # parse from a file object that returns unicode strings
+ f = LargeFileLikeUnicode()
+ tree = self.etree.parse(f)
+ root = tree.getroot()
+ self.assertTrue(root.tag.endswith('root'))
+
+ def test_dtd_io(self):
+ # check that DTDs that go in also go back out
+ xml = _bytes('''\
+ <!DOCTYPE test SYSTEM "test.dtd" [
+ <!ENTITY entity "tasty">
+ <!ELEMENT test (a)>
+ <!ELEMENT a (#PCDATA)>
+ ]>
+ <test><a>test-test</a></test>\
+ ''')
+ tree = self.etree.parse(BytesIO(xml))
+ self.assertEqual(self.etree.tostring(tree).replace(_bytes(" "), _bytes("")),
+ xml.replace(_bytes(" "), _bytes("")))
+
+ def test_byte_zero(self):
+ Element = self.etree.Element
+
+ a = Element('a')
+ self.assertRaises(ValueError, setattr, a, "text", 'ha\0ho')
+ self.assertRaises(ValueError, setattr, a, "tail", 'ha\0ho')
+
+ self.assertRaises(ValueError, Element, 'ha\0ho')
+
+ def test_unicode_byte_zero(self):
+ Element = self.etree.Element
+
+ a = Element('a')
+ self.assertRaises(ValueError, setattr, a, "text",
+ _str('ha\0ho'))
+ self.assertRaises(ValueError, setattr, a, "tail",
+ _str('ha\0ho'))
+
+ self.assertRaises(ValueError, Element,
+ _str('ha\0ho'))
+
+ def test_byte_invalid(self):
+ Element = self.etree.Element
+
+ a = Element('a')
+ self.assertRaises(ValueError, setattr, a, "text", 'ha\x07ho')
+ self.assertRaises(ValueError, setattr, a, "text", 'ha\x02ho')
+
+ self.assertRaises(ValueError, setattr, a, "tail", 'ha\x07ho')
+ self.assertRaises(ValueError, setattr, a, "tail", 'ha\x02ho')
+
+ self.assertRaises(ValueError, Element, 'ha\x07ho')
+ self.assertRaises(ValueError, Element, 'ha\x02ho')
+
+ def test_unicode_byte_invalid(self):
+ Element = self.etree.Element
+
+ a = Element('a')
+ self.assertRaises(ValueError, setattr, a, "text",
+ _str('ha\x07ho'))
+ self.assertRaises(ValueError, setattr, a, "text",
+ _str('ha\x02ho'))
+
+ self.assertRaises(ValueError, setattr, a, "tail",
+ _str('ha\x07ho'))
+ self.assertRaises(ValueError, setattr, a, "tail",
+ _str('ha\x02ho'))
+
+ self.assertRaises(ValueError, Element,
+ _str('ha\x07ho'))
+ self.assertRaises(ValueError, Element,
+ _str('ha\x02ho'))
+
+ def test_unicode_byte_invalid_sequence(self):
+ Element = self.etree.Element
+
+ a = Element('a')
+ self.assertRaises(ValueError, setattr, a, "text",
+ _str('ha\u1234\x07ho'))
+ self.assertRaises(ValueError, setattr, a, "text",
+ _str('ha\u1234\x02ho'))
+
+ self.assertRaises(ValueError, setattr, a, "tail",
+ _str('ha\u1234\x07ho'))
+ self.assertRaises(ValueError, setattr, a, "tail",
+ _str('ha\u1234\x02ho'))
+
+ self.assertRaises(ValueError, Element,
+ _str('ha\u1234\x07ho'))
+ self.assertRaises(ValueError, Element,
+ _str('ha\u1234\x02ho'))
+
+ def test_encoding_tostring_utf16(self):
+ # ElementTree fails to serialize this
+ tostring = self.etree.tostring
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+
+ result = tostring(a, encoding='UTF-16')
+ self.assertEqual(_bytes('<a><b></b><c></c></a>'),
+ canonicalize(result))
+
+ def test_tostring_none(self):
+ # ElementTree raises an AssertionError here
+ tostring = self.etree.tostring
+ self.assertRaises(TypeError, self.etree.tostring, None)
+
+ def test_tostring_pretty(self):
+ tostring = self.etree.tostring
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+
+ result = tostring(a)
+ self.assertEqual(result, _bytes("<a><b/><c/></a>"))
+
+ result = tostring(a, pretty_print=False)
+ self.assertEqual(result, _bytes("<a><b/><c/></a>"))
+
+ result = tostring(a, pretty_print=True)
+ self.assertEqual(result, _bytes("<a>\n  <b/>\n  <c/>\n</a>\n"))
+
+ def test_tostring_with_tail(self):
+ tostring = self.etree.tostring
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ a.tail = "aTAIL"
+ b = SubElement(a, 'b')
+ b.tail = "bTAIL"
+ c = SubElement(a, 'c')
+
+ result = tostring(a)
+ self.assertEqual(result, _bytes("<a><b/>bTAIL<c/></a>aTAIL"))
+
+ result = tostring(a, with_tail=False)
+ self.assertEqual(result, _bytes("<a><b/>bTAIL<c/></a>"))
+
+ result = tostring(a, with_tail=True)
+ self.assertEqual(result, _bytes("<a><b/>bTAIL<c/></a>aTAIL"))
+
+ def test_tostring_method_html_with_tail(self):
+ tostring = self.etree.tostring
+ html = self.etree.fromstring(
+ '<html><body>'
+ '<div><p>Some text<i>\r\n</i></p></div>\r\n'
+ '</body></html>',
+ parser=self.etree.HTMLParser())
+ self.assertEqual(html.tag, 'html')
+ div = html.find('.//div')
+ self.assertEqual(div.tail, '\r\n')
+ result = tostring(div, method='html')
+ self.assertEqual(
+ result,
+ _bytes("<div><p>Some text<i>\r\n</i></p></div>\r\n"))
+ result = tostring(div, method='html', with_tail=True)
+ self.assertEqual(
+ result,
+ _bytes("<div><p>Some text<i>\r\n</i></p></div>\r\n"))
+ result = tostring(div, method='html', with_tail=False)
+ self.assertEqual(
+ result,
+ _bytes("<div><p>Some text<i>\r\n</i></p></div>"))
+
+ def test_standalone(self):
+ tostring = self.etree.tostring
+ XML = self.etree.XML
+ ElementTree = self.etree.ElementTree
+ Element = self.etree.Element
+
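+ # docinfo.standalone is None unless the XML declaration explicitly says 'yes' or 'no'.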
+ tree = Element("root").getroottree()
+ self.assertEqual(None, tree.docinfo.standalone)
+
+ tree = XML(_bytes("<root/>")).getroottree()
+ self.assertEqual(None, tree.docinfo.standalone)
+
+ tree = XML(_bytes(
+ "<?xml version='1.0' encoding='ASCII' standalone='yes'?>\n<root/>"
+ )).getroottree()
+ self.assertEqual(True, tree.docinfo.standalone)
+
+ tree = XML(_bytes(
+ "<?xml version='1.0' encoding='ASCII' standalone='no'?>\n<root/>"
+ )).getroottree()
+ self.assertEqual(False, tree.docinfo.standalone)
+
+ def test_tostring_standalone(self):
+ tostring = self.etree.tostring
+ XML = self.etree.XML
+ ElementTree = self.etree.ElementTree
+
+ root = XML(_bytes("<root/>"))
+
+ tree = ElementTree(root)
+ self.assertEqual(None, tree.docinfo.standalone)
+
+ result = tostring(root, xml_declaration=True, encoding="ASCII")
+ self.assertEqual(result, _bytes(
+ "<?xml version='1.0' encoding='ASCII'?>\n<root/>"))
+
+ result = tostring(root, xml_declaration=True, encoding="ASCII",
+ standalone=True)
+ self.assertEqual(result, _bytes(
+ "<?xml version='1.0' encoding='ASCII' standalone='yes'?>\n<root/>"))
+
+ tree = ElementTree(XML(result))
+ self.assertEqual(True, tree.docinfo.standalone)
+
+ result = tostring(root, xml_declaration=True, encoding="ASCII",
+ standalone=False)
+ self.assertEqual(result, _bytes(
+ "<?xml version='1.0' encoding='ASCII' standalone='no'?>\n<root/>"))
+
+ tree = ElementTree(XML(result))
+ self.assertEqual(False, tree.docinfo.standalone)
+
+ def test_tostring_standalone_in_out(self):
+ tostring = self.etree.tostring
+ XML = self.etree.XML
+ ElementTree = self.etree.ElementTree
+
+ root = XML(_bytes(
+ "<?xml version='1.0' encoding='UTF-8' standalone='yes'?>\n<root/>"))
+
+ tree = ElementTree(root)
+ self.assertEqual(True, tree.docinfo.standalone)
+
+ result = tostring(root, xml_declaration=True, encoding="ASCII")
+ self.assertEqual(result, _bytes(
+ "<?xml version='1.0' encoding='ASCII'?>\n<root/>"))
+
+ result = tostring(root, xml_declaration=True, encoding="ASCII",
+ standalone=True)
+ self.assertEqual(result, _bytes(
+ "<?xml version='1.0' encoding='ASCII' standalone='yes'?>\n<root/>"))
+
+ def test_tostring_method_text_encoding(self):
+ tostring = self.etree.tostring
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ a.text = "A"
+ a.tail = "tail"
+ b = SubElement(a, 'b')
+ b.text = "B"
+ b.tail = _str("Søk på nettet")
+ c = SubElement(a, 'c')
+ c.text = "C"
+
+ result = tostring(a, method="text", encoding="UTF-16")
+
+ self.assertEqual(_str('ABSøk på nettetCtail').encode("UTF-16"),
+ result)
+
+ def test_tostring_method_text_unicode(self):
+ tostring = self.etree.tostring
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ a.text = _str('Søk på nettetA')
+ a.tail = "tail"
+ b = SubElement(a, 'b')
+ b.text = "B"
+ b.tail = _str('Søk på nettetB')
+ c = SubElement(a, 'c')
+ c.text = "C"
+
+ self.assertRaises(UnicodeEncodeError,
+ tostring, a, method="text")
+
+ self.assertEqual(
+ _str('Søk på nettetABSøk på nettetBCtail').encode('utf-8'),
+ tostring(a, encoding="UTF-8", method="text"))
+
+ def test_tounicode(self):
+ tounicode = self.etree.tounicode
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+
+ self.assertTrue(isinstance(tounicode(a), _unicode))
+ self.assertEqual(_bytes('<a><b></b><c></c></a>'),
+ canonicalize(tounicode(a)))
+
+ def test_tounicode_element(self):
+ tounicode = self.etree.tounicode
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(c, 'd')
+ self.assertTrue(isinstance(tounicode(b), _unicode))
+ self.assertTrue(isinstance(tounicode(c), _unicode))
+ self.assertEqual(_bytes('<b></b>'),
+ canonicalize(tounicode(b)))
+ self.assertEqual(_bytes('<c><d></d></c>'),
+ canonicalize(tounicode(c)))
+
+ def test_tounicode_none(self):
+ tounicode = self.etree.tounicode
+ self.assertRaises(TypeError, self.etree.tounicode, None)
+
+ def test_tounicode_element_tail(self):
+ tounicode = self.etree.tounicode
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(c, 'd')
+ b.tail = 'Foo'
+
+ self.assertTrue(isinstance(tounicode(b), _unicode))
+ self.assertTrue(tounicode(b) == '<b/>Foo' or
+ tounicode(b) == '<b />Foo')
+
+ def test_tounicode_pretty(self):
+ tounicode = self.etree.tounicode
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+
+ result = tounicode(a)
+ self.assertEqual(result, "<a><b/><c/></a>")
+
+ result = tounicode(a, pretty_print=False)
+ self.assertEqual(result, "<a><b/><c/></a>")
+
+ result = tounicode(a, pretty_print=True)
+ self.assertEqual(result, "<a>\n  <b/>\n  <c/>\n</a>\n")
+
+ def test_tostring_unicode(self):
+ tostring = self.etree.tostring
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+
+ self.assertTrue(isinstance(tostring(a, encoding=_unicode), _unicode))
+ self.assertEqual(_bytes('<a><b></b><c></c></a>'),
+ canonicalize(tostring(a, encoding=_unicode)))
+
+ def test_tostring_unicode_element(self):
+ tostring = self.etree.tostring
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(c, 'd')
+ self.assertTrue(isinstance(tostring(b, encoding=_unicode), _unicode))
+ self.assertTrue(isinstance(tostring(c, encoding=_unicode), _unicode))
+ self.assertEqual(_bytes('<b></b>'),
+ canonicalize(tostring(b, encoding=_unicode)))
+ self.assertEqual(_bytes('<c><d></d></c>'),
+ canonicalize(tostring(c, encoding=_unicode)))
+
+ def test_tostring_unicode_none(self):
+ tostring = self.etree.tostring
+ self.assertRaises(TypeError, self.etree.tostring,
+ None, encoding=_unicode)
+
+ def test_tostring_unicode_element_tail(self):
+ tostring = self.etree.tostring
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+ d = SubElement(c, 'd')
+ b.tail = 'Foo'
+
+ self.assertTrue(isinstance(tostring(b, encoding=_unicode), _unicode))
+ self.assertTrue(tostring(b, encoding=_unicode) == '<b/>Foo' or
+ tostring(b, encoding=_unicode) == '<b />Foo')
+
+ def test_tostring_unicode_pretty(self):
+ tostring = self.etree.tostring
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+
+ a = Element('a')
+ b = SubElement(a, 'b')
+ c = SubElement(a, 'c')
+
+ result = tostring(a, encoding=_unicode)
+ self.assertEqual(result, "<a><b/><c/></a>")
+
+ result = tostring(a, encoding=_unicode, pretty_print=False)
+ self.assertEqual(result, "<a><b/><c/></a>")
+
+ result = tostring(a, encoding=_unicode, pretty_print=True)
+ self.assertEqual(result, "<a>\n  <b/>\n  <c/>\n</a>\n")
+
+ def test_pypy_proxy_collect(self):
+ root = etree.Element('parent')
+ etree.SubElement(root, 'child')
+
+ self.assertEqual(len(root), 1)
+ self.assertEqual(root[0].tag, 'child')
+
+ # in PyPy, GC used to kill the Python proxy instance without cleanup
+ gc.collect()
+ self.assertEqual(len(root), 1)
+ self.assertEqual(root[0].tag, 'child')
+
+ def test_element_refcycle(self):
+ class SubEl(etree.ElementBase):
+ pass
+
+ el1 = SubEl()
+ el2 = SubEl()
+ self.assertEqual('SubEl', el1.tag)
+ self.assertEqual('SubEl', el2.tag)
+ el1.other = el2
+ el2.other = el1
+
+ del el1, el2
+ gc.collect()
+ # not really testing anything here, but it shouldn't crash
+
+ def test_proxy_collect_siblings(self):
+ root = etree.Element('parent')
+ c1 = etree.SubElement(root, 'child1')
+ c2 = etree.SubElement(root, 'child2')
+
+ root.remove(c1)
+ root.remove(c2)
+ c1.addnext(c2)
+ del c1
+ # trigger deallocation attempt of c1
+ c2.getprevious()
+ # make sure it wasn't deallocated
+ self.assertEqual('child1', c2.getprevious().tag)
+
+ def test_proxy_collect_siblings_text(self):
+ root = etree.Element('parent')
+ c1 = etree.SubElement(root, 'child1')
+ c2 = etree.SubElement(root, 'child2')
+
+ root.remove(c1)
+ root.remove(c2)
+ c1.addnext(c2)
+ c1.tail = 'abc'
+ c2.tail = 'xyz'
+ del c1
+ # trigger deallocation attempt of c1
+ c2.getprevious()
+ # make sure it wasn't deallocated
+ self.assertEqual('child1', c2.getprevious().tag)
+ self.assertEqual('abc', c2.getprevious().tail)
+
+ # helper methods
+
+ def _writeElement(self, element, encoding='us-ascii', compression=0):
+ """Write out element for comparison.
+ """
+ ElementTree = self.etree.ElementTree
+ f = BytesIO()
+ tree = ElementTree(element=element)
+ tree.write(f, encoding=encoding, compression=compression)
+ data = f.getvalue()
+ if compression:
+ data = zlib.decompress(data)
+ return canonicalize(data)
+
+
+class _XIncludeTestCase(HelperTestCase):
+ def test_xinclude_text(self):
+ filename = fileInTestDir('test_broken.xml')
+ root = etree.XML(_bytes('''\
+ <doc xmlns:xi="http://www.w3.org/2001/XInclude">
+ <xi:include href="%s" parse="text"/>
+ </doc>
+ ''' % path2url(filename)))
+ old_text = root.text
+ content = read_file(filename)
+ old_tail = root[0].tail
+
+ self.include( etree.ElementTree(root) )
+ self.assertEqual(old_text + content + old_tail,
+ root.text)
+
+ def test_xinclude(self):
+ tree = etree.parse(fileInTestDir('include/test_xinclude.xml'))
+ self.assertNotEqual(
+ 'a',
+ tree.getroot()[1].tag)
+ # process xincludes
+ self.include( tree )
+ # check whether we find it replaced with included data
+ self.assertEqual(
+ 'a',
+ tree.getroot()[1].tag)
+
+ def test_xinclude_resolver(self):
+ class res(etree.Resolver):
+ include_text = read_file(fileInTestDir('test.xml'))
+ called = {}
+ def resolve(self, url, id, context):
+ if url.endswith(".dtd"):
+ self.called["dtd"] = True
+ return self.resolve_filename(
+ fileInTestDir('test.dtd'), context)
+ elif url.endswith("test_xinclude.xml"):
+ self.called["input"] = True
+ return None # delegate to default resolver
+ else:
+ self.called["include"] = True
+ return self.resolve_string(self.include_text, context)
+
+ res_instance = res()
+ parser = etree.XMLParser(load_dtd = True)
+ parser.resolvers.add(res_instance)
+
+ tree = etree.parse(fileInTestDir('include/test_xinclude.xml'),
+ parser = parser)
+
+ self.include(tree)
+
+ called = list(res_instance.called.items())
+ called.sort()
+ self.assertEqual(
+ [("dtd", True), ("include", True), ("input", True)],
+ called)
+
+ def test_xinclude_resolver_recursive(self):
+ data = textwrap.dedent('''
+ <doc xmlns:xi="http://www.w3.org/2001/XInclude">
+ <foo/>
+ <xi:include href="./test.xml" />
+ </doc>
+ ''')
+
+ class Resolver(etree.Resolver):
+ called = {}
+
+ def resolve(self, url, id, context):
+ if url.endswith("test_xinclude.xml"):
+ assert not self.called.get("input")
+ self.called["input"] = True
+ return None # delegate to default resolver
+ elif url.endswith('/test5.xml'):
+ assert not self.called.get("DONE")
+ self.called["DONE"] = True
+ return self.resolve_string('<DONE/>', context)
+ else:
+ _, filename = url.rsplit('/', 1)
+ assert not self.called.get(filename)
+ self.called[filename] = True
+ next_data = data.replace(
+ 'test.xml', 'test%d.xml' % len(self.called))
+ return self.resolve_string(next_data, context)
+
+ res_instance = Resolver()
+ parser = etree.XMLParser(load_dtd=True)
+ parser.resolvers.add(res_instance)
+
+ tree = etree.parse(fileInTestDir('include/test_xinclude.xml'),
+ parser=parser)
+
+ self.include(tree)
+
+ called = list(res_instance.called.items())
+ called.sort()
+ self.assertEqual(
+ [("DONE", True), ("input", True), ("test.xml", True),
+ ("test2.xml", True), ("test3.xml", True), ("test4.xml", True)],
+ called)
+
+
+class ETreeXIncludeTestCase(_XIncludeTestCase):
+ def include(self, tree):
+ tree.xinclude()
+
+
+class ElementIncludeTestCase(_XIncludeTestCase):
+ from lxml import ElementInclude
+
+ def include(self, tree, loader=None, max_depth=None):
+ self.ElementInclude.include(tree.getroot(), loader=loader, max_depth=max_depth)
+
+ XINCLUDE = {}
+
+ XINCLUDE["Recursive1.xml"] = """\
+ <?xml version='1.0'?>
+ <document xmlns:xi="http://www.w3.org/2001/XInclude">
+ <p>The following is the source code of Recursive2.xml:</p>
+ <xi:include href="Recursive2.xml"/>
+ </document>
+ """
+
+ XINCLUDE["Recursive2.xml"] = """\
+ <?xml version='1.0'?>
+ <document xmlns:xi="http://www.w3.org/2001/XInclude">
+ <p>The following is the source code of Recursive3.xml:</p>
+ <xi:include href="Recursive3.xml"/>
+ </document>
+ """
+
+ XINCLUDE["Recursive3.xml"] = """\
+ <?xml version='1.0'?>
+ <document xmlns:xi="http://www.w3.org/2001/XInclude">
+ <p>The following is the source code of Recursive1.xml:</p>
+ <xi:include href="Recursive1.xml"/>
+ </document>
+ """
+
+ XINCLUDE["NonRecursive1.xml"] = """\
+ <?xml version='1.0'?>
+ <document xmlns:xi="http://www.w3.org/2001/XInclude">
+ <p>The following is multiple times the source code of NonRecursive3.xml:</p>
+ <xi:include href="NonRecursive3.xml"/>
+ <xi:include href="NonRecursive3.xml"/>
+ <p>The following is multiple times the source code of Leaf.xml:</p>
+ <xi:include href="Leaf.xml"/>
+ <xi:include href="Leaf.xml"/>
+ <xi:include href="Leaf.xml"/>
+ <p>One more time the source code of NonRecursive3.xml:</p>
+ <xi:include href="NonRecursive3.xml"/>
+ </document>
+ """
+
+ XINCLUDE["NonRecursive2.xml"] = """\
+ <?xml version='1.0'?>
+ <document xmlns:xi="http://www.w3.org/2001/XInclude">
+ <p>The following is multiple times the source code of NonRecursive3.xml:</p>
+ <xi:include href="NonRecursive3.xml"/>
+ <xi:include href="NonRecursive3.xml"/>
+ </document>
+ """
+
+ XINCLUDE["NonRecursive3.xml"] = """\
+ <?xml version='1.0'?>
+ <document xmlns:xi="http://www.w3.org/2001/XInclude">
+ <p>The following is multiple times the source code of Leaf.xml:</p>
+ <xi:include href="Leaf.xml"/>
+ <xi:include href="Leaf.xml"/>
+ </document>
+ """
+
+ XINCLUDE["Leaf.xml"] = """\
+ <?xml version='1.0'?>
+ <document xmlns:xi="http://www.w3.org/2001/XInclude">
+ <p>No further includes</p>
+ </document>
+ """
+
+ def xinclude_loader(self, href, parse="xml", encoding=None):
+ try:
+ data = textwrap.dedent(self.XINCLUDE[href])
+ except KeyError:
+ raise OSError("resource not found")
+ if parse == "xml":
+ data = etree.fromstring(data)
+ return data
+
+ def test_xinclude_failures(self):
+ # Test infinitely recursive includes.
+ document = self.xinclude_loader("Recursive1.xml").getroottree()
+ with self.assertRaises(self.ElementInclude.FatalIncludeError) as cm:
+ self.include(document, self.xinclude_loader)
+ self.assertEqual(str(cm.exception),
+ "recursive include of 'Recursive2.xml' detected")
+
+ # Test 'max_depth' limitation.
+ document = self.xinclude_loader("Recursive1.xml").getroottree()
+ with self.assertRaises(self.ElementInclude.FatalIncludeError) as cm:
+ self.include(document, self.xinclude_loader, max_depth=None)
+ self.assertEqual(str(cm.exception),
+ "recursive include of 'Recursive2.xml' detected")
+
+ document = self.xinclude_loader("Recursive1.xml").getroottree()
+ with self.assertRaises(self.ElementInclude.LimitedRecursiveIncludeError) as cm:
+ self.include(document, self.xinclude_loader, max_depth=0)
+ self.assertEqual(str(cm.exception),
+ "maximum xinclude depth reached when including file Recursive2.xml")
+
+ document = self.xinclude_loader("Recursive1.xml").getroottree()
+ with self.assertRaises(self.ElementInclude.LimitedRecursiveIncludeError) as cm:
+ self.include(document, self.xinclude_loader, max_depth=1)
+ self.assertEqual(str(cm.exception),
+ "maximum xinclude depth reached when including file Recursive3.xml")
+
+ document = self.xinclude_loader("Recursive1.xml").getroottree()
+ with self.assertRaises(self.ElementInclude.LimitedRecursiveIncludeError) as cm:
+ self.include(document, self.xinclude_loader, max_depth=2)
+ self.assertEqual(str(cm.exception),
+ "maximum xinclude depth reached when including file Recursive1.xml")
+
+ document = self.xinclude_loader("Recursive1.xml").getroottree()
+ with self.assertRaises(self.ElementInclude.FatalIncludeError) as cm:
+ self.include(document, self.xinclude_loader, max_depth=3)
+ self.assertEqual(str(cm.exception),
+ "recursive include of 'Recursive2.xml' detected")
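+ # Note on the expected errors above: the include chain is Recursive1.xml
+ # -> Recursive2.xml -> Recursive3.xml -> Recursive1.xml, so with
+ # max_depth=N the LimitedRecursiveIncludeError names the file whose
+ # inclusion would reach depth N+1, while max_depth=None (or any depth
+ # large enough to close the cycle) reports the recursion itself.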
+
+ def test_multiple_include_of_same_file(self):
+ # Test that including the same file multiple times on the same level
+ # is not detected as a recursive include.
+ document = self.xinclude_loader("NonRecursive3.xml").getroottree()
+ self.include(document, self.xinclude_loader)
+
+ # same but for more than one level
+ document = self.xinclude_loader("NonRecursive1.xml").getroottree()
+ self.include(document, self.xinclude_loader)
+
+ # same but no Leaf.xml in top-level file
+ document = self.xinclude_loader("NonRecursive2.xml").getroottree()
+ self.include(document, self.xinclude_loader)
+
+
+class ETreeC14NTestCase(HelperTestCase):
+ def test_c14n(self):
+ tree = self.parse(_bytes('<a><b/></a>'))
+ f = BytesIO()
+ tree.write_c14n(f)
+ s = f.getvalue()
+ self.assertEqual(_bytes('<a><b></b></a>'),
+ s)
+
+ def test_c14n_gzip(self):
+ tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
+ f = BytesIO()
+ tree.write_c14n(f, compression=9)
+ with gzip.GzipFile(fileobj=BytesIO(f.getvalue())) as gzfile:
+ s = gzfile.read()
+ self.assertEqual(_bytes('<a>'+'<b></b>'*200+'</a>'),
+ s)
+
+ def test_c14n_file(self):
+ tree = self.parse(_bytes('<a><b/></a>'))
+ with tmpfile() as filename:
+ tree.write_c14n(filename)
+ data = read_file(filename, 'rb')
+ self.assertEqual(_bytes('<a><b></b></a>'),
+ data)
+
+ def test_c14n_file_gzip(self):
+ tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
+ with tmpfile() as filename:
+ tree.write_c14n(filename, compression=9)
+ with gzip.open(filename, 'rb') as f:
+ data = f.read()
+ self.assertEqual(_bytes('<a>'+'<b></b>'*200+'</a>'),
+ data)
+
+ def test_c14n2_file_gzip(self):
+ tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
+ with tmpfile() as filename:
+ tree.write(filename, method='c14n2', compression=9)
+ with gzip.open(filename, 'rb') as f:
+ data = f.read()
+ self.assertEqual(_bytes('<a>'+'<b></b>'*200+'</a>'),
+ data)
+
+ def test_c14n2_with_text(self):
+ tree = self.parse(
+ b'<?xml version="1.0"?> <a> abc \n <b> btext </b> btail <c/> ctail </a> ')
+ f = BytesIO()
+ tree.write(f, method='c14n2')
+ s = f.getvalue()
+ self.assertEqual(b'<a> abc \n <b> btext </b> btail <c></c> ctail </a>',
+ s)
+
+ f = BytesIO()
+ tree.write(f, method='c14n2', strip_text=True)
+ s = f.getvalue()
+ self.assertEqual(b'<a>abc<b>btext</b>btail<c></c>ctail</a>',
+ s)
+
+ def test_c14n_with_comments(self):
+ tree = self.parse(_bytes('<!--hi--><a><!--ho--><b/></a><!--hu-->'))
+ f = BytesIO()
+ tree.write_c14n(f)
+ s = f.getvalue()
+ self.assertEqual(_bytes('<!--hi-->\n<a><!--ho--><b></b></a>\n<!--hu-->'),
+ s)
+ f = BytesIO()
+ tree.write_c14n(f, with_comments=True)
+ s = f.getvalue()
+ self.assertEqual(_bytes('<!--hi-->\n<a><!--ho--><b></b></a>\n<!--hu-->'),
+ s)
+ f = BytesIO()
+ tree.write_c14n(f, with_comments=False)
+ s = f.getvalue()
+ self.assertEqual(_bytes('<a><b></b></a>'),
+ s)
+
+ def test_c14n2_with_comments(self):
+ tree = self.parse(b'<!--hi--> <a> <!-- ho --> <b/> </a> <!-- hu -->')
+ self.assertEqual(
+ b'<!--hi-->\n<a> <!-- ho --> <b></b> </a>\n<!-- hu -->',
+ etree.tostring(tree, method='c14n2'))
+
+ self.assertEqual(
+ b'<!--hi-->\n<a> <!-- ho --> <b></b> </a>\n<!-- hu -->',
+ etree.tostring(tree, method='c14n2', with_comments=True))
+
+ self.assertEqual(
+ b'<a> <b></b> </a>',
+ etree.tostring(tree, method='c14n2', with_comments=False))
+
+ def test_c14n2_with_comments_strip_text(self):
+ tree = self.parse(b'<!--hi--> <a> <!-- ho --> <b/> </a> <!-- hu -->')
+ self.assertEqual(
+ b'<!--hi-->\n<a><!-- ho --><b></b></a>\n<!-- hu -->',
+ etree.tostring(tree, method='c14n2', with_comments=True, strip_text=True))
+ self.assertEqual(
+ b'<a><b></b></a>',
+ etree.tostring(tree, method='c14n2', with_comments=False, strip_text=True))
+
+ def test_c14n_tostring_with_comments(self):
+ tree = self.parse(_bytes('<!--hi--><a><!--ho--><b/></a><!--hu-->'))
+ s = etree.tostring(tree, method='c14n')
+ self.assertEqual(_bytes('<!--hi-->\n<a><!--ho--><b></b></a>\n<!--hu-->'),
+ s)
+ s = etree.tostring(tree, method='c14n', with_comments=True)
+ self.assertEqual(_bytes('<!--hi-->\n<a><!--ho--><b></b></a>\n<!--hu-->'),
+ s)
+ s = etree.tostring(tree, method='c14n', with_comments=False)
+ self.assertEqual(_bytes('<a><b></b></a>'),
+ s)
+
+ def test_c14n2_tostring_with_comments(self):
+ tree = self.parse(b'<!--hi--><a><!--ho--><b/></a><!--hu-->')
+ s = etree.tostring(tree, method='c14n2')
+ self.assertEqual(b'<!--hi-->\n<a><!--ho--><b></b></a>\n<!--hu-->',
+ s)
+ s = etree.tostring(tree, method='c14n2', with_comments=True)
+ self.assertEqual(b'<!--hi-->\n<a><!--ho--><b></b></a>\n<!--hu-->',
+ s)
+ s = etree.tostring(tree, method='c14n2', with_comments=False)
+ self.assertEqual(b'<a><b></b></a>',
+ s)
+
+ def test_c14n_element_tostring_with_comments(self):
+ tree = self.parse(_bytes('<!--hi--><a><!--ho--><b/></a><!--hu-->'))
+ s = etree.tostring(tree.getroot(), method='c14n')
+ self.assertEqual(_bytes('<a><!--ho--><b></b></a>'),
+ s)
+ s = etree.tostring(tree.getroot(), method='c14n', with_comments=True)
+ self.assertEqual(_bytes('<a><!--ho--><b></b></a>'),
+ s)
+ s = etree.tostring(tree.getroot(), method='c14n', with_comments=False)
+ self.assertEqual(_bytes('<a><b></b></a>'),
+ s)
+
+ def test_c14n_exclusive(self):
+ tree = self.parse(_bytes(
+ '<a xmlns="http://abc" xmlns:y="http://bcd" xmlns:z="http://cde"><z:b/></a>'))
+ f = BytesIO()
+ tree.write_c14n(f)
+ s = f.getvalue()
+ self.assertEqual(_bytes('<a xmlns="http://abc" xmlns:y="http://bcd" xmlns:z="http://cde"><z:b></z:b></a>'),
+ s)
+ f = BytesIO()
+ tree.write_c14n(f, exclusive=False)
+ s = f.getvalue()
+ self.assertEqual(_bytes('<a xmlns="http://abc" xmlns:y="http://bcd" xmlns:z="http://cde"><z:b></z:b></a>'),
+ s)
+ f = BytesIO()
+ tree.write_c14n(f, exclusive=True)
+ s = f.getvalue()
+ self.assertEqual(_bytes('<a xmlns="http://abc"><z:b xmlns:z="http://cde"></z:b></a>'),
+ s)
+
+ f = BytesIO()
+ tree.write_c14n(f, exclusive=True, inclusive_ns_prefixes=['z'])
+ s = f.getvalue()
+ self.assertEqual(_bytes('<a xmlns="http://abc" xmlns:z="http://cde"><z:b></z:b></a>'),
+ s)
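+ # Note (descriptive comment): exclusive C14N serialises a namespace
+ # declaration only on the element that actually uses it, which is why
+ # xmlns:y drops out and xmlns:z moves onto <z:b> above, while
+ # inclusive_ns_prefixes forces the listed prefixes to be emitted on the
+ # output root as in inclusive C14N.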
+
+ def test_c14n_tostring_exclusive(self):
+ tree = self.parse(_bytes(
+ '<a xmlns="http://abc" xmlns:y="http://bcd" xmlns:z="http://cde"><z:b/></a>'))
+ s = etree.tostring(tree, method='c14n')
+ self.assertEqual(_bytes('<a xmlns="http://abc" xmlns:y="http://bcd" xmlns:z="http://cde"><z:b></z:b></a>'),
+ s)
+ s = etree.tostring(tree, method='c14n', exclusive=False)
+ self.assertEqual(_bytes('<a xmlns="http://abc" xmlns:y="http://bcd" xmlns:z="http://cde"><z:b></z:b></a>'),
+ s)
+ s = etree.tostring(tree, method='c14n', exclusive=True)
+ self.assertEqual(_bytes('<a xmlns="http://abc"><z:b xmlns:z="http://cde"></z:b></a>'),
+ s)
+
+ s = etree.tostring(tree, method='c14n', exclusive=True, inclusive_ns_prefixes=['y'])
+ self.assertEqual(_bytes('<a xmlns="http://abc" xmlns:y="http://bcd"><z:b xmlns:z="http://cde"></z:b></a>'),
+ s)
+
+ def test_c14n_element_tostring_exclusive(self):
+ tree = self.parse(_bytes(
+ '<a xmlns="http://abc" xmlns:y="http://bcd" xmlns:z="http://cde"><z:b/></a>'))
+ s = etree.tostring(tree.getroot(), method='c14n')
+ self.assertEqual(_bytes('<a xmlns="http://abc" xmlns:y="http://bcd" xmlns:z="http://cde"><z:b></z:b></a>'),
+ s)
+ s = etree.tostring(tree.getroot(), method='c14n', exclusive=False)
+ self.assertEqual(_bytes('<a xmlns="http://abc" xmlns:y="http://bcd" xmlns:z="http://cde"><z:b></z:b></a>'),
+ s)
+ s = etree.tostring(tree.getroot(), method='c14n', exclusive=True)
+ self.assertEqual(_bytes('<a xmlns="http://abc"><z:b xmlns:z="http://cde"></z:b></a>'),
+ s)
+
+ s = etree.tostring(tree.getroot()[0], method='c14n', exclusive=False)
+ self.assertEqual(_bytes('<z:b xmlns="http://abc" xmlns:y="http://bcd" xmlns:z="http://cde"></z:b>'),
+ s)
+ s = etree.tostring(tree.getroot()[0], method='c14n', exclusive=True)
+ self.assertEqual(_bytes('<z:b xmlns:z="http://cde"></z:b>'),
+ s)
+
+ s = etree.tostring(tree.getroot()[0], method='c14n', exclusive=True, inclusive_ns_prefixes=['y'])
+ self.assertEqual(_bytes('<z:b xmlns:y="http://bcd" xmlns:z="http://cde"></z:b>'),
+ s)
+
+ def test_c14n_tostring_inclusive_ns_prefixes(self):
+ """Regression test for memory allocation issues when using 3+ inclusive namespace prefixes."""
+ tree = self.parse(_bytes(
+ '<a xmlns:x="http://abc" xmlns:y="http://bcd" xmlns:z="http://cde"><z:b/></a>'))
+
+ s = etree.tostring(tree, method='c14n', exclusive=True, inclusive_ns_prefixes=['x', 'y', 'z'])
+ self.assertEqual(_bytes('<a xmlns:x="http://abc" xmlns:y="http://bcd" xmlns:z="http://cde"><z:b></z:b></a>'),
+ s)
+
+
+class ETreeWriteTestCase(HelperTestCase):
+ def test_write(self):
+ tree = self.parse(_bytes('<a><b/></a>'))
+ f = BytesIO()
+ tree.write(f)
+ s = f.getvalue()
+ self.assertEqual(_bytes('<a><b/></a>'),
+ s)
+
+ def test_write_doctype(self):
+ tree = self.parse(_bytes('<a><b/></a>'))
+ f = BytesIO()
+ tree.write(f, doctype='HUHU')
+ s = f.getvalue()
+ self.assertEqual(_bytes('HUHU\n<a><b/></a>'),
+ s)
+
+ def test_write_gzip(self):
+ tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
+ f = BytesIO()
+ tree.write(f, compression=9)
+ with gzip.GzipFile(fileobj=BytesIO(f.getvalue())) as gzfile:
+ s = gzfile.read()
+ self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'),
+ s)
+
+ def test_write_gzip_doctype(self):
+ tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
+ f = BytesIO()
+ tree.write(f, compression=9, doctype='<!DOCTYPE a>')
+ with gzip.GzipFile(fileobj=BytesIO(f.getvalue())) as gzfile:
+ s = gzfile.read()
+ self.assertEqual(_bytes('<!DOCTYPE a>\n<a>'+'<b/>'*200+'</a>'),
+ s)
+
+ def test_write_gzip_level(self):
+ tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
+ f = BytesIO()
+ tree.write(f, compression=0)
+ s0 = f.getvalue()
+
+ f = BytesIO()
+ tree.write(f)
+ self.assertEqual(f.getvalue(), s0)
+
+ f = BytesIO()
+ tree.write(f, compression=1)
+ s = f.getvalue()
+ self.assertTrue(len(s) <= len(s0))
+ with gzip.GzipFile(fileobj=BytesIO(s)) as gzfile:
+ s1 = gzfile.read()
+
+ f = BytesIO()
+ tree.write(f, compression=9)
+ s = f.getvalue()
+ self.assertTrue(len(s) <= len(s0))
+ with gzip.GzipFile(fileobj=BytesIO(s)) as gzfile:
+ s9 = gzfile.read()
+
+ self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'),
+ s0)
+ self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'),
+ s1)
+ self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'),
+ s9)
+
+ def test_write_file(self):
+ tree = self.parse(_bytes('<a><b/></a>'))
+ with tmpfile() as filename:
+ tree.write(filename)
+ data = read_file(filename, 'rb')
+ self.assertEqual(_bytes('<a><b/></a>'),
+ data)
+
+ def test_write_file_gzip(self):
+ tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
+ with tmpfile() as filename:
+ tree.write(filename, compression=9)
+ with gzip.open(filename, 'rb') as f:
+ data = f.read()
+ self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'),
+ data)
+
+ def test_write_file_gzip_parse(self):
+ tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
+ with tmpfile() as filename:
+ tree.write(filename, compression=9)
+ data = etree.tostring(etree.parse(filename))
+ self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'),
+ data)
+
+ def test_write_file_gzipfile_parse(self):
+ tree = self.parse(_bytes('<a>'+'<b/>'*200+'</a>'))
+ with tmpfile() as filename:
+ tree.write(filename, compression=9)
+ with gzip.GzipFile(filename) as f:
+ data = etree.tostring(etree.parse(f))
+ self.assertEqual(_bytes('<a>'+'<b/>'*200+'</a>'),
+ data)
+
+ def test_write_file_url(self):
+ xml = _bytes('<a>'+'<b/>'*200+'</a>')
+ tree = self.parse(xml)
+ with tmpfile(prefix="p+%20", suffix=".xml") as filename:
+ url = 'file://' + (filename if sys.platform != 'win32'
+ else '/' + filename.replace('\\', '/'))
+ tree.write(url)
+ data = read_file(filename, 'rb').replace(_bytes('\n'), _bytes(''))
+ self.assertEqual(data, xml)
+
+
+class ETreeErrorLogTest(HelperTestCase):
+ etree = etree
+
+ def test_parse_error_logging(self):
+ parse = self.etree.parse
+ f = BytesIO('<a><b></c></b></a>')
+ self.etree.clear_error_log()
+ try:
+ parse(f)
+ logs = None
+ except SyntaxError:
+ e = sys.exc_info()[1]
+ logs = e.error_log
+ f.close()
+ self.assertTrue([ log for log in logs
+ if 'mismatch' in log.message ])
+ self.assertTrue([ log for log in logs
+ if 'PARSER' in log.domain_name])
+ self.assertTrue([ log for log in logs
+ if 'ERR_TAG_NAME_MISMATCH' in log.type_name ])
+ self.assertTrue([ log for log in logs
+ if 1 == log.line ])
+ self.assertTrue([ log for log in logs
+ if 15 == log.column ])
+
+ def _test_python_error_logging(self):
+ """This can't really be tested as long as there isn't a way to
+ reset the logging setup ...
+ """
+ parse = self.etree.parse
+
+ messages = []
+ class Logger(self.etree.PyErrorLog):
+ def log(self, entry, message, *args):
+ messages.append(message)
+
+ self.etree.use_global_python_log(Logger())
+ f = BytesIO('<a><b></c></b></a>')
+ try:
+ parse(f)
+ except SyntaxError:
+ pass
+ f.close()
+
+ self.assertTrue([ message for message in messages
+ if 'mismatch' in message ])
+ self.assertTrue([ message for message in messages
+ if ':PARSER:' in message])
+ self.assertTrue([ message for message in messages
+ if ':ERR_TAG_NAME_MISMATCH:' in message ])
+ self.assertTrue([ message for message in messages
+ if ':1:15:' in message ])
+
+
+class XMLPullParserTest(unittest.TestCase):
+ etree = etree
+
+ def assert_event_tags(self, events, expected):
+ self.assertEqual([(action, elem.tag) for action, elem in events],
+ expected)
+
+ def test_pull_from_simple_target(self):
+ class Target(object):
+ def start(self, tag, attrib):
+ return 'start(%s)' % tag
+ def end(self, tag):
+ return 'end(%s)' % tag
+ def close(self):
+ return 'close()'
+
+ parser = self.etree.XMLPullParser(target=Target())
+ events = parser.read_events()
+
+ parser.feed('<root><element>')
+ self.assertFalse(list(events))
+ self.assertFalse(list(events))
+ parser.feed('</element><child>')
+ self.assertEqual([('end', 'end(element)')], list(events))
+ parser.feed('</child>')
+ self.assertEqual([('end', 'end(child)')], list(events))
+ parser.feed('</root>')
+ self.assertEqual([('end', 'end(root)')], list(events))
+ self.assertFalse(list(events))
+ self.assertEqual('close()', parser.close())
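+ # Note (descriptive comment): with a custom target, read_events() yields
+ # (event_name, value) pairs where 'value' is whatever the target handler
+ # returned for that event, and parser.close() returns the result of
+ # target.close() -- as the assertions above exercise.  A minimal sketch,
+ # assuming the Target class defined in this test:
+ #
+ #     parser = self.etree.XMLPullParser(target=Target())
+ #     parser.feed('<root/>')
+ #     list(parser.read_events())   # [('end', 'end(root)')]
+ #     parser.close()               # 'close()'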
+
+ def test_pull_from_simple_target_start_end(self):
+ class Target(object):
+ def start(self, tag, attrib):
+ return 'start(%s)' % tag
+ def end(self, tag):
+ return 'end(%s)' % tag
+ def close(self):
+ return 'close()'
+
+ parser = self.etree.XMLPullParser(
+ ['start', 'end'], target=Target())
+ events = parser.read_events()
+
+ parser.feed('<root><element>')
+ self.assertEqual(
+ [('start', 'start(root)'), ('start', 'start(element)')],
+ list(events))
+ self.assertFalse(list(events))
+ parser.feed('</element><child>')
+ self.assertEqual(
+ [('end', 'end(element)'), ('start', 'start(child)')],
+ list(events))
+ parser.feed('</child>')
+ self.assertEqual(
+ [('end', 'end(child)')],
+ list(events))
+ parser.feed('</root>')
+ self.assertEqual(
+ [('end', 'end(root)')],
+ list(events))
+ self.assertFalse(list(events))
+ self.assertEqual('close()', parser.close())
+
+ def test_pull_from_tree_builder(self):
+ parser = self.etree.XMLPullParser(
+ ['start', 'end'], target=etree.TreeBuilder())
+ events = parser.read_events()
+
+ parser.feed('<root><element>')
+ self.assert_event_tags(
+ events, [('start', 'root'), ('start', 'element')])
+ self.assertFalse(list(events))
+ parser.feed('</element><child>')
+ self.assert_event_tags(
+ events, [('end', 'element'), ('start', 'child')])
+ parser.feed('</child>')
+ self.assert_event_tags(
+ events, [('end', 'child')])
+ parser.feed('</root>')
+ self.assert_event_tags(
+ events, [('end', 'root')])
+ self.assertFalse(list(events))
+ root = parser.close()
+ self.assertEqual('root', root.tag)
+
+ def test_pull_from_tree_builder_subclass(self):
+ class Target(etree.TreeBuilder):
+ def end(self, tag):
+ el = super(Target, self).end(tag)
+ el.tag += '-huhu'
+ return el
+
+ parser = self.etree.XMLPullParser(
+ ['start', 'end'], target=Target())
+ events = parser.read_events()
+
+ parser.feed('<root><element>')
+ self.assert_event_tags(
+ events, [('start', 'root'), ('start', 'element')])
+ self.assertFalse(list(events))
+ parser.feed('</element><child>')
+ self.assert_event_tags(
+ events, [('end', 'element-huhu'), ('start', 'child')])
+ parser.feed('</child>')
+ self.assert_event_tags(
+ events, [('end', 'child-huhu')])
+ parser.feed('</root>')
+ self.assert_event_tags(
+ events, [('end', 'root-huhu')])
+ self.assertFalse(list(events))
+ root = parser.close()
+ self.assertEqual('root-huhu', root.tag)
+
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([unittest.makeSuite(ETreeOnlyTestCase)])
+ suite.addTests([unittest.makeSuite(ETreeXIncludeTestCase)])
+ suite.addTests([unittest.makeSuite(ElementIncludeTestCase)])
+ suite.addTests([unittest.makeSuite(ETreeC14NTestCase)])
+ suite.addTests([unittest.makeSuite(ETreeWriteTestCase)])
+ suite.addTests([unittest.makeSuite(ETreeErrorLogTest)])
+ suite.addTests([unittest.makeSuite(XMLPullParserTest)])
+
+ # add original doctests from ElementTree selftest modules
+ from . import selftest, selftest2
+ suite.addTests(doctest.DocTestSuite(selftest))
+ suite.addTests(doctest.DocTestSuite(selftest2))
+
+ # add doctests
+ suite.addTests(doctest.DocTestSuite(etree))
+ suite.addTests(
+ [make_doctest('../../../doc/tutorial.txt')])
+ suite.addTests(
+ [make_doctest('../../../doc/api.txt')])
+ suite.addTests(
+ [make_doctest('../../../doc/FAQ.txt')])
+ suite.addTests(
+ [make_doctest('../../../doc/parsing.txt')])
+ suite.addTests(
+ [make_doctest('../../../doc/resolvers.txt')])
+ return suite
+
+
+if __name__ == '__main__':
+ print('to test use test.py %s' % __file__)
diff --git a/src/lxml/tests/test_external_document.py b/src/lxml/tests/test_external_document.py
new file mode 100644
index 0000000..0d1d063
--- /dev/null
+++ b/src/lxml/tests/test_external_document.py
@@ -0,0 +1,106 @@
+# -*- coding: utf-8 -*-
+"""
+Test cases related to direct loading of external libxml2 documents
+"""
+
+from __future__ import absolute_import
+
+import sys
+import unittest
+
+from .common_imports import HelperTestCase, etree
+
+DOC_NAME = b'libxml2:xmlDoc'
+DESTRUCTOR_NAME = b'destructor:xmlFreeDoc'
+
+
+class ExternalDocumentTestCase(HelperTestCase):
+ def setUp(self):
+ try:
+ import ctypes
+ from ctypes import pythonapi
+ from ctypes.util import find_library
+ except ImportError:
+ raise unittest.SkipTest("ctypes support missing")
+
+ def wrap(func, restype, *argtypes):
+ func.restype = restype
+ func.argtypes = list(argtypes)
+ return func
+
+ self.get_capsule_name = wrap(pythonapi.PyCapsule_GetName,
+ ctypes.c_char_p, ctypes.py_object)
+ self.capsule_is_valid = wrap(pythonapi.PyCapsule_IsValid, ctypes.c_int,
+ ctypes.py_object, ctypes.c_char_p)
+ self.new_capsule = wrap(pythonapi.PyCapsule_New, ctypes.py_object,
+ ctypes.c_void_p, ctypes.c_char_p,
+ ctypes.c_void_p)
+ self.set_capsule_name = wrap(pythonapi.PyCapsule_SetName, ctypes.c_int,
+ ctypes.py_object, ctypes.c_char_p)
+ self.set_capsule_context = wrap(pythonapi.PyCapsule_SetContext,
+ ctypes.c_int, ctypes.py_object,
+ ctypes.c_char_p)
+ self.get_capsule_context = wrap(pythonapi.PyCapsule_GetContext,
+ ctypes.c_char_p, ctypes.py_object)
+ self.get_capsule_pointer = wrap(pythonapi.PyCapsule_GetPointer,
+ ctypes.c_void_p, ctypes.py_object,
+ ctypes.c_char_p)
+ self.set_capsule_pointer = wrap(pythonapi.PyCapsule_SetPointer,
+ ctypes.c_int, ctypes.py_object,
+ ctypes.c_void_p)
+ self.set_capsule_destructor = wrap(pythonapi.PyCapsule_SetDestructor,
+ ctypes.c_int, ctypes.py_object,
+ ctypes.c_void_p)
+ self.PyCapsule_Destructor = ctypes.CFUNCTYPE(None, ctypes.py_object)
+ libxml2 = ctypes.CDLL(find_library('xml2'))
+ self.create_doc = wrap(libxml2.xmlReadMemory, ctypes.c_void_p,
+ ctypes.c_char_p, ctypes.c_int, ctypes.c_char_p,
+ ctypes.c_char_p, ctypes.c_int)
+ self.free_doc = wrap(libxml2.xmlFreeDoc, None, ctypes.c_void_p)
+
+ def as_capsule(self, text, capsule_name=DOC_NAME):
+ if not isinstance(text, bytes):
+ text = text.encode('utf-8')
+ doc = self.create_doc(text, len(text), b'base.xml', b'utf-8', 0)
+ ans = self.new_capsule(doc, capsule_name, None)
+ self.set_capsule_context(ans, DESTRUCTOR_NAME)
+ return ans
+
+ def test_external_document_adoption(self):
+ xml = '<r a="1">t</r>'
+ self.assertRaises(TypeError, etree.adopt_external_document, None)
+ capsule = self.as_capsule(xml)
+ self.assertTrue(self.capsule_is_valid(capsule, DOC_NAME))
+ self.assertEqual(DOC_NAME, self.get_capsule_name(capsule))
+ # Create an lxml tree from the capsule (this is a move not a copy)
+ root = etree.adopt_external_document(capsule).getroot()
+ self.assertIsNone(self.get_capsule_name(capsule))
+ self.assertEqual(root.text, 't')
+ root.text = 'new text'
+ # Now reset the capsule so we can copy it
+ self.assertEqual(0, self.set_capsule_name(capsule, DOC_NAME))
+ self.assertEqual(0, self.set_capsule_context(capsule, b'invalid'))
+ # Create an lxml tree from the capsule (this is a copy not a move)
+ root2 = etree.adopt_external_document(capsule).getroot()
+ self.assertEqual(self.get_capsule_context(capsule), b'invalid')
+ # Check that the modification to the tree using the transferred
+ # document was successful
+ self.assertEqual(root.text, root2.text)
+ # Check that further modifications do not show up in the copy (they are
+ # disjoint)
+ root.text = 'other text'
+ self.assertNotEqual(root.text, root2.text)
+ # delete root and ensure root2 survives
+ del root
+ self.assertEqual(root2.text, 'new text')
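+ # Note (descriptive comment summarising the protocol exercised above): a
+ # capsule named b'libxml2:xmlDoc' wraps an xmlDoc pointer, and
+ # adopt_external_document() takes ownership (a move, invalidating the
+ # capsule name) only when the capsule context is b'destructor:xmlFreeDoc';
+ # with any other context the document is copied and the capsule stays
+ # usable.  A minimal sketch, using the helpers defined in this class:
+ #
+ #     capsule = self.as_capsule('<r/>')               # context set to DESTRUCTOR_NAME
+ #     tree = etree.adopt_external_document(capsule)   # move: lxml now owns the doc
+ #     tree.getroot().tag                              # 'r'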
+
+
+def test_suite():
+ suite = unittest.TestSuite()
+ if sys.platform != 'win32':
+ suite.addTests([unittest.makeSuite(ExternalDocumentTestCase)])
+ return suite
+
+
+if __name__ == '__main__':
+ print('to test use test.py %s' % __file__)
diff --git a/src/lxml/tests/test_htmlparser.py b/src/lxml/tests/test_htmlparser.py
new file mode 100644
index 0000000..9847d39
--- /dev/null
+++ b/src/lxml/tests/test_htmlparser.py
@@ -0,0 +1,663 @@
+# -*- coding: utf-8 -*-
+
+"""
+HTML parser test cases for etree
+"""
+
+from __future__ import absolute_import
+
+import unittest
+import tempfile, os, os.path, sys
+
+from .common_imports import etree, html, BytesIO, fileInTestDir, _bytes, _str
+from .common_imports import SillyFileLike, HelperTestCase, write_to_file
+
+try:
+ unicode
+except NameError:
+ unicode = str
+
+
+class HtmlParserTestCase(HelperTestCase):
+ """HTML parser test cases
+ """
+ etree = etree
+
+ html_str = _bytes("<html><head><title>test</title></head><body><h1>page title</h1></body></html>")
+ html_str_pretty = _bytes("""\
+<html>
+<head><title>test</title></head>
+<body><h1>page title</h1></body>
+</html>
+""")
+ broken_html_str = _bytes("<html><head><title>test"
+ "<body><h1>page title</h3></p></html>")
+ uhtml_str = _bytes(
+ "<html><head><title>test á</title></head>"
+ "<body><h1>page á title</h1></body></html>").decode('utf8')
+
+ def tearDown(self):
+ super(HtmlParserTestCase, self).tearDown()
+ self.etree.set_default_parser()
+
+ def test_module_HTML(self):
+ element = self.etree.HTML(self.html_str)
+ self.assertEqual(self.etree.tostring(element, method="html"),
+ self.html_str)
+
+ def test_module_HTML_unicode(self):
+ element = self.etree.HTML(self.uhtml_str)
+ self.assertEqual(
+ self.etree.tostring(element, method="html", encoding='unicode'),
+ self.uhtml_str)
+ self.assertEqual(element.findtext('.//h1'),
+ _bytes("page á title").decode('utf8'))
+
+ def test_wide_unicode_xml(self):
+ if sys.maxunicode < 1114111:
+ return # skip test
+ element = self.etree.HTML(_bytes(
+ '<html><body><p>\\U00026007</p></body></html>'
+ ).decode('unicode_escape'))
+ p_text = element.findtext('.//p')
+ self.assertEqual(1, len(p_text))
+ self.assertEqual(_bytes('\\U00026007').decode('unicode_escape'),
+ p_text)
+
+ def test_html_ids(self):
+ parser = self.etree.HTMLParser(recover=False)
+ fromstring = self.etree.fromstring
+ html = fromstring('''
+ <html><body id="bodyID"><p id="pID"></p></body></html>
+ ''', parser=parser)
+ self.assertEqual(len(html.xpath('//p[@id="pID"]')), 1)
+ self.assertEqual(len(html.findall('.//p[@id="pID"]')), 1)
+
+ def test_html_ids_no_collect_ids(self):
+ parser = self.etree.HTMLParser(recover=False, collect_ids=False)
+ fromstring = self.etree.fromstring
+ html = fromstring('''
+ <html><body id="bodyID"><p id="pID"></p></body></html>
+ ''', parser=parser)
+ self.assertEqual(len(html.xpath('//p[@id="pID"]')), 1)
+ self.assertEqual(len(html.findall('.//p[@id="pID"]')), 1)
+
+ def test_module_HTML_pretty_print(self):
+ element = self.etree.HTML(self.html_str)
+ self.assertEqual(self.etree.tostring(element, method="html", pretty_print=True),
+ self.html_str_pretty)
+
+ def test_module_parse_html_error(self):
+ parser = self.etree.HTMLParser(recover=False)
+ parse = self.etree.parse
+ f = BytesIO("<html></body>")
+ self.assertRaises(self.etree.XMLSyntaxError,
+ parse, f, parser)
+
+ def test_html_element_name_empty(self):
+ parser = self.etree.HTMLParser()
+ Element = parser.makeelement
+
+ el = Element('name')
+ self.assertRaises(ValueError, Element, '{}')
+ self.assertRaises(ValueError, setattr, el, 'tag', '{}')
+
+ self.assertRaises(ValueError, Element, '{test}')
+ self.assertRaises(ValueError, setattr, el, 'tag', '{test}')
+
+ def test_html_element_name_colon(self):
+ parser = self.etree.HTMLParser()
+ Element = parser.makeelement
+
+ pname = Element('p:name')
+ self.assertEqual(pname.tag, 'p:name')
+
+ pname = Element('{test}p:name')
+ self.assertEqual(pname.tag, '{test}p:name')
+
+ pname = Element('name')
+ pname.tag = 'p:name'
+ self.assertEqual(pname.tag, 'p:name')
+
+ def test_html_element_name_quote(self):
+ parser = self.etree.HTMLParser()
+ Element = parser.makeelement
+
+ self.assertRaises(ValueError, Element, 'p"name')
+ self.assertRaises(ValueError, Element, "na'me")
+ self.assertRaises(ValueError, Element, '{test}"name')
+ self.assertRaises(ValueError, Element, "{test}name'")
+
+ el = Element('name')
+ self.assertRaises(ValueError, setattr, el, 'tag', "pname'")
+ self.assertRaises(ValueError, setattr, el, 'tag', '"pname')
+ self.assertEqual(el.tag, "name")
+
+ def test_html_element_name_space(self):
+ parser = self.etree.HTMLParser()
+ Element = parser.makeelement
+
+ self.assertRaises(ValueError, Element, ' name ')
+ self.assertRaises(ValueError, Element, 'na me')
+ self.assertRaises(ValueError, Element, '{test} name')
+
+ el = Element('name')
+ self.assertRaises(ValueError, setattr, el, 'tag', ' name ')
+ self.assertEqual(el.tag, "name")
+
+ def test_html_subelement_name_empty(self):
+ parser = self.etree.HTMLParser()
+ Element = parser.makeelement
+
+ SubElement = self.etree.SubElement
+
+ el = Element('name')
+ self.assertRaises(ValueError, SubElement, el, '{}')
+ self.assertRaises(ValueError, SubElement, el, '{test}')
+
+ def test_html_subelement_name_colon(self):
+ parser = self.etree.HTMLParser()
+ Element = parser.makeelement
+ SubElement = self.etree.SubElement
+
+ el = Element('name')
+ pname = SubElement(el, 'p:name')
+ self.assertEqual(pname.tag, 'p:name')
+
+ pname = SubElement(el, '{test}p:name')
+ self.assertEqual(pname.tag, '{test}p:name')
+
+ def test_html_subelement_name_quote(self):
+ parser = self.etree.HTMLParser()
+ Element = parser.makeelement
+ SubElement = self.etree.SubElement
+
+ el = Element('name')
+ self.assertRaises(ValueError, SubElement, el, "name'")
+ self.assertRaises(ValueError, SubElement, el, 'na"me')
+ self.assertRaises(ValueError, SubElement, el, "{test}na'me")
+ self.assertRaises(ValueError, SubElement, el, '{test}"name')
+
+ def test_html_subelement_name_space(self):
+ parser = self.etree.HTMLParser()
+ Element = parser.makeelement
+ SubElement = self.etree.SubElement
+
+ el = Element('name')
+ self.assertRaises(ValueError, SubElement, el, ' name ')
+ self.assertRaises(ValueError, SubElement, el, 'na me')
+ self.assertRaises(ValueError, SubElement, el, '{test} name')
+
+ def test_module_parse_html_norecover(self):
+ parser = self.etree.HTMLParser(recover=False)
+ parse = self.etree.parse
+ f = BytesIO(self.broken_html_str)
+ self.assertRaises(self.etree.XMLSyntaxError,
+ parse, f, parser)
+
+ def test_module_parse_html_default_doctype(self):
+ parser = self.etree.HTMLParser(default_doctype=False)
+ d = html.fromstring('<!DOCTYPE html><h1>S</h1></html>', parser=parser)
+ self.assertEqual(d.getroottree().docinfo.doctype, '<!DOCTYPE html>')
+
+ d = html.fromstring('<html><h1>S</h1></html>', parser=parser)
+ self.assertEqual(d.getroottree().docinfo.doctype, '')
+
+ def test_parse_encoding_8bit_explicit(self):
+ text = _str('Søk på nettet')
+ html_latin1 = (_str('<p>%s</p>') % text).encode('iso-8859-1')
+
+ tree = self.etree.parse(
+ BytesIO(html_latin1),
+ self.etree.HTMLParser(encoding="iso-8859-1"))
+ p = tree.find("//p")
+ self.assertEqual(p.text, text)
+
+ def test_parse_encoding_8bit_override(self):
+ text = _str('Søk på nettet')
+ wrong_head = _str('''
+ <head>
+ <meta http-equiv="Content-Type"
+ content="text/html; charset=UTF-8" />
+ </head>''')
+ html_latin1 = (_str('<html>%s<body><p>%s</p></body></html>') % (wrong_head,
+ text)
+ ).encode('iso-8859-1')
+
+ self.assertRaises(self.etree.ParseError,
+ self.etree.parse,
+ BytesIO(html_latin1))
+
+ tree = self.etree.parse(
+ BytesIO(html_latin1),
+ self.etree.HTMLParser(encoding="iso-8859-1"))
+ p = tree.find("//p")
+ self.assertEqual(p.text, text)
+
+ def test_module_HTML_broken(self):
+ element = self.etree.HTML(self.broken_html_str)
+ self.assertEqual(self.etree.tostring(element, method="html"),
+ self.html_str)
+
+ def test_module_HTML_cdata(self):
+ # by default, libxml2 generates CDATA nodes for <script> and <style> content
+ html = _bytes('<html><head><style>foo</style></head></html>')
+ element = self.etree.HTML(html)
+ self.assertEqual(element[0][0].text, "foo")
+
+ def test_module_HTML_access(self):
+ element = self.etree.HTML(self.html_str)
+ self.assertEqual(element[0][0].tag, 'title')
+
+ def test_module_parse_html(self):
+ parser = self.etree.HTMLParser()
+ filename = tempfile.mktemp(suffix=".html")
+ write_to_file(filename, self.html_str, 'wb')
+ try:
+ with open(filename, 'rb') as f:
+ tree = self.etree.parse(f, parser)
+ self.assertEqual(self.etree.tostring(tree.getroot(), method="html"),
+ self.html_str)
+ finally:
+ os.remove(filename)
+
+ def test_module_parse_html_filelike(self):
+ parser = self.etree.HTMLParser()
+ f = SillyFileLike(self.html_str)
+ tree = self.etree.parse(f, parser)
+ html = self.etree.tostring(tree.getroot(),
+ method="html", encoding='UTF-8')
+ self.assertEqual(html, self.html_str)
+
+## def test_module_parse_html_filelike_unicode(self):
+## parser = self.etree.HTMLParser()
+## f = SillyFileLike(self.uhtml_str)
+## tree = self.etree.parse(f, parser)
+## html = self.etree.tostring(tree.getroot(), encoding='UTF-8')
+## self.assertEqual(unicode(html, 'UTF-8'), self.uhtml_str)
+
+ def test_html_file_error(self):
+ parser = self.etree.HTMLParser()
+ parse = self.etree.parse
+ self.assertRaises(IOError,
+ parse, "__some_hopefully_nonexisting_file__.html",
+ parser)
+
+ def test_default_parser_HTML_broken(self):
+ self.assertRaises(self.etree.XMLSyntaxError,
+ self.etree.parse, BytesIO(self.broken_html_str))
+
+ self.etree.set_default_parser( self.etree.HTMLParser() )
+
+ tree = self.etree.parse(BytesIO(self.broken_html_str))
+ self.assertEqual(self.etree.tostring(tree.getroot(), method="html"),
+ self.html_str)
+
+ self.etree.set_default_parser()
+
+ self.assertRaises(self.etree.XMLSyntaxError,
+ self.etree.parse, BytesIO(self.broken_html_str))
+
+ def test_html_iterparse(self):
+ iterparse = self.etree.iterparse
+ f = BytesIO(
+ '<html><head><title>TITLE</title><body><p>P</p></body></html>')
+
+ iterator = iterparse(f, html=True)
+ self.assertEqual(None, iterator.root)
+
+ events = list(iterator)
+ root = iterator.root
+ self.assertTrue(root is not None)
+ self.assertEqual(
+ [('end', root[0][0]), ('end', root[0]), ('end', root[1][0]),
+ ('end', root[1]), ('end', root)],
+ events)
+
+ def test_html_iterparse_tag(self):
+ iterparse = self.etree.iterparse
+ f = BytesIO(
+ '<html><head><title>TITLE</title><body><p>P</p></body></html>')
+
+ iterator = iterparse(f, html=True, tag=["p", "title"])
+ self.assertEqual(None, iterator.root)
+
+ events = list(iterator)
+ root = iterator.root
+ self.assertTrue(root is not None)
+ self.assertEqual(
+ [('end', root[0][0]), ('end', root[1][0])],
+ events)
+
+ def test_html_iterparse_stop_short(self):
+ iterparse = self.etree.iterparse
+ f = BytesIO(
+ '<html><head><title>TITLE</title><body><p>P</p></body></html>')
+
+ iterator = iterparse(f, html=True)
+ self.assertEqual(None, iterator.root)
+
+ event, element = next(iterator)
+ self.assertEqual('end', event)
+ self.assertEqual('title', element.tag)
+ self.assertEqual(None, iterator.root)
+ del element
+
+ event, element = next(iterator)
+ self.assertEqual('end', event)
+ self.assertEqual('head', element.tag)
+ self.assertEqual(None, iterator.root)
+ del element
+ del iterator
+
+ def test_html_iterparse_broken(self):
+ iterparse = self.etree.iterparse
+ f = BytesIO('<head><title>TEST></head><p>P<br></div>')
+
+ iterator = iterparse(f, html=True)
+ self.assertEqual(None, iterator.root)
+
+ events = list(iterator)
+ root = iterator.root
+ self.assertTrue(root is not None)
+ self.assertEqual('html', root.tag)
+ self.assertEqual('head', root[0].tag)
+ self.assertEqual('body', root[1].tag)
+ self.assertEqual('p', root[1][0].tag)
+ self.assertEqual('br', root[1][0][0].tag)
+ self.assertEqual(
+ [('end', root[0][0]), ('end', root[0]), ('end', root[1][0][0]),
+ ('end', root[1][0]), ('end', root[1]), ('end', root)],
+ events)
+
+ def test_html_iterparse_broken_no_recover(self):
+ iterparse = self.etree.iterparse
+ f = BytesIO('<p>P<br></div>')
+ iterator = iterparse(f, html=True, recover=False)
+ self.assertRaises(self.etree.XMLSyntaxError, list, iterator)
+
+ def test_html_iterparse_file(self):
+ iterparse = self.etree.iterparse
+ iterator = iterparse(fileInTestDir("shakespeare.html"),
+ html=True)
+
+ self.assertEqual(None, iterator.root)
+ events = list(iterator)
+ root = iterator.root
+ self.assertTrue(root is not None)
+ self.assertEqual(249, len(events))
+ self.assertFalse(
+ [event for (event, element) in events if event != 'end'])
+
+ def test_html_iterparse_start(self):
+ iterparse = self.etree.iterparse
+ f = BytesIO(
+ '<html><head><title>TITLE</title><body><p>P</p></body></html>')
+
+ iterator = iterparse(f, html=True, events=('start',))
+ self.assertEqual(None, iterator.root)
+
+ events = list(iterator)
+ root = iterator.root
+ self.assertNotEqual(None, root)
+ self.assertEqual(
+ [('start', root), ('start', root[0]), ('start', root[0][0]),
+ ('start', root[1]), ('start', root[1][0])],
+ events)
+
+ def test_html_feed_parser(self):
+ parser = self.etree.HTMLParser()
+ parser.feed("<html><body></")
+ parser.feed("body></html>")
+ root = parser.close()
+
+ self.assertEqual('html', root.tag)
+ # test that we find all names in the parser dict
+ self.assertEqual([root], list(root.iter('html')))
+ self.assertEqual([root[0]], list(root.iter('body')))
+
+ def test_html_feed_parser_chunky(self):
+ parser = self.etree.HTMLParser()
+ parser.feed("<htm")
+ parser.feed("l><body")
+ parser.feed("><")
+ parser.feed("p><")
+ parser.feed("strong")
+ parser.feed(">some ")
+ parser.feed("text</strong></p><")
+ parser.feed("/body></html>")
+ root = parser.close()
+
+ self.assertEqual('html', root.tag)
+ # test that we find all names in the parser dict
+ self.assertEqual([root], list(root.iter('html')))
+ self.assertEqual([root[0]], list(root.iter('body')))
+ self.assertEqual([root[0][0]], list(root.iter('p')))
+ self.assertEqual([root[0][0][0]], list(root.iter('strong')))
+
+ def test_html_feed_parser_more_tags(self):
+ parser = self.etree.HTMLParser()
+ parser.feed('<html><head>')
+ parser.feed('<title>TITLE</title><body><p>P</p></body><')
+ parser.feed("/html>")
+ root = parser.close()
+
+ self.assertEqual('html', root.tag)
+ # test that we find all names in the parser dict
+ self.assertEqual([root], list(root.iter('html')))
+ self.assertEqual([root[0]], list(root.iter('head')))
+ self.assertEqual([root[0][0]], list(root.iter('title')))
+ self.assertEqual([root[1]], list(root.iter('body')))
+ self.assertEqual([root[1][0]], list(root.iter('p')))
+
+ def test_html_parser_target_tag(self):
+ assertFalse = self.assertFalse
+ events = []
+ class Target(object):
+ def start(self, tag, attrib):
+ events.append(("start", tag))
+ assertFalse(attrib)
+ def end(self, tag):
+ events.append(("end", tag))
+ def close(self):
+ return "DONE"
+
+ parser = self.etree.HTMLParser(target=Target())
+
+ parser.feed("<html><body></body></html>")
+ done = parser.close()
+
+ self.assertEqual("DONE", done)
+ self.assertEqual([
+ ("start", "html"), ("start", "body"),
+ ("end", "body"), ("end", "html")], events)
+
+ def test_html_parser_target_doctype_empty(self):
+ assertFalse = self.assertFalse
+ events = []
+ class Target(object):
+ def start(self, tag, attrib):
+ events.append(("start", tag))
+ assertFalse(attrib)
+ def end(self, tag):
+ events.append(("end", tag))
+ def doctype(self, *args):
+ events.append(("doctype", args))
+ def close(self):
+ return "DONE"
+
+ parser = self.etree.HTMLParser(target=Target())
+ parser.feed("<!DOCTYPE><html><body></body></html>")
+ done = parser.close()
+
+ self.assertEqual("DONE", done)
+ self.assertEqual([
+ ("doctype", (None, None, None)),
+ ("start", "html"), ("start", "body"),
+ ("end", "body"), ("end", "html")], events)
+
+ def test_html_parser_target_doctype_html(self):
+ assertFalse = self.assertFalse
+ events = []
+ class Target(object):
+ def start(self, tag, attrib):
+ events.append(("start", tag))
+ assertFalse(attrib)
+ def end(self, tag):
+ events.append(("end", tag))
+ def doctype(self, *args):
+ events.append(("doctype", args))
+ def close(self):
+ return "DONE"
+
+ parser = self.etree.HTMLParser(target=Target())
+ parser.feed("<!DOCTYPE html><html><body></body></html>")
+ done = parser.close()
+
+ self.assertEqual("DONE", done)
+ self.assertEqual([
+ ("doctype", ("html", None, None)),
+ ("start", "html"), ("start", "body"),
+ ("end", "body"), ("end", "html")], events)
+
+ def test_html_parser_target_doctype_html_full(self):
+ assertFalse = self.assertFalse
+ events = []
+ class Target(object):
+ def start(self, tag, attrib):
+ events.append(("start", tag))
+ assertFalse(attrib)
+ def end(self, tag):
+ events.append(("end", tag))
+ def doctype(self, *args):
+ events.append(("doctype", args))
+ def close(self):
+ return "DONE"
+
+ parser = self.etree.HTMLParser(target=Target())
+ parser.feed('<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "sys.dtd">'
+ '<html><body></body></html>')
+ done = parser.close()
+
+ self.assertEqual("DONE", done)
+ self.assertEqual([
+ ("doctype", ("html", "-//W3C//DTD HTML 4.01//EN", "sys.dtd")),
+ ("start", "html"), ("start", "body"),
+ ("end", "body"), ("end", "html")], events)
+
+ def test_html_parser_target_exceptions(self):
+ events = []
+ class Target(object):
+ def start(self, tag, attrib):
+ events.append(("start", tag))
+ raise ValueError("START")
+ def end(self, tag):
+ events.append(("end", tag))
+ raise TypeError("END")
+ def close(self):
+ return "DONE"
+
+ parser = self.etree.HTMLParser(target=Target())
+ try:
+ parser.feed('<html><body>')
+ parser.feed('</body></html>')
+ except ValueError as exc:
+ assert "START" in str(exc)
+ except TypeError as exc:
+ assert "END" in str(exc)
+ self.assertTrue(False, "wrong exception raised")
+ else:
+ self.assertTrue(False, "no exception raised")
+
+ self.assertTrue(("start", "html") in events, events)
+ self.assertTrue(("end", "html") not in events, events)
+
+ def test_html_fromstring_target_exceptions(self):
+ events = []
+ class Target(object):
+ def start(self, tag, attrib):
+ events.append(("start", tag))
+ raise ValueError("START")
+ def end(self, tag):
+ events.append(("end", tag))
+ raise TypeError("END")
+ def close(self):
+ return "DONE"
+
+ parser = self.etree.HTMLParser(target=Target())
+ try:
+ self.etree.fromstring('<html><body></body></html>', parser)
+ except ValueError as exc:
+ assert "START" in str(exc), str(exc)
+ except TypeError as exc:
+ assert "END" in str(exc), str(exc)
+ self.assertTrue(False, "wrong exception raised")
+ else:
+ self.assertTrue(False, "no exception raised")
+
+ self.assertTrue(("start", "html") in events, events)
+ self.assertTrue(("end", "html") not in events, events)
+
+ def test_set_decl_html(self):
+ doc = html.Element('html').getroottree()
+ doc.docinfo.public_id = "-//W3C//DTD XHTML 1.0 Strict//EN"
+ doc.docinfo.system_url = \
+ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"
+ self.assertEqual(doc.docinfo.doctype,
+ '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">')
+ self.assertEqual(self.etree.tostring(doc),
+ _bytes('''<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml"></html>'''))
+
+ def test_html5_doctype(self):
+ # document type declaration with neither public id nor system url
+ doc = html.Element('html').getroottree()
+ doc.docinfo.public_id = None
+ doc.docinfo.system_url = None
+ self.assertEqual(doc.docinfo.doctype,
+ '<!DOCTYPE html>')
+ self.assertTrue(doc.docinfo.public_id is None)
+ self.assertEqual(self.etree.tostring(doc),
+ _bytes('<!DOCTYPE html>\n<html/>'))
+
+ def test_ietf_decl(self):
+ # legacy declaration with public id, no system url
+ doc = html.Element('html').getroottree()
+ doc.docinfo.public_id = '-//IETF//DTD HTML//EN'
+ doc.docinfo.system_url = None
+ self.assertEqual(doc.docinfo.doctype,
+ '<!DOCTYPE html PUBLIC "-//IETF//DTD HTML//EN">')
+ self.assertEqual(self.etree.tostring(doc),
+ _bytes('<!DOCTYPE html PUBLIC "-//IETF//DTD HTML//EN">\n<html/>'))
+
+ def test_boolean_attribute(self):
+ # ability to serialize a boolean attribute by setting its value to None
+ form = html.Element('form')
+ form.set('novalidate', None)
+ self.assertEqual(html.tostring(form),
+ _bytes('<form novalidate></form>'))
+ form.set('custom')
+ self.assertEqual(html.tostring(form),
+ _bytes('<form novalidate custom></form>'))
+
+ def test_boolean_attribute_round_trip(self):
+ # ability to pass boolean attributes unmodified
+ fragment = '<tag attribute></tag>'
+ self.assertEqual(html.tostring(html.fragment_fromstring(fragment)),
+ _bytes(fragment))
+
+ def test_boolean_attribute_xml_adds_empty_string(self):
+ # html serialized as xml converts boolean attributes to empty strings
+ fragment = '<tag attribute></tag>'
+ self.assertEqual(self.etree.tostring(html.fragment_fromstring(fragment)),
+ _bytes('<tag attribute=""/>'))
+
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([unittest.makeSuite(HtmlParserTestCase)])
+ return suite
+
+
+if __name__ == '__main__':
+ print('to test use test.py %s' % __file__)
diff --git a/src/lxml/tests/test_http_io.py b/src/lxml/tests/test_http_io.py
new file mode 100644
index 0000000..07f2742
--- /dev/null
+++ b/src/lxml/tests/test_http_io.py
@@ -0,0 +1,125 @@
+# -*- coding: utf-8 -*-
+
+"""
+Web IO test cases (wsgiref)
+"""
+
+from __future__ import absolute_import
+
+import unittest
+import textwrap
+import sys
+import gzip
+
+from .common_imports import etree, HelperTestCase, BytesIO, _bytes
+from .dummy_http_server import webserver, HTTPRequestCollector
+
+
+class HttpIOTestCase(HelperTestCase):
+ etree = etree
+
+ def _parse_from_http(self, data, code=200, headers=None, parser=None):
+ handler = HTTPRequestCollector(data, code, headers)
+ with webserver(handler) as host_url:
+ tree = self.etree.parse(host_url + 'TEST', parser=parser)
+ self.assertEqual([('/TEST', [])], handler.requests)
+ return tree
+
+ def test_http_client(self):
+ tree = self._parse_from_http(_bytes('<root><a/></root>'))
+ self.assertEqual('root', tree.getroot().tag)
+ self.assertEqual('a', tree.getroot()[0].tag)
+
+ def test_http_client_404(self):
+ try:
+ self._parse_from_http(_bytes('<root/>'), code=404)
+ except IOError:
+ self.assertTrue(True)
+ else:
+ self.assertTrue(False, "expected IOError")
+
+ def test_http_client_gzip(self):
+ f = BytesIO()
+ gz = gzip.GzipFile(fileobj=f, mode='w', filename='test.xml')
+ gz.write(_bytes('<root><a/></root>'))
+ gz.close()
+ data = f.getvalue()
+ del f, gz
+
+ headers = [('Content-Encoding', 'gzip')]
+ tree = self._parse_from_http(data, headers=headers)
+ self.assertEqual('root', tree.getroot().tag)
+ self.assertEqual('a', tree.getroot()[0].tag)
+
+ def test_parser_input_mix(self):
+ data = _bytes('<root><a/></root>')
+ handler = HTTPRequestCollector(data)
+
+ with webserver(handler) as host_url:
+ tree = self.etree.parse(host_url)
+ root = tree.getroot()
+ self.assertEqual('a', root[0].tag)
+
+ root = self.etree.fromstring(data)
+ self.assertEqual('a', root[0].tag)
+
+ tree = self.etree.parse(host_url)
+ root = tree.getroot()
+ self.assertEqual('a', root[0].tag)
+
+ root = self.etree.fromstring(data)
+ self.assertEqual('a', root[0].tag)
+
+ root = self.etree.fromstring(data)
+ self.assertEqual('a', root[0].tag)
+
+ def test_network_dtd(self):
+ data = [_bytes(textwrap.dedent(s)) for s in [
+ # XML file
+ '''\
+ <?xml version="1.0"?>
+ <!DOCTYPE root SYSTEM "./file.dtd">
+ <root>&myentity;</root>
+ ''',
+ # DTD
+ '<!ENTITY myentity "DEFINED">',
+ ]]
+
+ responses = []
+ def handler(environ, start_response):
+ start_response('200 OK', [])
+ return [responses.pop()]
+
+ with webserver(handler) as host_url:
+ # DTD network loading enabled
+ responses = data[::-1]
+ tree = self.etree.parse(
+ host_url + 'dir/test.xml',
+ parser=self.etree.XMLParser(
+ load_dtd=True, no_network=False))
+ self.assertFalse(responses) # all read
+ root = tree.getroot()
+ self.assertEqual('DEFINED', root.text)
+
+ # DTD network loading disabled
+ responses = data[::-1]
+ try:
+ self.etree.parse(
+ host_url + 'dir/test.xml',
+ parser=self.etree.XMLParser(
+ load_dtd=True, no_network=True))
+ except self.etree.XMLSyntaxError:
+ self.assertTrue("myentity" in str(sys.exc_info()[1]))
+ else:
+ self.assertTrue(False)
+ self.assertEqual(1, len(responses)) # DTD not read
+
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([unittest.makeSuite(HttpIOTestCase)])
+ return suite
+
+
+if __name__ == '__main__':
+ print('to test use test.py %s' % __file__)
diff --git a/src/lxml/tests/test_import.xsd b/src/lxml/tests/test_import.xsd
new file mode 100644
index 0000000..3accd05
--- /dev/null
+++ b/src/lxml/tests/test_import.xsd
@@ -0,0 +1,10 @@
+<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema"
+ targetNamespace="http://codespeak.net/lxml/schema/ns1"
+ xmlns:a="http://codespeak.net/lxml/schema/ns"
+ >
+ <xsd:import
+ namespace="http://codespeak.net/lxml/schema/ns"
+ schemaLocation="test_inc.xsd" />
+
+ <xsd:element name="x" type="a:AType"/>
+</xsd:schema>
diff --git a/src/lxml/tests/test_inc.xsd b/src/lxml/tests/test_inc.xsd
new file mode 100644
index 0000000..0c2cf3d
--- /dev/null
+++ b/src/lxml/tests/test_inc.xsd
@@ -0,0 +1,10 @@
+<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema"
+ xmlns="http://codespeak.net/lxml/schema/ns"
+ targetNamespace="http://codespeak.net/lxml/schema/ns">
+ <xsd:element name="a" type="AType"/>
+ <xsd:complexType name="AType">
+ <xsd:sequence>
+ <xsd:element name="b" type="xsd:string" />
+ </xsd:sequence>
+ </xsd:complexType>
+</xsd:schema>
diff --git a/src/lxml/tests/test_incremental_xmlfile.py b/src/lxml/tests/test_incremental_xmlfile.py
new file mode 100644
index 0000000..ddf8165
--- /dev/null
+++ b/src/lxml/tests/test_incremental_xmlfile.py
@@ -0,0 +1,674 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests for the incremental XML serialisation API.
+"""
+
+from __future__ import absolute_import
+
+import io
+import os
+import sys
+import unittest
+import textwrap
+import tempfile
+
+from lxml.etree import LxmlSyntaxError
+
+from .common_imports import etree, BytesIO, HelperTestCase, skipIf, _str
+
+
+class _XmlFileTestCaseBase(HelperTestCase):
+ _file = None # to be set by specific subtypes below
+
+ def test_element(self):
+ with etree.xmlfile(self._file) as xf:
+ with xf.element('test'):
+ pass
+ self.assertXml('<test></test>')
+
+ def test_element_write_text(self):
+ with etree.xmlfile(self._file) as xf:
+ with xf.element('test'):
+ xf.write('toast')
+ self.assertXml('<test>toast</test>')
+
+ def test_element_write_empty(self):
+ with etree.xmlfile(self._file) as xf:
+ with xf.element('test'):
+ xf.write(None)
+ xf.write('')
+ xf.write('')
+ xf.write(None)
+ self.assertXml('<test></test>')
+
+ def test_element_nested(self):
+ with etree.xmlfile(self._file) as xf:
+ with xf.element('test'):
+ with xf.element('toast'):
+ with xf.element('taste'):
+ xf.write('conTent')
+ self.assertXml('<test><toast><taste>conTent</taste></toast></test>')
+
+ def test_element_nested_with_text(self):
+ with etree.xmlfile(self._file) as xf:
+ with xf.element('test'):
+ xf.write('con')
+ with xf.element('toast'):
+ xf.write('tent')
+ with xf.element('taste'):
+ xf.write('inside')
+ xf.write('tnet')
+ xf.write('noc')
+ self.assertXml('<test>con<toast>tent<taste>inside</taste>'
+ 'tnet</toast>noc</test>')
+
+ def test_write_Element(self):
+ with etree.xmlfile(self._file) as xf:
+ xf.write(etree.Element('test'))
+ self.assertXml('<test/>')
+
+ def test_write_Element_repeatedly(self):
+ element = etree.Element('test')
+ with etree.xmlfile(self._file) as xf:
+ with xf.element('test'):
+ for i in range(100):
+ xf.write(element)
+
+ tree = self._parse_file()
+ self.assertTrue(tree is not None)
+ self.assertEqual(100, len(tree.getroot()))
+ self.assertEqual({'test'}, {el.tag for el in tree.getroot()})
+
+ def test_namespace_nsmap(self):
+ with etree.xmlfile(self._file) as xf:
+ with xf.element('{nsURI}test', nsmap={'x': 'nsURI'}):
+ pass
+ self.assertXml('<x:test xmlns:x="nsURI"></x:test>')
+
+ def test_namespace_nested_nsmap(self):
+ with etree.xmlfile(self._file) as xf:
+ with xf.element('test', nsmap={'x': 'nsURI'}):
+ with xf.element('{nsURI}toast'):
+ pass
+ self.assertXml('<test xmlns:x="nsURI"><x:toast></x:toast></test>')
+
+ def test_anonymous_namespace(self):
+ with etree.xmlfile(self._file) as xf:
+ with xf.element('{nsURI}test'):
+ pass
+ self.assertXml('<ns0:test xmlns:ns0="nsURI"></ns0:test>')
+
+ def test_namespace_nested_anonymous(self):
+ with etree.xmlfile(self._file) as xf:
+ with xf.element('test'):
+ with xf.element('{nsURI}toast'):
+ pass
+ self.assertXml('<test><ns0:toast xmlns:ns0="nsURI"></ns0:toast></test>')
+
+ def test_default_namespace(self):
+ with etree.xmlfile(self._file) as xf:
+ with xf.element('{nsURI}test', nsmap={None: 'nsURI'}):
+ pass
+ self.assertXml('<test xmlns="nsURI"></test>')
+
+ def test_nested_default_namespace(self):
+ with etree.xmlfile(self._file) as xf:
+ with xf.element('{nsURI}test', nsmap={None: 'nsURI'}):
+ with xf.element('{nsURI}toast'):
+ pass
+ self.assertXml('<test xmlns="nsURI"><toast></toast></test>')
+
+ def test_nested_default_namespace_and_other(self):
+ with etree.xmlfile(self._file) as xf:
+ with xf.element('{nsURI}test', nsmap={None: 'nsURI', 'p': 'ns2'}):
+ with xf.element('{nsURI}toast'):
+ pass
+ with xf.element('{ns2}toast'):
+ pass
+ self.assertXml(
+ '<test xmlns="nsURI" xmlns:p="ns2"><toast></toast><p:toast></p:toast></test>')
+
+ def test_pi(self):
+ with etree.xmlfile(self._file) as xf:
+ xf.write(etree.ProcessingInstruction('pypi'))
+ with xf.element('test'):
+ pass
+ self.assertXml('<?pypi ?><test></test>')
+
+ def test_comment(self):
+ with etree.xmlfile(self._file) as xf:
+ xf.write(etree.Comment('a comment'))
+ with xf.element('test'):
+ pass
+ self.assertXml('<!--a comment--><test></test>')
+
+ def test_attribute(self):
+ with etree.xmlfile(self._file) as xf:
+ with xf.element('test', attrib={'k': 'v'}):
+ pass
+ self.assertXml('<test k="v"></test>')
+
+ def test_attribute_extra(self):
+ with etree.xmlfile(self._file) as xf:
+ with xf.element('test', attrib={'k': 'v'}, n='N'):
+ pass
+ self.assertXml('<test k="v" n="N"></test>')
+
+ def test_attribute_extra_duplicate(self):
+ with etree.xmlfile(self._file) as xf:
+ with xf.element('test', attrib={'k': 'v'}, k='V'):
+ pass
+ self.assertXml('<test k="V"></test>')
+
+ def test_escaping(self):
+ with etree.xmlfile(self._file) as xf:
+ with xf.element('test'):
+ xf.write('Comments: <!-- text -->\n')
+ xf.write('Entities: &amp;')
+ self.assertXml(
+ '<test>Comments: &lt;!-- text --&gt;\nEntities: &amp;amp;</test>')
+
+ def test_encoding(self):
+ with etree.xmlfile(self._file, encoding='utf16') as xf:
+ with xf.element('test'):
+ xf.write('toast')
+ self.assertXml('<test>toast</test>', encoding='utf16')
+
+ def test_buffering(self):
+ with etree.xmlfile(self._file, buffered=False) as xf:
+ with xf.element('test'):
+ self.assertXml("<test>")
+ xf.write('toast')
+ self.assertXml("<test>toast")
+ with xf.element('taste'):
+ self.assertXml("<test>toast<taste>")
+ xf.write('some', etree.Element("more"), "toast")
+ self.assertXml("<test>toast<taste>some<more/>toast")
+ self.assertXml("<test>toast<taste>some<more/>toast</taste>")
+ xf.write('end')
+ self.assertXml("<test>toast<taste>some<more/>toast</taste>end")
+ self.assertXml("<test>toast<taste>some<more/>toast</taste>end</test>")
+ self.assertXml("<test>toast<taste>some<more/>toast</taste>end</test>")
+
+ def test_flush(self):
+ with etree.xmlfile(self._file, buffered=True) as xf:
+ with xf.element('test'):
+ self.assertXml("")
+ xf.write('toast')
+ self.assertXml("")
+ with xf.element('taste'):
+ self.assertXml("")
+ xf.flush()
+ self.assertXml("<test>toast<taste>")
+ self.assertXml("<test>toast<taste>")
+ self.assertXml("<test>toast<taste>")
+ self.assertXml("<test>toast<taste></taste></test>")
+
+ def test_non_io_exception_continues_closing(self):
+ try:
+ with etree.xmlfile(self._file) as xf:
+ with xf.element('root'):
+ with xf.element('test'):
+ xf.write("BEFORE")
+ raise TypeError("FAIL!")
+ xf.write("AFTER")
+ except TypeError as exc:
+ self.assertTrue("FAIL" in str(exc), exc)
+ else:
+ self.assertTrue(False, "exception not propagated")
+ self.assertXml("<root><test>BEFORE</test></root>")
+
+ def test_generator_close_continues_closing(self):
+ def gen():
+ with etree.xmlfile(self._file) as xf:
+ with xf.element('root'):
+ while True:
+ content = (yield)
+ with xf.element('entry'):
+ xf.write(content)
+
+ g = gen()
+ next(g)
+ g.send('A')
+ g.send('B')
+ g.send('C')
+ g.close()
+ self.assertXml("<root><entry>A</entry><entry>B</entry><entry>C</entry></root>")
+
+ def test_failure_preceding_text(self):
+ try:
+ with etree.xmlfile(self._file) as xf:
+ xf.write('toast')
+ except etree.LxmlSyntaxError:
+ self.assertTrue(True)
+ else:
+ self.assertTrue(False)
+
+ def test_failure_trailing_text(self):
+ with etree.xmlfile(self._file) as xf:
+ with xf.element('test'):
+ pass
+ try:
+ xf.write('toast')
+ except etree.LxmlSyntaxError:
+ self.assertTrue(True)
+ else:
+ self.assertTrue(False)
+
+ def test_failure_trailing_Element(self):
+ with etree.xmlfile(self._file) as xf:
+ with xf.element('test'):
+ pass
+ try:
+ xf.write(etree.Element('test'))
+ except etree.LxmlSyntaxError:
+ self.assertTrue(True)
+ else:
+ self.assertTrue(False)
+
+ def test_closing_out_of_order_in_error_case(self):
+ cm_exit = None
+ try:
+ with etree.xmlfile(self._file) as xf:
+ x = xf.element('test')
+ cm_exit = x.__exit__
+ x.__enter__()
+ raise ValueError('123')
+ except ValueError:
+ self.assertTrue(cm_exit)
+ try:
+ cm_exit(ValueError, ValueError("huhu"), None)
+ except etree.LxmlSyntaxError:
+ self.assertTrue(True)
+ else:
+ self.assertTrue(False)
+ else:
+ self.assertTrue(False)
+
+ def _read_file(self):
+ pos = self._file.tell()
+ self._file.seek(0)
+ try:
+ return self._file.read()
+ finally:
+ self._file.seek(pos)
+
+ def _parse_file(self):
+ pos = self._file.tell()
+ self._file.seek(0)
+ try:
+ return etree.parse(self._file)
+ finally:
+ self._file.seek(pos)
+
+ def tearDown(self):
+ if self._file is not None:
+ self._file.close()
+
+ def assertXml(self, expected, encoding='utf8'):
+ self.assertEqual(self._read_file().decode(encoding), expected)
+
+
+class BytesIOXmlFileTestCase(_XmlFileTestCaseBase):
+ def setUp(self):
+ self._file = BytesIO()
+
+ def test_filelike_close(self):
+ with etree.xmlfile(self._file, close=True) as xf:
+ with xf.element('test'):
+ pass
+ self.assertRaises(ValueError, self._file.getvalue)
+
+
+class TempXmlFileTestCase(_XmlFileTestCaseBase):
+ def setUp(self):
+ self._file = tempfile.TemporaryFile()
+
+
+@skipIf(sys.platform.startswith("win"), "Can't reopen temporary files on Windows")
+class TempPathXmlFileTestCase(_XmlFileTestCaseBase):
+ def setUp(self):
+ self._tmpfile = tempfile.NamedTemporaryFile()
+ self._file = self._tmpfile.name
+
+ def tearDown(self):
+ try:
+ self._tmpfile.close()
+ finally:
+ if os.path.exists(self._tmpfile.name):
+ os.unlink(self._tmpfile.name)
+
+ def _read_file(self):
+ self._tmpfile.seek(0)
+ return self._tmpfile.read()
+
+ def _parse_file(self):
+ self._tmpfile.seek(0)
+ return etree.parse(self._tmpfile)
+
+ @skipIf(True, "temp file behaviour is too platform specific here")
+ def test_buffering(self):
+ pass
+
+ @skipIf(True, "temp file behaviour is too platform specific here")
+ def test_flush(self):
+ pass
+
+
+class SimpleFileLikeXmlFileTestCase(_XmlFileTestCaseBase):
+ class SimpleFileLike(object):
+ def __init__(self, target):
+ self._target = target
+ self.write = target.write
+ self.tell = target.tell
+ self.seek = target.seek
+ self.closed = False
+
+ def close(self):
+ assert not self.closed
+ self.closed = True
+ self._target.close()
+
+ def setUp(self):
+ self._target = BytesIO()
+ self._file = self.SimpleFileLike(self._target)
+
+ def _read_file(self):
+ return self._target.getvalue()
+
+ def _parse_file(self):
+ pos = self._file.tell()
+ self._target.seek(0)
+ try:
+ return etree.parse(self._target)
+ finally:
+ self._target.seek(pos)
+
+ def test_filelike_not_closing(self):
+ with etree.xmlfile(self._file) as xf:
+ with xf.element('test'):
+ pass
+ self.assertFalse(self._file.closed)
+
+ def test_filelike_close(self):
+ with etree.xmlfile(self._file, close=True) as xf:
+ with xf.element('test'):
+ pass
+ self.assertTrue(self._file.closed)
+ self._file = None # prevent closing in tearDown()
+
+ def test_write_fails(self):
+ class WriteError(Exception):
+ pass
+
+ class Writer(object):
+ def __init__(self, trigger):
+ self._trigger = trigger
+ self._failed = False
+
+ def write(self, data):
+ assert not self._failed, "write() called again after failure"
+ if self._trigger in data:
+ self._failed = True
+ raise WriteError("FAILED: " + self._trigger.decode('utf8'))
+
+ for trigger in ['text', 'root', 'tag', 'noflush']:
+ try:
+ with etree.xmlfile(Writer(trigger.encode('utf8')), encoding='utf8') as xf:
+ with xf.element('root'):
+ xf.flush()
+ with xf.element('tag'):
+ xf.write('text')
+ xf.flush()
+ xf.write('noflush')
+ xf.flush()
+ xf.flush()
+ except WriteError as exc:
+ self.assertTrue('FAILED: ' + trigger in str(exc))
+ else:
+ self.assertTrue(False, "exception not raised for '%s'" % trigger)
+
+
+class HtmlFileTestCase(_XmlFileTestCaseBase):
+ def setUp(self):
+ self._file = BytesIO()
+
+ def test_void_elements(self):
+ # http://www.w3.org/TR/html5/syntax.html#elements-0
+ void_elements = {
+ "area", "base", "br", "col", "embed", "hr", "img", "input",
+ "keygen", "link", "meta", "param", "source", "track", "wbr"}
+
+ # FIXME: These don't get serialized as void elements.
+ void_elements.difference_update([
+ 'area', 'embed', 'keygen', 'source', 'track', 'wbr'
+ ])
+
+ for tag in sorted(void_elements):
+ with etree.htmlfile(self._file) as xf:
+ xf.write(etree.Element(tag))
+ self.assertXml('<%s>' % tag)
+ self._file = BytesIO()
+
+ def test_method_context_manager_misuse(self):
+ with etree.htmlfile(self._file) as xf:
+ with xf.element('foo'):
+ cm = xf.method('xml')
+ cm.__enter__()
+
+ self.assertRaises(LxmlSyntaxError, cm.__enter__)
+
+ cm2 = xf.method('xml')
+ cm2.__enter__()
+ cm2.__exit__(None, None, None)
+
+ self.assertRaises(LxmlSyntaxError, cm2.__exit__, None, None, None)
+
+ cm3 = xf.method('xml')
+ cm3.__enter__()
+ with xf.method('html'):
+ self.assertRaises(LxmlSyntaxError, cm3.__exit__, None, None, None)
+
+ def test_xml_mode_write_inside_html(self):
+ tag = 'foo'
+ attrib = {'selected': 'bar'}
+ elt = etree.Element(tag, attrib=attrib)
+
+ with etree.htmlfile(self._file) as xf:
+ with xf.element("root"):
+ xf.write(elt) # 1
+
+ assert elt.text is None
+ xf.write(elt, method='xml') # 2
+
+ elt.text = ""
+ xf.write(elt, method='xml') # 3
+
+ with xf.element(tag, attrib=attrib, method='xml'):
+ pass # 4
+
+ xf.write(elt) # 5
+
+ with xf.method('xml'):
+ xf.write(elt) # 6
+
+ self.assertXml(
+ '<root>'
+ '<foo selected></foo>' # 1
+ '<foo selected="bar"/>' # 2
+ '<foo selected="bar"></foo>' # 3
+ '<foo selected="bar"></foo>' # 4
+ '<foo selected></foo>' # 5
+ '<foo selected="bar"></foo>' # 6
+ '</root>')
+ self._file = BytesIO()
+
+ def test_xml_mode_element_inside_html(self):
+ # The htmlfile already outputs in xml mode for .element calls. This
+ # test actually illustrates a bug
+
+ with etree.htmlfile(self._file) as xf:
+ with xf.element("root"):
+ with xf.element('foo', attrib={'selected': 'bar'}):
+ pass
+
+ self.assertXml(
+ '<root>'
+ # '<foo selected></foo>' # FIXME: this is the correct output
+ # in html mode
+ '<foo selected="bar"></foo>'
+ '</root>')
+ self._file = BytesIO()
+
+ def test_attribute_quoting(self):
+ with etree.htmlfile(self._file) as xf:
+ with xf.element("tagname", attrib={"attr": '"misquoted"'}):
+ xf.write("foo")
+
+ self.assertXml('<tagname attr="&quot;misquoted&quot;">foo</tagname>')
+
+ def test_attribute_quoting_unicode(self):
+ with etree.htmlfile(self._file) as xf:
+ with xf.element("tagname", attrib={"attr": _str('"misquöted\\u3344\\U00013344"')}):
+ xf.write("foo")
+
+ self.assertXml('<tagname attr="&quot;misqu&#xF6;ted&#x3344;&#x13344;&quot;">foo</tagname>')
+
+ def test_unescaped_script(self):
+ with etree.htmlfile(self._file) as xf:
+ elt = etree.Element('script')
+ elt.text = "if (a < b);"
+ xf.write(elt)
+ self.assertXml('<script>if (a < b);</script>')
+
+ def test_unescaped_script_incremental(self):
+ with etree.htmlfile(self._file) as xf:
+ with xf.element('script'):
+ xf.write("if (a < b);")
+
+ self.assertXml('<script>if (a < b);</script>')
+
+ def test_write_declaration(self):
+ with etree.htmlfile(self._file) as xf:
+ try:
+ xf.write_declaration()
+ except etree.LxmlSyntaxError:
+ self.assertTrue(True)
+ else:
+ self.assertTrue(False)
+ xf.write(etree.Element('html'))
+
+ def test_write_namespaced_element(self):
+ with etree.htmlfile(self._file) as xf:
+ xf.write(etree.Element('{some_ns}some_tag'))
+ self.assertXml('<ns0:some_tag xmlns:ns0="some_ns"></ns0:some_tag>')
+
+ def test_open_namespaced_element(self):
+ with etree.htmlfile(self._file) as xf:
+ with xf.element("{some_ns}some_tag"):
+ pass
+ self.assertXml('<ns0:some_tag xmlns:ns0="some_ns"></ns0:some_tag>')
+
+
+class AsyncXmlFileTestCase(HelperTestCase):
+ def test_async_api(self):
+ out = io.BytesIO()
+ xf = etree.xmlfile(out)
+ scm = xf.__enter__()
+ acm = xf.__aenter__()
+ list(acm.__await__()) # fake await to avoid destructor warning
+
+ def api_of(obj):
+ return sorted(name for name in dir(obj) if not name.startswith('__'))
+
+ a_api = api_of(acm)
+
+ self.assertEqual(api_of(scm), api_of(acm))
+ self.assertTrue('write' in a_api)
+ self.assertTrue('element' in a_api)
+ self.assertTrue('method' in a_api)
+ self.assertTrue(len(a_api) > 5)
+
+ def _run_async(self, coro):
+ while True:
+ try:
+ coro.send(None)
+ except StopIteration as ex:
+ return ex.value
+
+ @skipIf(sys.version_info < (3, 5), "requires support for async-def (Py3.5+)")
+ def test_async(self):
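+ # The async/await test body is kept in a string and exec'd below so that
+ # this module still imports cleanly on Python versions without async-def
+ # syntax; the skipIf decorator above keeps the test from running there.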
+ code = textwrap.dedent("""\
+ async def test_async_xmlfile(close=True, buffered=True):
+ class Writer(object):
+ def __init__(self):
+ self._data = []
+ self._all_data = None
+ self._calls = 0
+
+ async def write(self, data):
+ self._calls += 1
+ self._data.append(data)
+
+ async def close(self):
+ assert self._all_data is None
+ assert self._data is not None
+ self._all_data = b''.join(self._data)
+ self._data = None # make writing fail afterwards
+
+ async def generate(out, close=True, buffered=True):
+ async with etree.xmlfile(out, close=close, buffered=buffered) as xf:
+ async with xf.element('root'):
+ await xf.write('root-text')
+ async with xf.method('html'):
+ await xf.write(etree.Element('img', src='http://huhu.org/'))
+ await xf.flush()
+ for i in range(3):
+ async with xf.element('el'):
+ await xf.write('text-%d' % i)
+
+ out = Writer()
+ await generate(out, close=close, buffered=buffered)
+ if not close:
+ await out.close()
+ assert out._data is None, out._data
+ return out._all_data, out._calls
+ """)
+ lns = {}
+ exec(code, globals(), lns)
+ test_async_xmlfile = lns['test_async_xmlfile']
+
+ expected = (
+ b'<root>root-text<img src="http://huhu.org/">'
+ b'<el>text-0</el><el>text-1</el><el>text-2</el></root>'
+ )
+
+ data, calls = self._run_async(test_async_xmlfile(close=True))
+ self.assertEqual(expected, data)
+ self.assertEqual(2, calls) # only flush() and close()
+
+ data, calls = self._run_async(test_async_xmlfile(close=False))
+ self.assertEqual(expected, data)
+ self.assertEqual(2, calls) # only flush() and close()
+
+ data, unbuffered_calls = self._run_async(test_async_xmlfile(buffered=False))
+ self.assertEqual(expected, data)
+ self.assertTrue(unbuffered_calls > calls, unbuffered_calls)
+
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([
+ unittest.makeSuite(BytesIOXmlFileTestCase),
+ unittest.makeSuite(TempXmlFileTestCase),
+ unittest.makeSuite(TempPathXmlFileTestCase),
+ unittest.makeSuite(SimpleFileLikeXmlFileTestCase),
+ unittest.makeSuite(HtmlFileTestCase),
+ unittest.makeSuite(AsyncXmlFileTestCase),
+ ])
+ return suite
+
+
+if __name__ == '__main__':
+ print('to test use test.py %s' % __file__)
diff --git a/src/lxml/tests/test_io.py b/src/lxml/tests/test_io.py
new file mode 100644
index 0000000..cbdbcef
--- /dev/null
+++ b/src/lxml/tests/test_io.py
@@ -0,0 +1,373 @@
+# -*- coding: utf-8 -*-
+
+"""
+IO test cases that apply to both etree and ElementTree
+"""
+
+from __future__ import absolute_import
+
+import unittest
+import tempfile, gzip, os, os.path, gc, shutil
+
+from .common_imports import (
+ etree, ElementTree, _str, _bytes,
+ SillyFileLike, LargeFileLike, HelperTestCase,
+ read_file, write_to_file, BytesIO, tmpfile
+)
+
+
+class _IOTestCaseBase(HelperTestCase):
+ """(c)ElementTree compatibility for IO functions/methods
+ """
+ etree = None
+
+ def setUp(self):
+ """Setting up a minimal tree
+ """
+ self.root = self.etree.Element('a')
+ self.root_str = self.etree.tostring(self.root)
+ self.tree = self.etree.ElementTree(self.root)
+ self._temp_dir = tempfile.mkdtemp()
+
+ def tearDown(self):
+ gc.collect()
+ shutil.rmtree(self._temp_dir)
+
+ def getTestFilePath(self, name):
+ return os.path.join(self._temp_dir, name)
+
+ def buildNodes(self, element, children, depth):
+ Element = self.etree.Element
+
+ if depth == 0:
+ return
+ for i in range(children):
+ new_element = Element('element_%s_%s' % (depth, i))
+ self.buildNodes(new_element, children, depth - 1)
+ element.append(new_element)
+
+ def test_tree_io(self):
+ Element = self.etree.Element
+ ElementTree = self.etree.ElementTree
+
+ element = Element('top')
+ element.text = _str("qwrtioüöä\uAABB")
+ tree = ElementTree(element)
+ self.buildNodes(element, 10, 3)
+ with open(self.getTestFilePath('testdump.xml'), 'wb') as f:
+ tree.write(f, encoding='UTF-8')
+ with open(self.getTestFilePath('testdump.xml'), 'rb') as f:
+ tree = ElementTree(file=f)
+ with open(self.getTestFilePath('testdump2.xml'), 'wb') as f:
+ tree.write(f, encoding='UTF-8')
+ with open(self.getTestFilePath('testdump.xml'), 'rb') as f:
+ data1 = f.read()
+ with open(self.getTestFilePath('testdump2.xml'), 'rb') as f:
+ data2 = f.read()
+ self.assertEqual(data1, data2)
+
+ def test_tree_io_latin1(self):
+ Element = self.etree.Element
+ ElementTree = self.etree.ElementTree
+
+ element = Element('top')
+ element.text = _str("qwrtioüöäßá")
+ tree = ElementTree(element)
+ self.buildNodes(element, 10, 3)
+ with open(self.getTestFilePath('testdump.xml'), 'wb') as f:
+ tree.write(f, encoding='iso-8859-1')
+ with open(self.getTestFilePath('testdump.xml'), 'rb') as f:
+ tree = ElementTree(file=f)
+ with open(self.getTestFilePath('testdump2.xml'), 'wb') as f:
+ tree.write(f, encoding='iso-8859-1')
+ with open(self.getTestFilePath('testdump.xml'), 'rb') as f:
+ data1 = f.read()
+ with open(self.getTestFilePath('testdump2.xml'), 'rb') as f:
+ data2 = f.read()
+ self.assertEqual(data1, data2)
+
+ def test_write_filename(self):
+ # (c)ElementTree supports filename strings as write argument
+ with tmpfile(prefix="p", suffix=".xml") as filename:
+ self.tree.write(filename)
+ self.assertEqual(read_file(filename, 'rb').replace(b'\n', b''),
+ self.root_str)
+
+ def test_write_filename_special_percent(self):
+ # '%20' is a URL escaped space character.
+ before_test = os.listdir(tempfile.gettempdir())
+
+ def difference(filenames):
+ return sorted(
+ fn for fn in set(filenames).difference(before_test)
+ if fn.startswith('lxmltmp-')
+ )
+
+ with tmpfile(prefix="lxmltmp-p%20p", suffix=".xml") as filename:
+ try:
+ before_write = os.listdir(tempfile.gettempdir())
+ self.tree.write(filename)
+ after_write = os.listdir(tempfile.gettempdir())
+ self.assertEqual(read_file(filename, 'rb').replace(b'\n', b''),
+ self.root_str)
+ except (AssertionError, IOError, OSError):
+ print("Before write: %s, after write: %s" % (
+ difference(before_write), difference(after_write))
+ )
+ raise
+
+ def test_write_filename_special_plus(self):
+ # '+' is used as an escaped space character in URLs.
+ with tmpfile(prefix="p+", suffix=".xml") as filename:
+ self.tree.write(filename)
+ self.assertEqual(read_file(filename, 'rb').replace(b'\n', b''),
+ self.root_str)
+
+ def test_write_invalid_filename(self):
+ filename = os.path.join(
+ os.path.join('hopefullynonexistingpathname'),
+ 'invalid_file.xml')
+ try:
+ self.tree.write(filename)
+ except IOError:
+ pass
+ else:
+ self.assertTrue(
+ False, "writing to an invalid file path should fail")
+
+ def test_module_parse_gzipobject(self):
+ # (c)ElementTree supports gzip instance as parse argument
+ with tmpfile(suffix=".xml.gz") as filename:
+ with gzip.open(filename, 'wb') as f:
+ f.write(self.root_str)
+ with gzip.open(filename, 'rb') as f_gz:
+ tree = self.etree.parse(f_gz)
+ self.assertEqual(self.etree.tostring(tree.getroot()), self.root_str)
+
+ def test_class_parse_filename(self):
+ # (c)ElementTree class ElementTree has a 'parse' method that returns
+ # the root of the tree
+
+ # parse from filename
+ with tmpfile(suffix=".xml") as filename:
+ write_to_file(filename, self.root_str, 'wb')
+ tree = self.etree.ElementTree()
+ root = tree.parse(filename)
+ self.assertEqual(self.etree.tostring(root), self.root_str)
+
+ def test_class_parse_filename_remove_previous(self):
+ with tmpfile(suffix=".xml") as filename:
+ write_to_file(filename, self.root_str, 'wb')
+ tree = self.etree.ElementTree()
+ root = tree.parse(filename)
+ # and now do it again; previous content should still be there
+ root2 = tree.parse(filename)
+ self.assertEqual('a', root.tag)
+ self.assertEqual('a', root2.tag)
+ # now remove all references to root2, and parse again
+ del root2
+ root3 = tree.parse(filename)
+ self.assertEqual('a', root.tag)
+ self.assertEqual('a', root3.tag)
+ # root2's memory should've been freed here
+ # XXX how to check?
+
+ def test_class_parse_fileobject(self):
+ # (c)ElementTree class ElementTree has a 'parse' method that returns
+ # the root of the tree
+
+ # parse from file object
+ handle, filename = tempfile.mkstemp(suffix=".xml")
+ try:
+ os.write(handle, self.root_str)
+ with open(filename, 'rb') as f:
+ tree = self.etree.ElementTree()
+ root = tree.parse(f)
+ self.assertEqual(self.etree.tostring(root), self.root_str)
+ finally:
+ os.close(handle)
+ os.remove(filename)
+
+ def test_class_parse_unamed_fileobject(self):
+ # (c)ElementTree class ElementTree has a 'parse' method that returns
+ # the root of the tree
+
+ # parse from unnamed file object
+ f = SillyFileLike()
+ root = self.etree.ElementTree().parse(f)
+ self.assertTrue(root.tag.endswith('foo'))
+
+ def test_module_parse_large_fileobject(self):
+ # parse from unnamed file object
+ f = LargeFileLike()
+ tree = self.etree.parse(f)
+ root = tree.getroot()
+ self.assertTrue(root.tag.endswith('root'))
+
+ def test_module_parse_fileobject_error(self):
+ class LocalError(Exception):
+ pass
+ class TestFile:
+ def read(*args):
+ raise LocalError
+ f = TestFile()
+ self.assertRaises(LocalError, self.etree.parse, f)
+
+ def test_module_parse_fileobject_late_error(self):
+ class LocalError(Exception):
+ pass
+ class TestFile:
+ data = '<root>test</'
+ try:
+ next_char = iter(data).next
+ except AttributeError:
+ # Python 3
+ next_char = iter(data).__next__
+ counter = 0
+ def read(self, amount=None):
+ if amount is None:
+ while True:
+ self.read(1)
+ else:
+ try:
+ self.counter += 1
+ return _bytes(self.next_char())
+ except StopIteration:
+ raise LocalError
+ f = TestFile()
+ self.assertRaises(LocalError, self.etree.parse, f)
+ self.assertEqual(f.counter, len(f.data)+1)
+
+ def test_module_parse_fileobject_type_error(self):
+ class TestFile:
+ def read(*args):
+ return 1
+ f = TestFile()
+
+ try:
+ expect_exc = (TypeError, self.etree.ParseError)
+ except AttributeError:
+ expect_exc = TypeError
+ self.assertRaises(expect_exc, self.etree.parse, f)
+
+ def test_etree_parse_io_error(self):
+ # this is a directory name that contains characters beyond latin-1
+ dirnameEN = _str('Directory')
+ dirnameRU = _str('Каталог')
+ filename = _str('nosuchfile.xml')
+ dn = tempfile.mkdtemp(prefix=dirnameEN)
+ try:
+ self.assertRaises(IOError, self.etree.parse, os.path.join(dn, filename))
+ finally:
+ os.rmdir(dn)
+ dn = tempfile.mkdtemp(prefix=dirnameRU)
+ try:
+ self.assertRaises(IOError, self.etree.parse, os.path.join(dn, filename))
+ finally:
+ os.rmdir(dn)
+
+ def test_parse_utf8_bom(self):
+ utext = _str('Søk på nettet')
+ uxml = '<?xml version="1.0" encoding="UTF-8"?><p>%s</p>' % utext
+ bom = _bytes('\\xEF\\xBB\\xBF').decode(
+ "unicode_escape").encode("latin1")
+ self.assertEqual(3, len(bom))
+ f = tempfile.NamedTemporaryFile(delete=False)
+ try:
+ try:
+ f.write(bom)
+ f.write(uxml.encode("utf-8"))
+ finally:
+ f.close()
+ tree = self.etree.parse(f.name)
+ finally:
+ os.unlink(f.name)
+ self.assertEqual(utext, tree.getroot().text)
+
+ def test_iterparse_utf8_bom(self):
+ utext = _str('Søk på nettet')
+ uxml = '<?xml version="1.0" encoding="UTF-8"?><p>%s</p>' % utext
+ bom = _bytes('\\xEF\\xBB\\xBF').decode(
+ "unicode_escape").encode("latin1")
+ self.assertEqual(3, len(bom))
+ f = tempfile.NamedTemporaryFile(delete=False)
+ try:
+ try:
+ f.write(bom)
+ f.write(uxml.encode("utf-8"))
+ finally:
+ f.close()
+ elements = [el for _, el in self.etree.iterparse(f.name)]
+ self.assertEqual(1, len(elements))
+ root = elements[0]
+ finally:
+ os.unlink(f.name)
+ self.assertEqual(utext, root.text)
+
+ def test_iterparse_utf16_bom(self):
+ utext = _str('Søk på nettet')
+ uxml = '<?xml version="1.0" encoding="UTF-16"?><p>%s</p>' % utext
+ boms = _bytes('\\xFE\\xFF \\xFF\\xFE').decode(
+ "unicode_escape").encode("latin1")
+ self.assertEqual(5, len(boms))
+ xml = uxml.encode("utf-16")
+ self.assertTrue(xml[:2] in boms, repr(xml[:2]))
+
+ f = tempfile.NamedTemporaryFile(delete=False)
+ try:
+ try:
+ f.write(xml)
+ finally:
+ f.close()
+ elements = [el for _, el in self.etree.iterparse(f.name)]
+ self.assertEqual(1, len(elements))
+ root = elements[0]
+ finally:
+ os.unlink(f.name)
+ self.assertEqual(utext, root.text)
+
+
+class ETreeIOTestCase(_IOTestCaseBase):
+ etree = etree
+
+ def test_write_compressed_text(self):
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ ElementTree = self.etree.ElementTree
+ text = _str("qwrtioüöä")
+
+ root = Element('root')
+ root.text = text
+ child = SubElement(root, 'sub')
+ child.text = 'TEXT'
+ child.tail = 'TAIL'
+ SubElement(root, 'sub').text = text
+
+ tree = ElementTree(root)
+ out = BytesIO()
+ tree.write(out, method='text', encoding='utf8', compression=9)
+ out.seek(0)
+
+ f = gzip.GzipFile(fileobj=out)
+ try:
+ result = f.read().decode('utf8')
+ finally:
+ f.close()
+ self.assertEqual(text+'TEXTTAIL'+text, result)
+
+
+if ElementTree:
+ class ElementTreeIOTestCase(_IOTestCaseBase):
+ etree = ElementTree
+
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([unittest.makeSuite(ETreeIOTestCase)])
+ if ElementTree:
+ suite.addTests([unittest.makeSuite(ElementTreeIOTestCase)])
+ return suite
+
+
+if __name__ == '__main__':
+ print('to test use test.py %s' % __file__)
diff --git a/src/lxml/tests/test_isoschematron.py b/src/lxml/tests/test_isoschematron.py
new file mode 100644
index 0000000..6d2aa3f
--- /dev/null
+++ b/src/lxml/tests/test_isoschematron.py
@@ -0,0 +1,870 @@
+# -*- coding: utf-8 -*-
+
+"""
+Test cases related to ISO-Schematron parsing and validation
+"""
+
+from __future__ import absolute_import
+
+import unittest
+from lxml import isoschematron
+
+from .common_imports import etree, HelperTestCase, fileInTestDir, doctest, make_doctest
+
+
+class ETreeISOSchematronTestCase(HelperTestCase):
+ def test_schematron(self):
+ tree_valid = self.parse('<AAA><BBB/><CCC/></AAA>')
+ tree_invalid = self.parse('<AAA><BBB/><CCC/><DDD/></AAA>')
+ schema = self.parse('''\
+<schema xmlns="http://purl.oclc.org/dsdl/schematron" >
+ <pattern id="OpenModel">
+ <title>Open Model</title>
+ <rule context="AAA">
+ <assert test="BBB"> BBB element is not present</assert>
+ <assert test="CCC"> CCC element is not present</assert>
+ </rule>
+ </pattern>
+ <pattern id="ClosedModel">
+ <title>Closed model"</title>
+ <rule context="AAA">
+ <assert test="BBB"> BBB element is not present</assert>
+ <assert test="CCC"> CCC element is not present</assert>
+ <assert test="count(BBB|CCC) = count (*)">There is an extra element</assert>
+ </rule>
+ </pattern>
+</schema>
+''')
+
+ schema = isoschematron.Schematron(schema)
+ self.assertTrue(schema.validate(tree_valid))
+ self.assertTrue(not schema.validate(tree_invalid))
+
+ def test_schematron_elementtree_error(self):
+ self.assertRaises(ValueError, isoschematron.Schematron, etree.ElementTree())
+
+ # an empty pattern is valid in iso schematron
+ def test_schematron_empty_pattern(self):
+ schema = self.parse('''\
+<schema xmlns="http://purl.oclc.org/dsdl/schematron" >
+ <pattern id="OpenModel">
+ <title>Open model</title>
+ </pattern>
+</schema>
+''')
+ schema = isoschematron.Schematron(schema)
+ self.assertTrue(schema)
+
+ def test_schematron_invalid_schema_empty(self):
+ schema = self.parse('''\
+<schema xmlns="http://purl.oclc.org/dsdl/schematron" />
+''')
+ self.assertRaises(etree.SchematronParseError,
+ isoschematron.Schematron, schema)
+
+ def test_schematron_invalid_schema_namespace(self):
+ schema = self.parse('''\
+<schema xmlns="mynamespace" />
+''')
+ self.assertRaises(etree.SchematronParseError,
+ isoschematron.Schematron, schema)
+
+ def test_schematron_from_tree(self):
+ schema = self.parse('''\
+<sch:schema xmlns:sch="http://purl.oclc.org/dsdl/schematron">
+ <sch:pattern id="number_of_entries">
+ <sch:title>mandatory number_of_entries tests</sch:title>
+ <sch:rule context="number_of_entries">
+ <sch:assert test="text()=count(../entries/entry)">[ERROR] number_of_entries (<sch:value-of select="."/>) must equal the number of entries/entry elements (<sch:value-of select="count(../entries/entry)"/>)</sch:assert>
+ </sch:rule>
+ </sch:pattern>
+</sch:schema>
+''')
+ schematron = isoschematron.Schematron(schema)
+ self.assertTrue(isinstance(schematron, isoschematron.Schematron))
+
+ def test_schematron_from_element(self):
+ schema = self.parse('''\
+<sch:schema xmlns:sch="http://purl.oclc.org/dsdl/schematron">
+ <sch:pattern id="number_of_entries">
+ <sch:title>mandatory number_of_entries tests</sch:title>
+ <sch:rule context="number_of_entries">
+ <sch:assert test="text()=count(../entries/entry)">[ERROR] number_of_entries (<sch:value-of select="."/>) must equal the number of entries/entry elements (<sch:value-of select="count(../entries/entry)"/>)</sch:assert>
+ </sch:rule>
+ </sch:pattern>
+</sch:schema>
+''')
+ schematron = isoschematron.Schematron(schema.getroot())
+ self.assertTrue(isinstance(schematron, isoschematron.Schematron))
+
+ def test_schematron_from_file(self):
+ schematron = isoschematron.Schematron(file=fileInTestDir('test.sch'))
+ self.assertTrue(isinstance(schematron, isoschematron.Schematron))
+
+ def test_schematron_call(self):
+ schema = self.parse('''\
+<sch:schema xmlns:sch="http://purl.oclc.org/dsdl/schematron">
+ <sch:pattern id="number_of_entries">
+ <sch:title>mandatory number_of_entries tests</sch:title>
+ <sch:rule context="number_of_entries">
+ <sch:assert test="text()=count(../entries/entry)">[ERROR] number_of_entries (<sch:value-of select="."/>) must equal the number of entries/entry elements (<sch:value-of select="count(../entries/entry)"/>)</sch:assert>
+ </sch:rule>
+ </sch:pattern>
+</sch:schema>
+''')
+ tree_valid = self.parse('''\
+<message>
+ <number_of_entries>0</number_of_entries>
+ <entries>
+ </entries>
+</message>
+''')
+ tree_invalid = self.parse('''\
+<message>
+ <number_of_entries>3</number_of_entries>
+ <entries>
+ <entry>Entry 1</entry>
+ <entry>Entry 2</entry>
+ </entries>
+</message>
+''')
+ schematron = isoschematron.Schematron(schema)
+ self.assertTrue(schematron(tree_valid), schematron.error_log)
+ valid = schematron(tree_invalid)
+ self.assertTrue(not valid)
+
+ def test_schematron_validate(self):
+ schema = self.parse('''\
+<sch:schema xmlns:sch="http://purl.oclc.org/dsdl/schematron">
+ <sch:pattern id="number_of_entries">
+ <sch:title>mandatory number_of_entries tests</sch:title>
+ <sch:rule context="number_of_entries">
+ <sch:assert test="text()=count(../entries/entry)">[ERROR] number_of_entries (<sch:value-of select="."/>) must equal the number of entries/entry elements (<sch:value-of select="count(../entries/entry)"/>)</sch:assert>
+ </sch:rule>
+ </sch:pattern>
+</sch:schema>
+''')
+ tree_valid = self.parse('''\
+<message>
+ <number_of_entries>0</number_of_entries>
+ <entries>
+ </entries>
+</message>
+''')
+ tree_invalid = self.parse('''\
+<message>
+ <number_of_entries>3</number_of_entries>
+ <entries>
+ <entry>Entry 1</entry>
+ <entry>Entry 2</entry>
+ </entries>
+</message>
+''')
+ schematron = isoschematron.Schematron(schema)
+ self.assertTrue(schematron.validate(tree_valid), schematron.error_log)
+ valid = schematron.validate(tree_invalid)
+ self.assertTrue(not valid)
+
+ def test_schematron_assertValid(self):
+ schema = self.parse('''\
+<sch:schema xmlns:sch="http://purl.oclc.org/dsdl/schematron">
+ <sch:pattern id="number_of_entries">
+ <sch:title>mandatory number_of_entries tests</sch:title>
+ <sch:rule context="number_of_entries">
+ <sch:assert test="text()=count(../entries/entry)">[ERROR] number_of_entries (<sch:value-of select="."/>) must equal the number of entries/entry elements (<sch:value-of select="count(../entries/entry)"/>)</sch:assert>
+ </sch:rule>
+ </sch:pattern>
+</sch:schema>
+''')
+ tree_valid = self.parse('''\
+<message>
+ <number_of_entries>0</number_of_entries>
+ <entries>
+ </entries>
+</message>
+''')
+ tree_invalid = self.parse('''\
+<message>
+ <number_of_entries>3</number_of_entries>
+ <entries>
+ <entry>Entry 1</entry>
+ <entry>Entry 2</entry>
+ </entries>
+</message>
+''')
+ schematron = isoschematron.Schematron(schema)
+ self.assertTrue(schematron(tree_valid), schematron.error_log)
+ self.assertRaises(etree.DocumentInvalid, schematron.assertValid,
+ tree_invalid)
+
+ def test_schematron_error_log(self):
+ schema = self.parse('''\
+<sch:schema xmlns:sch="http://purl.oclc.org/dsdl/schematron">
+ <sch:pattern id="number_of_entries">
+ <sch:title>mandatory number_of_entries tests</sch:title>
+ <sch:rule context="number_of_entries">
+ <sch:assert test="text()=count(../entries/entry)">[ERROR] number_of_entries (<sch:value-of select="."/>) must equal the number of entries/entry elements (<sch:value-of select="count(../entries/entry)"/>)</sch:assert>
+ </sch:rule>
+ </sch:pattern>
+</sch:schema>
+''')
+ tree_valid = self.parse('''\
+<message>
+ <number_of_entries>0</number_of_entries>
+ <entries>
+ </entries>
+</message>
+''')
+ tree_invalid = self.parse('''\
+<message>
+ <number_of_entries>3</number_of_entries>
+ <entries>
+ <entry>Entry 1</entry>
+ <entry>Entry 2</entry>
+ </entries>
+</message>
+''')
+ schematron = isoschematron.Schematron(schema)
+ self.assertTrue(schematron(tree_valid), schematron.error_log)
+ valid = schematron(tree_invalid)
+ self.assertTrue(not valid)
+ self.assertEqual(len(schematron.error_log), 1,
+ 'expected single error: %s (%s errors)' %
+ (schematron.error_log, len(schematron.error_log)))
+
+ def test_schematron_result_report(self):
+ schema = self.parse('''\
+<sch:schema xmlns:sch="http://purl.oclc.org/dsdl/schematron">
+ <sch:pattern id="number_of_entries">
+ <sch:title>mandatory number_of_entries tests</sch:title>
+ <sch:rule context="number_of_entries">
+ <sch:assert test="text()=count(../entries/entry)">[ERROR] number_of_entries (<sch:value-of select="."/>) must equal the number of entries/entry elements (<sch:value-of select="count(../entries/entry)"/>)</sch:assert>
+ </sch:rule>
+ </sch:pattern>
+</sch:schema>
+''')
+ tree_valid = self.parse('''\
+<message>
+ <number_of_entries>0</number_of_entries>
+ <entries>
+ </entries>
+</message>
+''')
+ tree_invalid = self.parse('''\
+<message>
+ <number_of_entries>3</number_of_entries>
+ <entries>
+ <entry>Entry 1</entry>
+ <entry>Entry 2</entry>
+ </entries>
+</message>
+''')
+ schematron = isoschematron.Schematron(schema, store_report=True)
+ self.assertTrue(schematron(tree_valid), schematron.error_log)
+ valid = schematron(tree_invalid)
+ self.assertTrue(not valid)
+ self.assertTrue(
+ isinstance(schematron.validation_report, etree._ElementTree),
+ 'expected a validation report result tree, got: %s' % schematron.validation_report)
+
+ schematron = isoschematron.Schematron(schema, store_report=False)
+ self.assertTrue(schematron(tree_valid), schematron.error_log)
+ valid = schematron(tree_invalid)
+ self.assertTrue(not valid)
+ self.assertTrue(schematron.validation_report is None,
+ 'validation reporting switched off, still: %s' % schematron.validation_report)
+
+ def test_schematron_store_schematron(self):
+ schema = self.parse('''\
+<sch:schema xmlns:sch="http://purl.oclc.org/dsdl/schematron">
+ <sch:pattern id="number_of_entries">
+ <sch:title>mandatory number_of_entries tests</sch:title>
+ <sch:rule context="number_of_entries">
+ <sch:assert test="text()=count(../entries/entry)">[ERROR] number_of_entries (<sch:value-of select="."/>) must equal the number of entries/entry elements (<sch:value-of select="count(../entries/entry)"/>)</sch:assert>
+ </sch:rule>
+ </sch:pattern>
+</sch:schema>
+''')
+ schematron = isoschematron.Schematron(schema)
+ self.assertTrue(schematron.validator_xslt is None)
+
+ schematron = isoschematron.Schematron(schema, store_schematron=True)
+ self.assertTrue(isinstance(schematron.schematron, etree._ElementTree),
+ 'expected schematron schema to be stored')
+
+ def test_schematron_store_xslt(self):
+ schema = self.parse('''\
+<sch:schema xmlns:sch="http://purl.oclc.org/dsdl/schematron">
+ <sch:pattern id="number_of_entries">
+ <sch:title>mandatory number_of_entries tests</sch:title>
+ <sch:rule context="number_of_entries">
+ <sch:assert test="text()=count(../entries/entry)">[ERROR] number_of_entries (<sch:value-of select="."/>) must equal the number of entries/entry elements (<sch:value-of select="count(../entries/entry)"/>)</sch:assert>
+ </sch:rule>
+ </sch:pattern>
+</sch:schema>
+''')
+ schematron = isoschematron.Schematron(schema)
+ self.assertTrue(schematron.validator_xslt is None)
+
+ schematron = isoschematron.Schematron(schema, store_xslt=True)
+ self.assertTrue(isinstance(schematron.validator_xslt, etree._ElementTree),
+ 'expected validator xslt to be stored')
+
+ def test_schematron_abstract(self):
+ schema = self.parse('''\
+<sch:schema xmlns:sch="http://purl.oclc.org/dsdl/schematron">
+ <sch:title>iso schematron validation</sch:title>
+ <sch:ns uri="http://www.w3.org/2001/XMLSchema-instance" prefix="xsi"/>
+ <sch:ns uri="http://codespeak.net/lxml/objectify/pytype" prefix="py"/>
+
+ <!-- of course, these only really make sense when combined with a schema that
+ ensures datatype xs:dateTime -->
+
+ <sch:pattern abstract="true" id="abstract.dateTime.tz_utc">
+ <sch:rule context="$datetime">
+ <sch:let name="tz" value="concat(substring-after(substring-after(./text(), 'T'), '+'), substring-after(substring-after(./text(), 'T'), '-'))"/>
+ <sch:let name="lastchar" value="substring(./text(), string-length(./text()))"/>
+ <sch:assert test="$lastchar='Z' or $tz='00:00'">[ERROR] element (<sch:value-of select="name(.)"/>) dateTime value (<sch:value-of select="."/>) is not qualified as UTC (tz: <sch:value-of select="$tz"/>)</sch:assert>
+ </sch:rule>
+ </sch:pattern>
+
+ <sch:pattern abstract="true" id="abstract.dateTime.tz_utc_nillable">
+ <sch:rule context="$datetime">
+ <sch:let name="tz" value="concat(substring-after(substring-after(./text(), 'T'), '+'), substring-after(substring-after(./text(), 'T'), '-'))"/>
+ <sch:let name="lastchar" value="substring(./text(), string-length(./text()))"/>
+ <sch:assert test="@xsi:nil='true' or ($lastchar='Z' or $tz='00:00')">[ERROR] element (<sch:value-of select="name(.)"/>) dateTime value (<sch:value-of select="."/>) is not qualified as UTC (tz: <sch:value-of select="$tz"/>)</sch:assert>
+ </sch:rule>
+ </sch:pattern>
+
+ <sch:pattern is-a="abstract.dateTime.tz_utc" id="datetime" >
+ <sch:param name="datetime" value="datetime"/>
+ </sch:pattern>
+
+ <sch:pattern is-a="abstract.dateTime.tz_utc_nillable" id="nillableDatetime">
+ <sch:param name="datetime" value="nillableDatetime"/>
+ </sch:pattern>
+
+</sch:schema>
+''')
+ valid_trees = [
+ self.parse('''\
+<root xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+ <datetime>2009-12-10T15:21:00Z</datetime>
+ <nillableDatetime xsi:nil="true"/>
+</root>
+'''),
+ self.parse('''\
+<root xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+ <datetime>2009-12-10T15:21:00Z</datetime>
+ <nillableDatetime>2009-12-10T15:21:00Z</nillableDatetime>
+</root>
+'''),
+ self.parse('''\
+<root xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+ <datetime>2009-12-10T15:21:00+00:00</datetime>
+ <nillableDatetime>2009-12-10T15:21:00-00:00</nillableDatetime>
+</root>
+'''),
+ ]
+
+ schematron = isoschematron.Schematron(schema)
+ for tree_valid in valid_trees:
+ self.assertTrue(schematron(tree_valid), schematron.error_log)
+
+ tree_invalid = self.parse('''\
+<root xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+ <datetime>2009-12-10T16:21:00+01:00</datetime>
+ <nillableDatetime>2009-12-10T16:21:00+01:00</nillableDatetime>
+</root>
+''')
+ expected = 2
+ valid = schematron(tree_invalid)
+ self.assertTrue(not valid)
+ self.assertEqual(
+ len(schematron.error_log), expected,
+ 'expected %s errors: %s (%s errors)' %
+ (expected, schematron.error_log, len(schematron.error_log)))
+
+ tree_invalid = self.parse('''\
+<root xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+ <datetime xsi:nil="true"/>
+ <nillableDatetime>2009-12-10T16:21:00Z</nillableDatetime>
+</root>
+''')
+ expected = 1
+ valid = schematron(tree_invalid)
+ self.assertTrue(not valid)
+ self.assertEqual(
+ len(schematron.error_log), expected,
+ 'expected %s errors: %s (%s errors)' %
+ (expected, schematron.error_log, len(schematron.error_log)))
+
+ def test_schematron_phases(self):
+ schema = self.parse('''\
+<sch:schema xmlns:sch="http://purl.oclc.org/dsdl/schematron">
+ <sch:title>iso schematron validation</sch:title>
+ <sch:ns uri="http://www.w3.org/2001/XMLSchema-instance" prefix="xsi"/>
+ <sch:ns uri="http://codespeak.net/lxml/objectify/pytype" prefix="py"/>
+
+ <sch:phase id="mandatory">
+ <sch:active pattern="number_of_entries"/>
+ </sch:phase>
+
+ <sch:phase id="datetime_checks">
+ <sch:active pattern="datetime"/>
+ <sch:active pattern="nillableDatetime"/>
+ </sch:phase>
+
+ <sch:phase id="full">
+ <sch:active pattern="number_of_entries"/>
+ <sch:active pattern="datetime"/>
+ <sch:active pattern="nillableDatetime"/>
+ </sch:phase>
+
+ <!-- of course, these only really make sense when combined with a schema that
+ ensures datatype xs:dateTime -->
+
+ <sch:pattern abstract="true" id="abstract.dateTime.tz_utc">
+ <sch:rule context="$datetime">
+ <sch:let name="tz" value="concat(substring-after(substring-after(./text(), 'T'), '+'), substring-after(substring-after(./text(), 'T'), '-'))"/>
+ <sch:let name="lastchar" value="substring(./text(), string-length(./text()))"/>
+ <sch:assert test="$lastchar='Z' or $tz='00:00'">[ERROR] element (<sch:value-of select="name(.)"/>) dateTime value (<sch:value-of select="."/>) is not qualified as UTC (tz: <sch:value-of select="$tz"/>)</sch:assert>
+ </sch:rule>
+ </sch:pattern>
+
+ <sch:pattern abstract="true" id="abstract.dateTime.tz_utc_nillable">
+ <sch:rule context="$datetime">
+ <sch:let name="tz" value="concat(substring-after(substring-after(./text(), 'T'), '+'), substring-after(substring-after(./text(), 'T'), '-'))"/>
+ <sch:let name="lastchar" value="substring(./text(), string-length(./text()))"/>
+ <sch:assert test="@xsi:nil='true' or ($lastchar='Z' or $tz='00:00')">[ERROR] element (<sch:value-of select="name(.)"/>) dateTime value (<sch:value-of select="."/>) is not qualified as UTC (tz: <sch:value-of select="$tz"/>)</sch:assert>
+ </sch:rule>
+ </sch:pattern>
+
+ <sch:pattern id="number_of_entries">
+ <sch:title>mandatory number_of_entries test</sch:title>
+ <sch:rule context="number_of_entries">
+ <sch:assert test="text()=count(../entries/entry)">[ERROR] number_of_entries (<sch:value-of select="."/>) must equal the number of entries/entry elements (<sch:value-of select="count(../entries/entry)"/>)</sch:assert>
+ </sch:rule>
+ </sch:pattern>
+
+ <sch:pattern id="datetime" is-a="abstract.dateTime.tz_utc">
+ <sch:param name="datetime" value="datetime"/>
+ </sch:pattern>
+
+ <sch:pattern id="nillableDatetime" is-a="abstract.dateTime.tz_utc_nillable">
+ <sch:param name="datetime" value="nillableDatetime"/>
+ </sch:pattern>
+
+</sch:schema>
+''')
+ tree_valid = self.parse('''\
+<message xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+ <datetime>2009-12-10T15:21:00Z</datetime>
+ <nillableDatetime xsi:nil="true"/>
+ <number_of_entries>0</number_of_entries>
+ <entries>
+ </entries>
+</message>
+''')
+ tree_invalid = self.parse('''\
+<message>
+ <datetime>2009-12-10T16:21:00+01:00</datetime>
+ <nillableDatetime>2009-12-10T16:21:00+01:00</nillableDatetime>
+ <number_of_entries>3</number_of_entries>
+ <entries>
+ <entry>Entry 1</entry>
+ <entry>Entry 2</entry>
+ </entries>
+</message>
+''')
+ # check everything (default phase #ALL)
+ schematron = isoschematron.Schematron(schema)
+ self.assertTrue(schematron(tree_valid), schematron.error_log)
+ expected = 3
+ valid = schematron(tree_invalid)
+ self.assertTrue(not valid)
+ self.assertEqual(
+ len(schematron.error_log), expected,
+ 'expected %s errors: %s (%s errors)' %
+ (expected, schematron.error_log, len(schematron.error_log)))
+
+ # check phase mandatory
+ schematron = isoschematron.Schematron(
+ schema, compile_params={'phase': 'mandatory'})
+ self.assertTrue(schematron(tree_valid), schematron.error_log)
+ expected = 1
+ valid = schematron(tree_invalid)
+ self.assertTrue(not valid)
+ self.assertEqual(
+ len(schematron.error_log), expected,
+ 'expected %s errors: %s (%s errors)' %
+ (expected, schematron.error_log, len(schematron.error_log)))
+
+ # check phase datetime_checks
+ schematron = isoschematron.Schematron(
+ schema, compile_params={'phase': 'datetime_checks'})
+ self.assertTrue(schematron(tree_valid), schematron.error_log)
+ expected = 2
+ valid = schematron(tree_invalid)
+ self.assertTrue(not valid)
+ self.assertEqual(
+ len(schematron.error_log), expected,
+ 'expected %s errors: %s (%s errors)' %
+ (expected, schematron.error_log, len(schematron.error_log)))
+
+ # check phase full
+ schematron = isoschematron.Schematron(
+ schema, compile_params={'phase': 'full'})
+ self.assertTrue(schematron(tree_valid), schematron.error_log)
+ expected = 3
+ valid = schematron(tree_invalid)
+ self.assertTrue(not valid)
+ self.assertEqual(
+ len(schematron.error_log), expected,
+ 'expected %s errors: %s (%s errors)' %
+ (expected, schematron.error_log, len(schematron.error_log)))
+
+ def test_schematron_phases_kwarg(self):
+ schema = self.parse('''\
+<sch:schema xmlns:sch="http://purl.oclc.org/dsdl/schematron">
+ <sch:title>iso schematron validation</sch:title>
+ <sch:ns uri="http://www.w3.org/2001/XMLSchema-instance" prefix="xsi"/>
+ <sch:ns uri="http://codespeak.net/lxml/objectify/pytype" prefix="py"/>
+
+ <sch:phase id="mandatory">
+ <sch:active pattern="number_of_entries"/>
+ </sch:phase>
+
+ <sch:phase id="datetime_checks">
+ <sch:active pattern="datetime"/>
+ <sch:active pattern="nillableDatetime"/>
+ </sch:phase>
+
+ <sch:phase id="full">
+ <sch:active pattern="number_of_entries"/>
+ <sch:active pattern="datetime"/>
+ <sch:active pattern="nillableDatetime"/>
+ </sch:phase>
+
+ <!-- of course, these only really make sense when combined with a schema that
+ ensures datatype xs:dateTime -->
+
+ <sch:pattern abstract="true" id="abstract.dateTime.tz_utc">
+ <sch:rule context="$datetime">
+ <sch:let name="tz" value="concat(substring-after(substring-after(./text(), 'T'), '+'), substring-after(substring-after(./text(), 'T'), '-'))"/>
+ <sch:let name="lastchar" value="substring(./text(), string-length(./text()))"/>
+ <sch:assert test="$lastchar='Z' or $tz='00:00'">[ERROR] element (<sch:value-of select="name(.)"/>) dateTime value (<sch:value-of select="."/>) is not qualified as UTC (tz: <sch:value-of select="$tz"/>)</sch:assert>
+ </sch:rule>
+ </sch:pattern>
+
+ <sch:pattern abstract="true" id="abstract.dateTime.tz_utc_nillable">
+ <sch:rule context="$datetime">
+ <sch:let name="tz" value="concat(substring-after(substring-after(./text(), 'T'), '+'), substring-after(substring-after(./text(), 'T'), '-'))"/>
+ <sch:let name="lastchar" value="substring(./text(), string-length(./text()))"/>
+ <sch:assert test="@xsi:nil='true' or ($lastchar='Z' or $tz='00:00')">[ERROR] element (<sch:value-of select="name(.)"/>) dateTime value (<sch:value-of select="."/>) is not qualified as UTC (tz: <sch:value-of select="$tz"/>)</sch:assert>
+ </sch:rule>
+ </sch:pattern>
+
+ <sch:pattern id="number_of_entries">
+ <sch:title>mandatory number_of_entries test</sch:title>
+ <sch:rule context="number_of_entries">
+ <sch:assert test="text()=count(../entries/entry)">[ERROR] number_of_entries (<sch:value-of select="."/>) must equal the number of entries/entry elements (<sch:value-of select="count(../entries/entry)"/>)</sch:assert>
+ </sch:rule>
+ </sch:pattern>
+
+ <sch:pattern id="datetime" is-a="abstract.dateTime.tz_utc">
+ <sch:param name="datetime" value="datetime"/>
+ </sch:pattern>
+
+ <sch:pattern id="nillableDatetime" is-a="abstract.dateTime.tz_utc_nillable">
+ <sch:param name="datetime" value="nillableDatetime"/>
+ </sch:pattern>
+
+</sch:schema>
+''')
+ tree_valid = self.parse('''\
+<message xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+ <datetime>2009-12-10T15:21:00Z</datetime>
+ <nillableDatetime xsi:nil="true"/>
+ <number_of_entries>0</number_of_entries>
+ <entries>
+ </entries>
+</message>
+''')
+ tree_invalid = self.parse('''\
+<message>
+ <datetime>2009-12-10T16:21:00+01:00</datetime>
+ <nillableDatetime>2009-12-10T16:21:00+01:00</nillableDatetime>
+ <number_of_entries>3</number_of_entries>
+ <entries>
+ <entry>Entry 1</entry>
+ <entry>Entry 2</entry>
+ </entries>
+</message>
+''')
+ # check everything (default phase #ALL)
+ schematron = isoschematron.Schematron(schema)
+ self.assertTrue(schematron(tree_valid), schematron.error_log)
+ expected = 3
+ valid = schematron(tree_invalid)
+ self.assertTrue(not valid)
+ self.assertEqual(
+ len(schematron.error_log), expected,
+ 'expected %s errors: %s (%s errors)' %
+ (expected, schematron.error_log, len(schematron.error_log)))
+
+ # check phase mandatory
+ schematron = isoschematron.Schematron(schema, phase='mandatory')
+ self.assertTrue(schematron(tree_valid), schematron.error_log)
+ expected = 1
+ valid = schematron(tree_invalid)
+ self.assertTrue(not valid)
+ self.assertEqual(
+ len(schematron.error_log), expected,
+ 'expected %s errors: %s (%s errors)' %
+ (expected, schematron.error_log, len(schematron.error_log)))
+
+ # check phase datetime_checks
+ schematron = isoschematron.Schematron(schema, phase='datetime_checks')
+ self.assertTrue(schematron(tree_valid), schematron.error_log)
+ expected = 2
+ valid = schematron(tree_invalid)
+ self.assertTrue(not valid)
+ self.assertEqual(
+ len(schematron.error_log), expected,
+ 'expected %s errors: %s (%s errors)' %
+ (expected, schematron.error_log, len(schematron.error_log)))
+
+ # check phase full
+ schematron = isoschematron.Schematron(schema, phase='full')
+ self.assertTrue(schematron(tree_valid), schematron.error_log)
+ expected = 3
+ valid = schematron(tree_invalid)
+ self.assertTrue(not valid)
+ self.assertEqual(
+ len(schematron.error_log), expected, 'expected %s errors: %s (%s errors)' %
+ (expected, schematron.error_log, len(schematron.error_log)))
+
+ def test_schematron_xmlschema_embedded(self):
+ schema = self.parse('''\
+<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"
+ xmlns:sch="http://purl.oclc.org/dsdl/schematron">
+ <xs:element name="message">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element name="number_of_entries" type="xs:positiveInteger">
+ <xs:annotation>
+ <xs:appinfo>
+ <sch:pattern id="number_of_entries">
+ <sch:title>mandatory number_of_entries tests</sch:title>
+ <sch:rule context="number_of_entries">
+ <sch:assert test="text()=count(../entries/entry)">[ERROR] number_of_entries (<sch:value-of select="."/>) must equal the number of entries/entry elements (<sch:value-of select="count(../entries/entry)"/>)</sch:assert>
+ </sch:rule>
+ </sch:pattern>
+ </xs:appinfo>
+ </xs:annotation>
+ </xs:element>
+ <xs:element name="entries">
+ <xs:complexType>
+ <xs:sequence>
+ <xs:element name="entry" type="xs:string" minOccurs="0" maxOccurs="unbounded"/>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+ </xs:sequence>
+ </xs:complexType>
+ </xs:element>
+</xs:schema>
+''')
+ tree_valid = self.parse('''\
+<message>
+ <number_of_entries>2</number_of_entries>
+ <entries>
+ <entry>Entry 1</entry>
+ <entry>Entry 2</entry>
+ </entries>
+</message>
+''')
+ tree_invalid = self.parse('''\
+<message>
+ <number_of_entries>1</number_of_entries>
+ <entries>
+ <entry>Entry 1</entry>
+ <entry>Entry 2</entry>
+ </entries>
+</message>
+''')
+ xmlschema = etree.XMLSchema(schema)
+ schematron = isoschematron.Schematron(schema)
+ # fwiw, this must also be XMLSchema-valid
+ self.assertTrue(xmlschema(tree_valid), xmlschema.error_log)
+ self.assertTrue(schematron(tree_valid))
+ # still schema-valid
+ self.assertTrue(xmlschema(tree_invalid), xmlschema.error_log)
+ self.assertTrue(not schematron(tree_invalid))
+
+ def test_schematron_relaxng_embedded(self):
+ schema = self.parse('''\
+<grammar xmlns="http://relaxng.org/ns/structure/1.0"
+ xmlns:sch="http://purl.oclc.org/dsdl/schematron"
+ datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes">
+ <start>
+ <ref name="message"/>
+ </start>
+ <define name="message">
+ <element name="message">
+ <element name="number_of_entries">
+ <!-- RelaxNG can be mixed freely with stuff from other namespaces -->
+ <sch:pattern id="number_of_entries">
+ <sch:title>mandatory number_of_entries tests</sch:title>
+ <sch:rule context="number_of_entries">
+ <sch:assert test="text()=count(../entries/entry)">[ERROR] number_of_entries (<sch:value-of select="."/>) must equal the number of entries/entry elements (<sch:value-of select="count(../entries/entry)"/>)</sch:assert>
+ </sch:rule>
+ </sch:pattern>
+ <data type="positiveInteger"/>
+ </element>
+ <element name="entries">
+ <zeroOrMore>
+ <element name="entry"><data type="string"/></element>
+ </zeroOrMore>
+ </element>
+ </element>
+ </define>
+</grammar>
+''')
+ tree_valid = self.parse('''\
+<message>
+ <number_of_entries>2</number_of_entries>
+ <entries>
+ <entry>Entry 1</entry>
+ <entry>Entry 2</entry>
+ </entries>
+</message>
+''')
+ tree_invalid = self.parse('''\
+<message>
+ <number_of_entries>1</number_of_entries>
+ <entries>
+ <entry>Entry 1</entry>
+ <entry>Entry 2</entry>
+ </entries>
+</message>
+''')
+ relaxng = etree.RelaxNG(schema)
+ schematron = isoschematron.Schematron(schema)
+ # fwiw, this must also be RelaxNG-valid
+ self.assertTrue(relaxng(tree_valid), relaxng.error_log)
+ self.assertTrue(schematron(tree_valid))
+ # still schema-valid
+ self.assertTrue(relaxng(tree_invalid), relaxng.error_log)
+ self.assertTrue(not schematron(tree_invalid))
+
+ def test_schematron_invalid_args(self):
+ schema = self.parse('''\
+<sch:schema xmlns:sch="http://purl.oclc.org/dsdl/schematron">
+ <sch:pattern id="number_of_entries">
+ <sch:title>mandatory number_of_entries tests</sch:title>
+ <sch:rule context="number_of_entries">
+ <sch:assert test="text()=count(../entries/entry)">[ERROR] number_of_entries (<sch:value-of select="."/>) must equal the number of entries/entry elements (<sch:value-of select="count(../entries/entry)"/>)</sch:assert>
+ </sch:rule>
+ </sch:pattern>
+</sch:schema>
+''')
+ # handing phase as keyword arg will *not* raise the type error
+ self.assertRaises(TypeError, isoschematron.Schematron, schema,
+ compile_params={'phase': None})
+
+ def test_schematron_customization(self):
+ class MySchematron(isoschematron.Schematron):
+ def _extract(self, root):
+ schematron = (root.xpath(
+ '//sch:schema',
+ namespaces={'sch': "http://purl.oclc.org/dsdl/schematron"})
+ or [None])[0]
+ return schematron
+
+ def _include(self, schematron, **kwargs):
+ raise RuntimeError('inclusion unsupported')
+
+ def _expand(self, schematron, **kwargs):
+ raise RuntimeError('expansion unsupported')
+
+ def _validation_errors(self, validationReport):
+ valid = etree.XPath(
+ 'count(//svrl:successful-report[@flag="critical"])=1',
+ namespaces={'svrl': isoschematron.SVRL_NS})(
+ validationReport)
+ if valid:
+ return []
+ error = etree.Element('Error')
+ error.text = 'missing critical condition report'
+ return [error]
+
+ tree_valid = self.parse('<AAA><BBB/><CCC/></AAA>')
+ tree_invalid = self.parse('<AAA><BBB/><CCC/><DDD/></AAA>')
+ schema = self.parse('''\
+<schema xmlns="http://www.example.org/yet/another/schema/dialect">
+ <schema xmlns="http://purl.oclc.org/dsdl/schematron" >
+ <pattern id="OpenModel">
+ <title>Open Model</title>
+ <rule context="AAA">
+ <report test="BBB" flag="info">BBB element must be present</report>
+ <report test="CCC" flag="info">CCC element must be present</report>
+ </rule>
+ </pattern>
+ <pattern id="ClosedModel">
+ <title>Closed Model</title>
+ <rule context="AAA">
+ <report test="BBB" flag="info">BBB element must be present</report>
+ <report test="CCC" flag="info">CCC element must be present</report>
+ <report test="count(BBB|CCC) = count(*)" flag="critical">Only BBB and CCC children must be present</report>
+ </rule>
+ </pattern>
+ </schema>
+</schema>
+''')
+ # check that the overridden _include hook is run
+ self.assertRaises(RuntimeError, MySchematron, schema, store_report=True)
+ # check that the overridden _expand hook is run
+ self.assertRaises(RuntimeError, MySchematron, schema, store_report=True,
+ include=False)
+
+ schema = MySchematron(schema, store_report=True, include=False,
+ expand=False)
+ self.assertTrue(schema.validate(tree_valid))
+ self.assertTrue(not schema.validate(tree_invalid))
+
+ #TODO: test XSLT parameters for the inclusion, expansion and compile steps (?)
+
+ def test_schematron_fail_on_report(self):
+ tree_valid = self.parse('<AAA><BBB/><CCC/></AAA>')
+ tree_invalid = self.parse('<AAA><BBB/><CCC/><DDD/></AAA>')
+ schema = self.parse('''\
+<schema xmlns="http://purl.oclc.org/dsdl/schematron" >
+ <pattern id="OpenModel">
+ <title>Simple Report</title>
+ <rule context="AAA">
+ <report test="DDD"> DDD element must not be present</report>
+ </rule>
+ </pattern>
+</schema>
+''')
+ schema_report = isoschematron.Schematron(
+ schema, error_finder=isoschematron.Schematron.ASSERTS_AND_REPORTS)
+ schema_no_report = isoschematron.Schematron(schema)
+ self.assertTrue(schema_report.validate(tree_valid))
+ self.assertTrue(not schema_report.validate(tree_invalid))
+ self.assertTrue(schema_no_report.validate(tree_valid))
+ self.assertTrue(schema_no_report.validate(tree_invalid))
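+ # Sketch of the two modes (for reference; the assertions above already
+ # cover them): error_finder=Schematron.ASSERTS_AND_REPORTS makes fired
+ # <report> elements count as errors, and with store_report=True the SVRL
+ # result stays available on schematron.validation_report (assumption:
+ # attribute name as documented for lxml.isoschematron).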
+
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([unittest.makeSuite(ETreeISOSchematronTestCase)])
+ suite.addTests(doctest.DocTestSuite(isoschematron))
+ suite.addTests(
+ [make_doctest('../../../doc/validation.txt')])
+ return suite
+
+if __name__ == '__main__':
+ print('to test use test.py %s' % __file__)
diff --git a/src/lxml/tests/test_nsclasses.py b/src/lxml/tests/test_nsclasses.py
new file mode 100644
index 0000000..a0aa608
--- /dev/null
+++ b/src/lxml/tests/test_nsclasses.py
@@ -0,0 +1,212 @@
+# -*- coding: utf-8 -*-
+
+"""
+Test cases related to namespace implementation classes and the
+namespace registry mechanism
+"""
+
+from __future__ import absolute_import
+
+import unittest
+
+from .common_imports import etree, HelperTestCase, _bytes, make_doctest
+
+class ETreeNamespaceClassesTestCase(HelperTestCase):
+
+ class default_class(etree.ElementBase):
+ pass
+ class maeh_class(etree.ElementBase):
+ def maeh(self):
+ return 'maeh'
+ class bluff_class(etree.ElementBase):
+ def bluff(self):
+ return 'bluff'
+
+ def setUp(self):
+ super(ETreeNamespaceClassesTestCase, self).setUp()
+ lookup = etree.ElementNamespaceClassLookup()
+ self.Namespace = lookup.get_namespace
+ parser = etree.XMLParser()
+ parser.set_element_class_lookup(lookup)
+ etree.set_default_parser(parser)
+
+ def tearDown(self):
+ etree.set_default_parser()
+ del self.Namespace
+ super(ETreeNamespaceClassesTestCase, self).tearDown()
+
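+ # How the lookup resolves classes, as a comment-only sketch: the namespace
+ # registry maps (namespace URI, tag name) to ElementBase subclasses, e.g.
+ #   ns = lookup.get_namespace('http://example.org/ns')  # hypothetical URI
+ #   ns['maeh'] = maeh_class       # tag-specific class
+ #   ns[None] = default_class      # fallback for other tags in that namespace
+ # so parsing '<maeh xmlns="http://example.org/ns"/>' yields a maeh_class
+ # instance that offers the extra maeh() method.
+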
+ def test_registry(self):
+ ns = self.Namespace('ns01')
+ ns['maeh'] = self.maeh_class
+
+ self.Namespace('ns01').clear()
+
+ self.Namespace('ns02').update({'maeh' : self.maeh_class})
+ self.Namespace('ns03').update({'bluff' : self.bluff_class}.items())
+ self.Namespace('ns02').clear()
+ self.Namespace('ns03').clear()
+
+ def test_ns_classes(self):
+ bluff_dict = {'bluff' : self.bluff_class}
+ maeh_dict = {'maeh' : self.maeh_class}
+
+ self.Namespace('ns10').update(bluff_dict)
+
+ tree = self.parse(_bytes('<bluff xmlns="ns10"><ns11:maeh xmlns:ns11="ns11"/></bluff>'))
+
+ el = tree.getroot()
+ self.assertTrue(isinstance(el, etree.ElementBase))
+ self.assertTrue(hasattr(el, 'bluff'))
+ self.assertFalse(hasattr(el[0], 'maeh'))
+ self.assertFalse(hasattr(el[0], 'bluff'))
+ self.assertEqual(el.bluff(), 'bluff')
+ del el
+
+ self.Namespace('ns11').update(maeh_dict)
+ el = tree.getroot()
+ self.assertTrue(hasattr(el, 'bluff'))
+ self.assertTrue(hasattr(el[0], 'maeh'))
+ self.assertEqual(el.bluff(), 'bluff')
+ self.assertEqual(el[0].maeh(), 'maeh')
+ del el
+
+ self.Namespace('ns10').clear()
+
+ tree = self.parse(_bytes('<bluff xmlns="ns10"><ns11:maeh xmlns:ns11="ns11"/></bluff>'))
+ el = tree.getroot()
+ self.assertFalse(hasattr(el, 'bluff'))
+ self.assertFalse(hasattr(el, 'maeh'))
+ self.assertFalse(hasattr(el[0], 'bluff'))
+ self.assertTrue(hasattr(el[0], 'maeh'))
+
+ self.Namespace('ns11').clear()
+
+ def test_default_tagname(self):
+ bluff_dict = {
+ None : self.bluff_class,
+ 'maeh' : self.maeh_class
+ }
+
+ ns = self.Namespace("uri:nsDefClass")
+ ns.update(bluff_dict)
+
+ tree = self.parse(_bytes('''
+ <test xmlns="bla" xmlns:ns1="uri:nsDefClass" xmlns:ns2="uri:nsDefClass">
+ <ns2:el1/><ns1:el2/><ns1:maeh/><ns2:maeh/><maeh/>
+ </test>
+ '''))
+
+ el = tree.getroot()
+ self.assertFalse(isinstance(el, etree.ElementBase))
+ for child in el[:-1]:
+ self.assertTrue(isinstance(child, etree.ElementBase), child.tag)
+ self.assertFalse(isinstance(el[-1], etree.ElementBase))
+
+ self.assertTrue(hasattr(el[0], 'bluff'))
+ self.assertTrue(hasattr(el[1], 'bluff'))
+ self.assertTrue(hasattr(el[2], 'maeh'))
+ self.assertTrue(hasattr(el[3], 'maeh'))
+ self.assertFalse(hasattr(el[4], 'maeh'))
+ del el
+
+ ns.clear()
+
+ def test_create_element(self):
+ bluff_dict = {'bluff' : self.bluff_class}
+ self.Namespace('ns20').update(bluff_dict)
+
+ maeh_dict = {'maeh' : self.maeh_class}
+ self.Namespace('ns21').update(maeh_dict)
+
+ el = etree.Element("{ns20}bluff")
+ self.assertTrue(hasattr(el, 'bluff'))
+
+ child = etree.SubElement(el, "{ns21}maeh")
+ self.assertTrue(hasattr(child, 'maeh'))
+ child = etree.SubElement(el, "{ns20}bluff")
+ self.assertTrue(hasattr(child, 'bluff'))
+ child = etree.SubElement(el, "{ns21}bluff")
+ self.assertFalse(hasattr(child, 'bluff'))
+ self.assertFalse(hasattr(child, 'maeh'))
+
+ self.assertTrue(hasattr(el[0], 'maeh'))
+ self.assertTrue(hasattr(el[1], 'bluff'))
+ self.assertFalse(hasattr(el[2], 'bluff'))
+ self.assertFalse(hasattr(el[2], 'maeh'))
+
+ self.assertEqual(el.bluff(), 'bluff')
+ self.assertEqual(el[0].maeh(), 'maeh')
+ self.assertEqual(el[1].bluff(), 'bluff')
+
+ self.Namespace('ns20').clear()
+ self.Namespace('ns21').clear()
+
+ def test_create_element_default(self):
+ bluff_dict = {None : self.bluff_class}
+ self.Namespace('ns30').update(bluff_dict)
+
+ maeh_dict = {'maeh' : self.maeh_class}
+ self.Namespace(None).update(maeh_dict)
+
+ el = etree.Element("{ns30}bluff")
+ etree.SubElement(el, "maeh")
+ self.assertTrue(hasattr(el, 'bluff'))
+ self.assertTrue(hasattr(el[0], 'maeh'))
+ self.assertEqual(el.bluff(), 'bluff')
+ self.assertEqual(el[0].maeh(), 'maeh')
+
+ self.Namespace(None).clear()
+ self.Namespace('ns30').clear()
+
+ def test_element_creation(self):
+ default, bluff, maeh = (
+ self.default_class, self.bluff_class, self.maeh_class)
+
+ class honk(etree.ElementBase):
+ TAG = 'HONK'
+ NAMESPACE = 'http://a.b/c'
+
+ el = default(
+ "test",
+ "text",
+ bluff(honk, "TaIL", maeh),
+ maeh("TeXT", bluff, honk(), "TAiL"),
+ "Tail")
+
+ self.assertEqual('default_class', el.tag)
+ self.assertEqual('testtext', el.text)
+ self.assertEqual(None, el.tail)
+ self.assertEqual(2, len(el))
+ self.assertEqual(7, len(list(el.iter())))
+
+ self.assertEqual('bluff_class', el[0].tag)
+ self.assertEqual('TaIL', el[0][0].tail)
+ self.assertEqual('TaIL', ''.join(el[0].itertext()))
+ self.assertEqual('{http://a.b/c}HONK',
+ el[0][0].tag)
+ self.assertEqual('maeh_class',
+ el[0][1].tag)
+
+ self.assertEqual('maeh_class', el[1].tag)
+ self.assertEqual('TeXT', el[1].text)
+ self.assertEqual('bluff_class', el[1][0].tag)
+ self.assertEqual('{http://a.b/c}HONK', el[1][1].tag)
+ self.assertEqual('TAiL', el[1][1].tail)
+
+ self.assertEqual('TeXTTAiL',
+ ''.join(el[1].itertext()))
+ self.assertEqual('Tail', el[1].tail)
+ self.assertEqual('TAiL', el[1][1].tail)
+ self.assertEqual('bluff_class', el[1][0].tag)
+ self.assertEqual('{http://a.b/c}HONK', el[1][1].tag)
+
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([unittest.makeSuite(ETreeNamespaceClassesTestCase)])
+ suite.addTests(
+ [make_doctest('../../../doc/element_classes.txt')])
+ return suite
+
+if __name__ == '__main__':
+ print('to test use test.py %s' % __file__)
diff --git a/src/lxml/tests/test_objectify.py b/src/lxml/tests/test_objectify.py
new file mode 100644
index 0000000..a12ae7e
--- /dev/null
+++ b/src/lxml/tests/test_objectify.py
@@ -0,0 +1,2681 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests specific to the lxml.objectify API
+"""
+
+from __future__ import absolute_import
+
+import unittest, operator
+
+from .common_imports import (
+ etree, HelperTestCase, fileInTestDir, doctest, make_doctest, _bytes, _str, BytesIO
+)
+
+from lxml import objectify
+
+PYTYPE_NAMESPACE = "http://codespeak.net/lxml/objectify/pytype"
+XML_SCHEMA_NS = "http://www.w3.org/2001/XMLSchema"
+XML_SCHEMA_INSTANCE_NS = "http://www.w3.org/2001/XMLSchema-instance"
+XML_SCHEMA_INSTANCE_TYPE_ATTR = "{%s}type" % XML_SCHEMA_INSTANCE_NS
+XML_SCHEMA_NIL_ATTR = "{%s}nil" % XML_SCHEMA_INSTANCE_NS
+TREE_PYTYPE = "TREE"
+DEFAULT_NSMAP = { "py" : PYTYPE_NAMESPACE,
+ "xsi" : XML_SCHEMA_INSTANCE_NS,
+ "xsd" : XML_SCHEMA_NS}
+
+objectclass2xsitype = {
+ # objectify built-in
+ objectify.IntElement: ("int", "short", "byte", "unsignedShort",
+ "unsignedByte", "integer", "nonPositiveInteger",
+ "negativeInteger", "long", "nonNegativeInteger",
+ "unsignedLong", "unsignedInt", "positiveInteger",),
+ objectify.FloatElement: ("float", "double"),
+ objectify.BoolElement: ("boolean",),
+ objectify.StringElement: ("string", "normalizedString", "token", "language",
+ "Name", "NCName", "ID", "IDREF", "ENTITY",
+ "NMTOKEN", ),
+ # None: xsi:nil="true"
+ }
+
+xsitype2objclass = dict([ (v, k) for k in objectclass2xsitype
+ for v in objectclass2xsitype[k] ])
+
+objectclass2pytype = {
+ # objectify built-in
+ objectify.IntElement: "int",
+ objectify.FloatElement: "float",
+ objectify.BoolElement: "bool",
+ objectify.StringElement: "str",
+ # None: xsi:nil="true"
+ }
+
+pytype2objclass = dict([ (objectclass2pytype[k], k)
+ for k in objectclass2pytype])
+
+xml_str = '''\
+<obj:root xmlns:obj="objectified" xmlns:other="otherNS">
+ <obj:c1 a1="A1" a2="A2" other:a3="A3">
+ <obj:c2>0</obj:c2>
+ <obj:c2>1</obj:c2>
+ <obj:c2>2</obj:c2>
+ <other:c2>3</other:c2>
+ <c2>4</c2>
+ </obj:c1>
+</obj:root>'''
+
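+# For orientation, a comment-only sketch of what the fixture document yields
+# under objectify (assumption: plain objectify.fromstring, no custom parser):
+#   root = objectify.fromstring(xml_str)
+#   root.c1.c2.text      # "0" -- first {objectified}c2 child
+#   root.c1.c2[1].text   # "1"
+#   len(root.c1.c2)      # 3  -- only same-namespace siblings are counted
+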
+class ObjectifyTestCase(HelperTestCase):
+ """Test cases for lxml.objectify
+ """
+ etree = etree
+
+ def XML(self, xml):
+ return self.etree.XML(xml, self.parser)
+
+ def setUp(self):
+ super(ObjectifyTestCase, self).setUp()
+ self.parser = self.etree.XMLParser(remove_blank_text=True)
+ self.lookup = etree.ElementNamespaceClassLookup(
+ objectify.ObjectifyElementClassLookup() )
+ self.parser.set_element_class_lookup(self.lookup)
+
+ self.Element = self.parser.makeelement
+
+ ns = self.lookup.get_namespace("otherNS")
+ ns[None] = self.etree.ElementBase
+
+ self._orig_types = objectify.getRegisteredTypes()
+
+ def tearDown(self):
+ self.lookup.get_namespace("otherNS").clear()
+ objectify.set_pytype_attribute_tag()
+ del self.lookup
+ del self.parser
+
+ for pytype in objectify.getRegisteredTypes():
+ pytype.unregister()
+ for pytype in self._orig_types:
+ pytype.register()
+ del self._orig_types
+
+ super(ObjectifyTestCase, self).tearDown()
+
+
+ def test_element_nsmap_default(self):
+ elt = objectify.Element("test")
+ self.assertEqual(elt.nsmap, DEFAULT_NSMAP)
+
+ def test_element_nsmap_empty(self):
+ nsmap = {}
+ elt = objectify.Element("test", nsmap=nsmap)
+ self.assertEqual(list(elt.nsmap.values()), [PYTYPE_NAMESPACE])
+
+ def test_element_nsmap_custom_prefixes(self):
+ nsmap = {"mypy": PYTYPE_NAMESPACE,
+ "myxsi": XML_SCHEMA_INSTANCE_NS,
+ "myxsd": XML_SCHEMA_NS}
+ elt = objectify.Element("test", nsmap=nsmap)
+ self.assertEqual(elt.nsmap, nsmap)
+
+ def test_element_nsmap_custom(self):
+ nsmap = {"my": "someNS",
+ "myother": "someOtherNS",
+ "myxsd": XML_SCHEMA_NS}
+ elt = objectify.Element("test", nsmap=nsmap)
+ self.assertTrue(PYTYPE_NAMESPACE in elt.nsmap.values())
+ for prefix, ns in nsmap.items():
+ self.assertTrue(prefix in elt.nsmap)
+ self.assertEqual(nsmap[prefix], elt.nsmap[prefix])
+
+ def test_sub_element_nsmap_default(self):
+ root = objectify.Element("root")
+ root.sub = objectify.Element("test")
+ self.assertEqual(root.sub.nsmap, DEFAULT_NSMAP)
+
+ def test_sub_element_nsmap_empty(self):
+ root = objectify.Element("root")
+ nsmap = {}
+ root.sub = objectify.Element("test", nsmap=nsmap)
+ self.assertEqual(root.sub.nsmap, DEFAULT_NSMAP)
+
+ def test_sub_element_nsmap_custom_prefixes(self):
+ root = objectify.Element("root")
+ nsmap = {"mypy": PYTYPE_NAMESPACE,
+ "myxsi": XML_SCHEMA_INSTANCE_NS,
+ "myxsd": XML_SCHEMA_NS}
+ root.sub = objectify.Element("test", nsmap=nsmap)
+ self.assertEqual(root.sub.nsmap, DEFAULT_NSMAP)
+
+ def test_sub_element_nsmap_custom(self):
+ root = objectify.Element("root")
+ nsmap = {"my": "someNS",
+ "myother": "someOtherNS",
+ "myxsd": XML_SCHEMA_NS,}
+ root.sub = objectify.Element("test", nsmap=nsmap)
+ expected = nsmap.copy()
+ del expected["myxsd"]
+ expected.update(DEFAULT_NSMAP)
+ self.assertEqual(root.sub.nsmap, expected)
+
+ def test_data_element_nsmap_default(self):
+ value = objectify.DataElement("test this")
+ self.assertEqual(value.nsmap, DEFAULT_NSMAP)
+
+ def test_data_element_nsmap_empty(self):
+ nsmap = {}
+ value = objectify.DataElement("test this", nsmap=nsmap)
+ self.assertEqual(list(value.nsmap.values()), [PYTYPE_NAMESPACE])
+
+ def test_data_element_nsmap_custom_prefixes(self):
+ nsmap = {"mypy": PYTYPE_NAMESPACE,
+ "myxsi": XML_SCHEMA_INSTANCE_NS,
+ "myxsd": XML_SCHEMA_NS}
+ value = objectify.DataElement("test this", nsmap=nsmap)
+ self.assertEqual(value.nsmap, nsmap)
+
+ def test_data_element_nsmap_custom(self):
+ nsmap = {"my": "someNS",
+ "myother": "someOtherNS",
+ "myxsd": XML_SCHEMA_NS,}
+ value = objectify.DataElement("test", nsmap=nsmap)
+ self.assertTrue(PYTYPE_NAMESPACE in value.nsmap.values())
+ for prefix, ns in nsmap.items():
+ self.assertTrue(prefix in value.nsmap)
+ self.assertEqual(nsmap[prefix], value.nsmap[prefix])
+
+ def test_sub_data_element_nsmap_default(self):
+ root = objectify.Element("root")
+ root.value = objectify.DataElement("test this")
+ self.assertEqual(root.value.nsmap, DEFAULT_NSMAP)
+
+ def test_sub_data_element_nsmap_empty(self):
+ root = objectify.Element("root")
+ nsmap = {}
+ root.value = objectify.DataElement("test this", nsmap=nsmap)
+ self.assertEqual(root.value.nsmap, DEFAULT_NSMAP)
+
+ def test_sub_data_element_nsmap_custom_prefixes(self):
+ root = objectify.Element("root")
+ nsmap = {"mypy": PYTYPE_NAMESPACE,
+ "myxsi": XML_SCHEMA_INSTANCE_NS,
+ "myxsd": XML_SCHEMA_NS}
+ root.value = objectify.DataElement("test this", nsmap=nsmap)
+ self.assertEqual(root.value.nsmap, DEFAULT_NSMAP)
+
+ def test_sub_data_element_nsmap_custom(self):
+ root = objectify.Element("root")
+ nsmap = {"my": "someNS",
+ "myother": "someOtherNS",
+ "myxsd": XML_SCHEMA_NS}
+ root.value = objectify.DataElement("test", nsmap=nsmap)
+ expected = nsmap.copy()
+ del expected["myxsd"]
+ expected.update(DEFAULT_NSMAP)
+ self.assertEqual(root.value.nsmap, expected)
+
+ def test_date_element_efactory_text(self):
+ # ObjectifiedDataElement can also be used as E-Factory
+ value = objectify.ObjectifiedDataElement('test', 'toast')
+ self.assertEqual(value.text, 'testtoast')
+
+ def test_date_element_efactory_tail(self):
+ # ObjectifiedDataElement can also be used as E-Factory
+ value = objectify.ObjectifiedElement(objectify.ObjectifiedDataElement(), 'test', 'toast')
+ self.assertEqual(value.ObjectifiedDataElement.tail, 'testtoast')
+
+ def test_data_element_attrib_attributes_precedence(self):
+ # keyword arguments override attrib entries
+ value = objectify.DataElement(23, _pytype="str", _xsi="foobar",
+ attrib={"gnu": "muh", "cat": "meeow",
+ "dog": "wuff"},
+ bird="tchilp", dog="grrr")
+ self.assertEqual(value.get("gnu"), "muh")
+ self.assertEqual(value.get("cat"), "meeow")
+ self.assertEqual(value.get("dog"), "grrr")
+ self.assertEqual(value.get("bird"), "tchilp")
+
+ def test_data_element_data_element_arg(self):
+ # Check that DataElement preserves all attributes of ObjectifiedDataElement
+ # arguments
+ arg = objectify.DataElement(23, _pytype="str", _xsi="foobar",
+ attrib={"gnu": "muh", "cat": "meeow",
+ "dog": "wuff"},
+ bird="tchilp", dog="grrr")
+ value = objectify.DataElement(arg)
+ self.assertTrue(isinstance(value, objectify.StringElement))
+ for attr in arg.attrib:
+ self.assertEqual(value.get(attr), arg.get(attr))
+
+ def test_data_element_data_element_arg_pytype_none(self):
+ # Check that _pytype arg overrides original py:pytype of
+ # ObjectifiedDataElement
+ arg = objectify.DataElement(23, _pytype="str", _xsi="foobar",
+ attrib={"gnu": "muh", "cat": "meeow",
+ "dog": "wuff"},
+ bird="tchilp", dog="grrr")
+ value = objectify.DataElement(arg, _pytype="NoneType")
+ self.assertTrue(isinstance(value, objectify.NoneElement))
+ self.assertEqual(value.get(XML_SCHEMA_NIL_ATTR), "true")
+ self.assertEqual(value.text, None)
+ self.assertEqual(value.pyval, None)
+ for attr in arg.attrib:
+ #if not attr == objectify.PYTYPE_ATTRIBUTE:
+ self.assertEqual(value.get(attr), arg.get(attr))
+
+ def test_data_element_data_element_arg_pytype(self):
+ # Check that _pytype arg overrides original py:pytype of
+ # ObjectifiedDataElement
+ arg = objectify.DataElement(23, _pytype="str", _xsi="foobar",
+ attrib={"gnu": "muh", "cat": "meeow",
+ "dog": "wuff"},
+ bird="tchilp", dog="grrr")
+ value = objectify.DataElement(arg, _pytype="int")
+ self.assertTrue(isinstance(value, objectify.IntElement))
+ self.assertEqual(value.get(objectify.PYTYPE_ATTRIBUTE), "int")
+ for attr in arg.attrib:
+ if not attr == objectify.PYTYPE_ATTRIBUTE:
+ self.assertEqual(value.get(attr), arg.get(attr))
+
+ def test_data_element_data_element_arg_xsitype(self):
+ # Check that _xsi arg overrides original xsi:type of given
+ # ObjectifiedDataElement
+ arg = objectify.DataElement(23, _pytype="str", _xsi="foobar",
+ attrib={"gnu": "muh", "cat": "meeow",
+ "dog": "wuff"},
+ bird="tchilp", dog="grrr")
+ value = objectify.DataElement(arg, _xsi="xsd:int")
+ self.assertTrue(isinstance(value, objectify.IntElement))
+ self.assertEqual(value.get(XML_SCHEMA_INSTANCE_TYPE_ATTR), "xsd:int")
+ self.assertEqual(value.get(objectify.PYTYPE_ATTRIBUTE), "int")
+ for attr in arg.attrib:
+ if not attr in [objectify.PYTYPE_ATTRIBUTE,
+ XML_SCHEMA_INSTANCE_TYPE_ATTR]:
+ self.assertEqual(value.get(attr), arg.get(attr))
+
+ def test_data_element_data_element_arg_pytype_xsitype(self):
+ # Check that _pytype and _xsi args override original py:pytype and
+ # xsi:type attributes of given ObjectifiedDataElement
+ arg = objectify.DataElement(23, _pytype="str", _xsi="foobar",
+ attrib={"gnu": "muh", "cat": "meeow",
+ "dog": "wuff"},
+ bird="tchilp", dog="grrr")
+ value = objectify.DataElement(arg, _pytype="int", _xsi="xsd:int")
+ self.assertTrue(isinstance(value, objectify.IntElement))
+ self.assertEqual(value.get(objectify.PYTYPE_ATTRIBUTE), "int")
+ self.assertEqual(value.get(XML_SCHEMA_INSTANCE_TYPE_ATTR), "xsd:int")
+ for attr in arg.attrib:
+ if not attr in [objectify.PYTYPE_ATTRIBUTE,
+ XML_SCHEMA_INSTANCE_TYPE_ATTR]:
+ self.assertEqual(value.get(attr), arg.get(attr))
+
+ def test_data_element_invalid_pytype(self):
+ self.assertRaises(ValueError, objectify.DataElement, 3.1415,
+ _pytype="int")
+
+ def test_data_element_invalid_xsi(self):
+ self.assertRaises(ValueError, objectify.DataElement, 3.1415,
+ _xsi="xsd:int")
+
+ def test_data_element_data_element_arg_invalid_pytype(self):
+ arg = objectify.DataElement(3.1415)
+ self.assertRaises(ValueError, objectify.DataElement, arg,
+ _pytype="int")
+
+ def test_data_element_data_element_arg_invalid_xsi(self):
+ arg = objectify.DataElement(3.1415)
+ self.assertRaises(ValueError, objectify.DataElement, arg,
+ _xsi="xsd:int")
+
+ def test_data_element_element_arg(self):
+ arg = objectify.Element('arg')
+ value = objectify.DataElement(arg)
+ self.assertTrue(isinstance(value, objectify.ObjectifiedElement))
+ for attr in arg.attrib:
+ self.assertEqual(value.get(attr), arg.get(attr))
+
+ def test_root(self):
+ root = self.Element("test")
+ self.assertTrue(isinstance(root, objectify.ObjectifiedElement))
+
+ def test_str(self):
+ root = self.Element("test")
+ self.assertEqual('', str(root))
+
+ def test_child(self):
+ root = self.XML(xml_str)
+ self.assertEqual("0", root.c1.c2.text)
+
+ def test_child_ns_nons(self):
+ root = self.XML("""
+ <root>
+ <foo:x xmlns:foo="/foo/bar">1</foo:x>
+ <x>2</x>
+ </root>
+ """)
+ self.assertEqual(2, root.x)
+
+ def test_countchildren(self):
+ root = self.XML(xml_str)
+ self.assertEqual(1, root.countchildren())
+ self.assertEqual(5, root.c1.countchildren())
+
+ def test_child_getattr(self):
+ root = self.XML(xml_str)
+ self.assertEqual("0", getattr(root.c1, "{objectified}c2").text)
+ self.assertEqual("3", getattr(root.c1, "{otherNS}c2").text)
+
+ def test_child_nonexistant(self):
+ root = self.XML(xml_str)
+ self.assertRaises(AttributeError, getattr, root.c1, "NOT_THERE")
+ self.assertRaises(AttributeError, getattr, root.c1, "{unknownNS}c2")
+
+ def test_child_getattr_empty_ns(self):
+ root = self.XML(xml_str)
+ self.assertEqual("4", getattr(root.c1, "{}c2").text)
+ self.assertEqual("0", getattr(root.c1, "c2").text)
+
+ def test_setattr(self):
+ for val in [
+ 2, 2**32, 1.2, "Won't get fooled again",
+ _str("W\xf6n't get f\xf6\xf6led \xe4g\xe4in", 'ISO-8859-1'), True,
+ False, None]:
+ root = self.Element('root')
+ attrname = 'val'
+ setattr(root, attrname, val)
+ result = getattr(root, attrname)
+ self.assertEqual(val, result)
+ self.assertEqual(type(val), type(result.pyval))
+
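+ # Sketch (assumption, mirrors test_build_tree below): plain attribute
+ # assignment wraps the value in a typed data element, e.g.
+ #   root.val = 5   ->  objectify.IntElement with root.val.pyval == 5
+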
+ def test_setattr_nonunicode(self):
+ root = self.Element('root')
+ attrname = 'val'
+ val = _bytes("W\xf6n't get f\xf6\xf6led \xe4g\xe4in", 'ISO-8859-1')
+ self.assertRaises(ValueError, setattr, root, attrname, val)
+ self.assertRaises(AttributeError, getattr, root, attrname)
+
+ def test_addattr(self):
+ root = self.XML(xml_str)
+ self.assertEqual(1, len(root.c1))
+ root.addattr("c1", "test")
+ self.assertEqual(2, len(root.c1))
+ self.assertEqual("test", root.c1[1].text)
+
+ def test_addattr_element(self):
+ root = self.XML(xml_str)
+ self.assertEqual(1, len(root.c1))
+
+ new_el = self.Element("test", myattr="5")
+ root.addattr("c1", new_el)
+ self.assertEqual(2, len(root.c1))
+ self.assertEqual(None, root.c1[0].get("myattr"))
+ self.assertEqual("5", root.c1[1].get("myattr"))
+
+ def test_addattr_list(self):
+ root = self.XML(xml_str)
+ self.assertEqual(1, len(root.c1))
+
+ new_el = self.Element("test")
+ self.etree.SubElement(new_el, "a", myattr="A")
+ self.etree.SubElement(new_el, "a", myattr="B")
+
+ root.addattr("c1", list(new_el.a))
+ self.assertEqual(3, len(root.c1))
+ self.assertEqual(None, root.c1[0].get("myattr"))
+ self.assertEqual("A", root.c1[1].get("myattr"))
+ self.assertEqual("B", root.c1[2].get("myattr"))
+
+ def test_child_addattr(self):
+ root = self.XML(xml_str)
+ self.assertEqual(3, len(root.c1.c2))
+ root.c1.addattr("c2", 3)
+ self.assertEqual(4, len(root.c1.c2))
+ self.assertEqual("3", root.c1.c2[3].text)
+
+ def test_child_index(self):
+ root = self.XML(xml_str)
+ self.assertEqual("0", root.c1.c2[0].text)
+ self.assertEqual("1", root.c1.c2[1].text)
+ self.assertEqual("2", root.c1.c2[2].text)
+ self.assertRaises(IndexError, operator.getitem, root.c1.c2, 3)
+ self.assertEqual(root, root[0])
+ self.assertRaises(IndexError, operator.getitem, root, 1)
+
+ c1 = root.c1
+ del root.c1 # unlink from parent
+ self.assertEqual(c1, c1[0])
+ self.assertRaises(IndexError, operator.getitem, c1, 1)
+
+ def test_child_index_neg(self):
+ root = self.XML(xml_str)
+ self.assertEqual("0", root.c1.c2[0].text)
+ self.assertEqual("0", root.c1.c2[-3].text)
+ self.assertEqual("1", root.c1.c2[-2].text)
+ self.assertEqual("2", root.c1.c2[-1].text)
+ self.assertRaises(IndexError, operator.getitem, root.c1.c2, -4)
+ self.assertEqual(root, root[-1])
+ self.assertRaises(IndexError, operator.getitem, root, -2)
+
+ c1 = root.c1
+ del root.c1 # unlink from parent
+ self.assertEqual(c1, c1[-1])
+ self.assertRaises(IndexError, operator.getitem, c1, -2)
+
+ def test_child_len(self):
+ root = self.XML(xml_str)
+ self.assertEqual(1, len(root))
+ self.assertEqual(1, len(root.c1))
+ self.assertEqual(3, len(root.c1.c2))
+
+ def test_child_iter(self):
+ root = self.XML(xml_str)
+ self.assertEqual([root],
+ list(iter(root)))
+ self.assertEqual([root.c1],
+ list(iter(root.c1)))
+ self.assertEqual([root.c1.c2[0], root.c1.c2[1], root.c1.c2[2]],
+ list(iter(root.c1.c2)))
+
+ def test_class_lookup(self):
+ root = self.XML(xml_str)
+ self.assertTrue(isinstance(root.c1.c2, objectify.ObjectifiedElement))
+ self.assertFalse(isinstance(getattr(root.c1, "{otherNS}c2"),
+ objectify.ObjectifiedElement))
+
+ def test_dir(self):
+ root = self.XML(xml_str)
+ dir_c1 = dir(objectify.ObjectifiedElement) + ['c1']
+ dir_c1.sort()
+ dir_c2 = dir(objectify.ObjectifiedElement) + ['c2']
+ dir_c2.sort()
+
+ self.assertEqual(dir_c1, dir(root))
+ self.assertEqual(dir_c2, dir(root.c1))
+
+ def test_vars(self):
+ root = self.XML(xml_str)
+ self.assertEqual({'c1' : root.c1}, vars(root))
+ self.assertEqual({'c2' : root.c1.c2}, vars(root.c1))
+
+ def test_child_set_ro(self):
+ root = self.XML(xml_str)
+ self.assertRaises(TypeError, setattr, root.c1.c2, 'text', "test")
+ self.assertRaises(TypeError, setattr, root.c1.c2, 'pyval', "test")
+
+ # slicing
+
+ def test_getslice_complete(self):
+ root = self.XML("<root><c>c1</c><c>c2</c></root>")
+ self.assertEqual(["c1", "c2"],
+ [ c.text for c in root.c[:] ])
+
+ def test_getslice_partial(self):
+ root = self.XML("<root><c>c1</c><c>c2</c><c>c3</c><c>c4</c></root>")
+ test_list = ["c1", "c2", "c3", "c4"]
+
+ self.assertEqual(test_list,
+ [ c.text for c in root.c[:] ])
+ self.assertEqual(test_list[1:2],
+ [ c.text for c in root.c[1:2] ])
+ self.assertEqual(test_list[-3:-1],
+ [ c.text for c in root.c[-3:-1] ])
+ self.assertEqual(test_list[-3:3],
+ [ c.text for c in root.c[-3:3] ])
+ self.assertEqual(test_list[-3000:3],
+ [ c.text for c in root.c[-3000:3] ])
+ self.assertEqual(test_list[-3:3000],
+ [ c.text for c in root.c[-3:3000] ])
+
+ def test_getslice_partial_neg(self):
+ root = self.XML("<root><c>c1</c><c>c2</c><c>c3</c><c>c4</c></root>")
+ test_list = ["c1", "c2", "c3", "c4"]
+
+ self.assertEqual(test_list,
+ [ c.text for c in root.c[:] ])
+ self.assertEqual(test_list[2:1:-1],
+ [ c.text for c in root.c[2:1:-1] ])
+ self.assertEqual(test_list[-1:-3:-1],
+ [ c.text for c in root.c[-1:-3:-1] ])
+ self.assertEqual(test_list[2:-3:-1],
+ [ c.text for c in root.c[2:-3:-1] ])
+ self.assertEqual(test_list[2:-3000:-1],
+ [ c.text for c in root.c[2:-3000:-1] ])
+
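+ # The tests above lean on sibling access behaving like a Python list;
+ # a rough equivalence sketch (assumption, not asserted here):
+ #   root = objectify.fromstring('<root><c>c1</c><c>c2</c></root>')
+ #   [c.text for c in root.c[::-1]]   # ['c2', 'c1'], like reversing a list
+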
+ # slice assignment
+
+ def test_setslice_complete(self):
+ Element = self.Element
+ root = Element("root")
+ root.c = ["c1", "c2"]
+
+ c1 = root.c[0]
+ c2 = root.c[1]
+
+ self.assertEqual([c1,c2], list(root.c))
+ self.assertEqual(["c1", "c2"],
+ [ c.text for c in root.c ])
+
+ def test_setslice_elements(self):
+ Element = self.Element
+ root = Element("root")
+ root.c = ["c1", "c2"]
+
+ c1 = root.c[0]
+ c2 = root.c[1]
+
+ self.assertEqual([c1,c2], list(root.c))
+ self.assertEqual(["c1", "c2"],
+ [ c.text for c in root.c ])
+
+ root2 = Element("root2")
+ root2.el = [ "test", "test" ]
+ self.assertEqual(["test", "test"],
+ [ el.text for el in root2.el ])
+
+ root.c = [ root2.el, root2.el ]
+ self.assertEqual(["test", "test"],
+ [ c.text for c in root.c ])
+ self.assertEqual(["test", "test"],
+ [ el.text for el in root2.el ])
+
+ root.c[:] = [ c1, c2, c2, c1 ]
+ self.assertEqual(["c1", "c2", "c2", "c1"],
+ [ c.text for c in root.c ])
+
+ def test_setslice_partial(self):
+ Element = self.Element
+ root = Element("root")
+ l = ["c1", "c2", "c3", "c4"]
+ root.c = l
+
+ self.assertEqual(["c1", "c2", "c3", "c4"],
+ [ c.text for c in root.c ])
+ self.assertEqual(l,
+ [ c.text for c in root.c ])
+
+ new_slice = ["cA", "cB"]
+ l[1:2] = new_slice
+ root.c[1:2] = new_slice
+
+ self.assertEqual(["c1", "cA", "cB", "c3", "c4"], l)
+ self.assertEqual(["c1", "cA", "cB", "c3", "c4"],
+ [ c.text for c in root.c ])
+ self.assertEqual(l,
+ [ c.text for c in root.c ])
+
+ def test_setslice_insert(self):
+ Element = self.Element
+ root = Element("root")
+ l = ["c1", "c2", "c3", "c4"]
+ root.c = l
+
+ self.assertEqual(["c1", "c2", "c3", "c4"],
+ [ c.text for c in root.c ])
+ self.assertEqual(l,
+ [ c.text for c in root.c ])
+
+ new_slice = ["cA", "cB"]
+ l[1:1] = new_slice
+ root.c[1:1] = new_slice
+
+ self.assertEqual(["c1", "cA", "cB", "c2", "c3", "c4"], l)
+ self.assertEqual(["c1", "cA", "cB", "c2", "c3", "c4"],
+ [ c.text for c in root.c ])
+ self.assertEqual(l,
+ [ c.text for c in root.c ])
+
+ def test_setslice_insert_neg(self):
+ Element = self.Element
+ root = Element("root")
+ l = ["c1", "c2", "c3", "c4"]
+ root.c = l
+
+ self.assertEqual(["c1", "c2", "c3", "c4"],
+ [ c.text for c in root.c ])
+ self.assertEqual(l,
+ [ c.text for c in root.c ])
+
+ new_slice = ["cA", "cB"]
+ l[-2:-2] = new_slice
+ root.c[-2:-2] = new_slice
+
+ self.assertEqual(["c1", "c2", "cA", "cB", "c3", "c4"], l)
+ self.assertEqual(["c1", "c2", "cA", "cB", "c3", "c4"],
+ [ c.text for c in root.c ])
+ self.assertEqual(l,
+ [ c.text for c in root.c ])
+
+ def test_setslice_empty(self):
+ Element = self.Element
+ root = Element("root")
+
+ root.c = []
+ self.assertRaises(
+ AttributeError, getattr, root, 'c')
+
+ def test_setslice_partial_wrong_length(self):
+ Element = self.Element
+ root = Element("root")
+ l = ["c1", "c2", "c3", "c4"]
+ root.c = l
+
+ self.assertEqual(["c1", "c2", "c3", "c4"],
+ [ c.text for c in root.c ])
+ self.assertEqual(l,
+ [ c.text for c in root.c ])
+
+ new_slice = ["cA", "cB", "cC"]
+ self.assertRaises(
+ ValueError, operator.setitem,
+ l, slice(1,2,-1), new_slice)
+ self.assertRaises(
+ ValueError, operator.setitem,
+ root.c, slice(1,2,-1), new_slice)
+
+ def test_setslice_partial_neg(self):
+ Element = self.Element
+ root = Element("root")
+ l = ["c1", "c2", "c3", "c4"]
+ root.c = l
+
+ self.assertEqual(["c1", "c2", "c3", "c4"],
+ [ c.text for c in root.c ])
+ self.assertEqual(l,
+ [ c.text for c in root.c ])
+
+ new_slice = ["cA", "cB"]
+ l[-1:1:-1] = new_slice
+ root.c[-1:1:-1] = new_slice
+
+ self.assertEqual(["c1", "c2", "cB", "cA"], l)
+ self.assertEqual(["c1", "c2", "cB", "cA"],
+ [ c.text for c in root.c ])
+ self.assertEqual(l,
+ [ c.text for c in root.c ])
+
+ def test_setslice_partial_allneg(self):
+ Element = self.Element
+ root = Element("root")
+ l = ["c1", "c2", "c3", "c4"]
+ root.c = l
+
+ self.assertEqual(["c1", "c2", "c3", "c4"],
+ [ c.text for c in root.c ])
+ self.assertEqual(l,
+ [ c.text for c in root.c ])
+
+ new_slice = ["cA", "cB"]
+ l[-1:-4:-2] = new_slice
+ root.c[-1:-4:-2] = new_slice
+
+ self.assertEqual(["c1", "cB", "c3", "cA"], l)
+ self.assertEqual(["c1", "cB", "c3", "cA"],
+ [ c.text for c in root.c ])
+ self.assertEqual(l,
+ [ c.text for c in root.c ])
+
+ # other stuff
+
+ def test_setitem_index(self):
+ Element = self.Element
+ root = Element("root")
+ root['child'] = ['CHILD1', 'CHILD2']
+ self.assertEqual(["CHILD1", "CHILD2"],
+ [ c.text for c in root.child ])
+
+ self.assertRaises(IndexError, operator.setitem, root.child, -3, 'oob')
+ self.assertRaises(IndexError, operator.setitem, root.child, -300, 'oob')
+ self.assertRaises(IndexError, operator.setitem, root.child, 2, 'oob')
+ self.assertRaises(IndexError, operator.setitem, root.child, 200, 'oob')
+
+ root.child[0] = "child0"
+ root.child[-1] = "child-1"
+ self.assertEqual(["child0", "child-1"],
+ [ c.text for c in root.child ])
+
+ root.child[1] = "child1"
+ root.child[-2] = "child-2"
+ self.assertEqual(["child-2", "child1"],
+ [ c.text for c in root.child ])
+
+ def test_delitem_index(self):
+ # make sure strings are set as children
+ Element = self.Element
+ root = Element("root")
+ root['child'] = ['CHILD1', 'CHILD2', 'CHILD3', 'CHILD4']
+ self.assertEqual(["CHILD1", "CHILD2", "CHILD3", "CHILD4"],
+ [ c.text for c in root.child ])
+
+ del root.child[-1]
+ self.assertEqual(["CHILD1", "CHILD2", "CHILD3"],
+ [ c.text for c in root.child ])
+ del root.child[-2]
+ self.assertEqual(["CHILD1", "CHILD3"],
+ [ c.text for c in root.child ])
+ del root.child[0]
+ self.assertEqual(["CHILD3"],
+ [ c.text for c in root.child ])
+ del root.child[-1]
+ self.assertRaises(AttributeError, getattr, root, 'child')
+
+ def test_set_string(self):
+ # make sure strings are not handled as sequences
+ Element = self.Element
+ root = Element("root")
+ root.c = "TEST"
+ self.assertEqual(["TEST"],
+ [ c.text for c in root.c ])
+
+ def test_setitem_string(self):
+ # make sure strings are set as children
+ Element = self.Element
+ root = Element("root")
+ root["c"] = "TEST"
+ self.assertEqual(["TEST"],
+ [ c.text for c in root.c ])
+
+ def test_setitem_string_special(self):
+ # make sure 'text' etc. are set as children
+ Element = self.Element
+ root = Element("root")
+
+ root["text"] = "TEST"
+ self.assertEqual(["TEST"],
+ [ c.text for c in root["text"] ])
+
+ root["tail"] = "TEST"
+ self.assertEqual(["TEST"],
+ [ c.text for c in root["tail"] ])
+
+ root["pyval"] = "TEST"
+ self.assertEqual(["TEST"],
+ [ c.text for c in root["pyval"] ])
+
+ root["tag"] = "TEST"
+ self.assertEqual(["TEST"],
+ [ c.text for c in root["tag"] ])
+
+ def test_findall(self):
+ XML = self.XML
+ root = XML('<a><b><c/></b><b/><c><b/></c></a>')
+ self.assertEqual(1, len(root.findall("c")))
+ self.assertEqual(2, len(root.findall(".//c")))
+ self.assertEqual(3, len(root.findall(".//b")))
+ self.assertTrue(root.findall(".//b")[1] is root.getchildren()[1])
+
+ def test_findall_ns(self):
+ XML = self.XML
+ root = XML('<a xmlns:x="X" xmlns:y="Y"><x:b><c/></x:b><b/><c><x:b/><b/></c><b/></a>')
+ self.assertEqual(2, len(root.findall(".//{X}b")))
+ self.assertEqual(3, len(root.findall(".//b")))
+ self.assertEqual(2, len(root.findall("b")))
+
+ def test_build_tree(self):
+ root = self.Element('root')
+ root.a = 5
+ root.b = 6
+ self.assertTrue(isinstance(root, objectify.ObjectifiedElement))
+ self.assertTrue(isinstance(root.a, objectify.IntElement))
+ self.assertTrue(isinstance(root.b, objectify.IntElement))
+
+ def test_type_NoneType(self):
+ Element = self.Element
+ SubElement = self.etree.SubElement
+
+ nil_attr = XML_SCHEMA_NIL_ATTR
+ root = Element("{objectified}root")
+ SubElement(root, "{objectified}none")
+ SubElement(root, "{objectified}none", {nil_attr : "true"})
+ self.assertFalse(isinstance(root.none, objectify.NoneElement))
+ self.assertFalse(isinstance(root.none[0], objectify.NoneElement))
+ self.assertTrue(isinstance(root.none[1], objectify.NoneElement))
+ self.assertEqual(hash(root.none[1]), hash(None))
+ self.assertEqual(root.none[1], None)
+ self.assertFalse(root.none[1])
+
+ def test_data_element_NoneType(self):
+ value = objectify.DataElement(None)
+ self.assertTrue(isinstance(value, objectify.NoneElement))
+ self.assertEqual(value, None)
+ self.assertEqual(value.get(XML_SCHEMA_NIL_ATTR), "true")
+
+ def test_type_bool(self):
+ Element = self.Element
+ SubElement = self.etree.SubElement
+ root = Element("{objectified}root")
+ root.bool = True
+ self.assertEqual(root.bool, True)
+ self.assertEqual(root.bool + root.bool, True + True)
+ self.assertEqual(True + root.bool, True + root.bool)
+ self.assertEqual(root.bool * root.bool, True * True)
+ self.assertEqual(int(root.bool), int(True))
+ self.assertEqual(hash(root.bool), hash(True))
+ self.assertEqual(complex(root.bool), complex(True))
+ self.assertTrue(isinstance(root.bool, objectify.BoolElement))
+
+ root.bool = False
+ self.assertEqual(root.bool, False)
+ self.assertEqual(root.bool + root.bool, False + False)
+ self.assertEqual(False + root.bool, False + root.bool)
+ self.assertEqual(root.bool * root.bool, False * False)
+ self.assertEqual(int(root.bool), int(False))
+ self.assertEqual(hash(root.bool), hash(False))
+ self.assertEqual(complex(root.bool), complex(False))
+ self.assertTrue(isinstance(root.bool, objectify.BoolElement))
+
+ def test_data_element_bool(self):
+ value = objectify.DataElement(True)
+ self.assertTrue(isinstance(value, objectify.BoolElement))
+ self.assertEqual(value, True)
+
+ value = objectify.DataElement(False)
+ self.assertTrue(isinstance(value, objectify.BoolElement))
+ self.assertEqual(value, False)
+
+ def test_type_str(self):
+ Element = self.Element
+ SubElement = self.etree.SubElement
+ root = Element("{objectified}root")
+ root.s = "test"
+ self.assertTrue(isinstance(root.s, objectify.StringElement))
+
+ def test_type_str_intliteral(self):
+ Element = self.Element
+ SubElement = self.etree.SubElement
+ root = Element("{objectified}root")
+ root.s = "3"
+ self.assertTrue(isinstance(root.s, objectify.StringElement))
+
+ def test_type_str_floatliteral(self):
+ Element = self.Element
+ SubElement = self.etree.SubElement
+ root = Element("{objectified}root")
+ root.s = "3.72"
+ self.assertTrue(isinstance(root.s, objectify.StringElement))
+
+ def test_type_str_mul(self):
+ Element = self.Element
+ SubElement = self.etree.SubElement
+ root = Element("{objectified}root")
+ root.s = "test"
+
+ self.assertEqual("test" * 5, root.s * 5)
+ self.assertEqual(5 * "test", 5 * root.s)
+
+ self.assertRaises(TypeError, operator.mul, root.s, "honk")
+ self.assertRaises(TypeError, operator.mul, "honk", root.s)
+
+ def test_type_str_add(self):
+ Element = self.Element
+ SubElement = self.etree.SubElement
+ root = Element("{objectified}root")
+ root.s = "test"
+
+ s = "toast"
+ self.assertEqual("test" + s, root.s + s)
+ self.assertEqual(s + "test", s + root.s)
+
+ def test_type_str_mod(self):
+ s = "%d %f %s %r"
+ el = objectify.DataElement(s)
+ values = (1, 7.0, "abcd", None)
+ self.assertEqual(s % values, el % values)
+
+ s = "%d"
+ el = objectify.DataElement(s)
+ val = 5
+ self.assertEqual(s % val, el % val)
+
+ s = "%d %s"
+ el = objectify.DataElement(s)
+ val = 5
+ self.assertRaises(TypeError, el.__mod__, val)
+
+ s = ""
+ el = objectify.DataElement(s)
+ val = 5
+ self.assertRaises(TypeError, el.__mod__, val)
+
+ def test_type_str_hash(self):
+ v = "1"
+ el = objectify.DataElement(v)
+ self.assertEqual(hash(el), hash("1"))
+
+ def test_type_str_as_int(self):
+ v = "1"
+ el = objectify.DataElement(v)
+ self.assertEqual(int(el), 1)
+
+ def test_type_str_as_float(self):
+ v = "1"
+ el = objectify.DataElement(v)
+ self.assertEqual(float(el), 1)
+
+ def test_type_str_as_complex(self):
+ v = "1"
+ el = objectify.DataElement(v)
+ self.assertEqual(complex(el), 1)
+
+ def test_type_str_mod_data_elements(self):
+ s = "%d %f %s %r"
+ el = objectify.DataElement(s)
+ values = (objectify.DataElement(1),
+ objectify.DataElement(7.0),
+ objectify.DataElement("abcd"),
+ objectify.DataElement(None))
+ self.assertEqual(s % values, el % values)
+
+ def test_data_element_str(self):
+ value = objectify.DataElement("test")
+ self.assertTrue(isinstance(value, objectify.StringElement))
+ self.assertEqual(value, "test")
+
+ def test_data_element_str_intliteral(self):
+ value = objectify.DataElement("3")
+ self.assertTrue(isinstance(value, objectify.StringElement))
+ self.assertEqual(value, "3")
+
+ def test_data_element_str_floatliteral(self):
+ value = objectify.DataElement("3.20")
+ self.assertTrue(isinstance(value, objectify.StringElement))
+ self.assertEqual(value, "3.20")
+
+ def test_type_ustr(self):
+ Element = self.Element
+ SubElement = self.etree.SubElement
+ root = Element("{objectified}root")
+ root.s = _str("test")
+ self.assertTrue(isinstance(root.s, objectify.StringElement))
+
+ def test_type_ustr_intliteral(self):
+ Element = self.Element
+ SubElement = self.etree.SubElement
+ root = Element("{objectified}root")
+ root.s = _str("3")
+ self.assertTrue(isinstance(root.s, objectify.StringElement))
+
+ def test_type_ustr_floatliteral(self):
+ Element = self.Element
+ SubElement = self.etree.SubElement
+ root = Element("{objectified}root")
+ root.s = _str("3.72")
+ self.assertTrue(isinstance(root.s, objectify.StringElement))
+
+ def test_type_ustr_mul(self):
+ Element = self.Element
+ SubElement = self.etree.SubElement
+ root = Element("{objectified}root")
+ root.s = _str("test")
+
+ self.assertEqual(_str("test") * 5, root.s * 5)
+ self.assertEqual(5 * _str("test"), 5 * root.s)
+
+ self.assertRaises(TypeError, operator.mul, root.s, _str("honk"))
+ self.assertRaises(TypeError, operator.mul, _str("honk"), root.s)
+
+ def test_type_ustr_add(self):
+ Element = self.Element
+ SubElement = self.etree.SubElement
+ root = Element("{objectified}root")
+ root.s = _str("test")
+
+ s = _str("toast")
+ self.assertEqual(_str("test") + s, root.s + s)
+ self.assertEqual(s + _str("test"), s + root.s)
+
+ def test_data_element_ustr(self):
+ value = objectify.DataElement(_str("test"))
+ self.assertTrue(isinstance(value, objectify.StringElement))
+ self.assertEqual(value, _str("test"))
+
+ def test_data_element_ustr_intliteral(self):
+ value = objectify.DataElement("3")
+ self.assertTrue(isinstance(value, objectify.StringElement))
+ self.assertEqual(value, _str("3"))
+
+ def test_data_element_ustr_floatliteral(self):
+ value = objectify.DataElement(_str("3.20"))
+ self.assertTrue(isinstance(value, objectify.StringElement))
+ self.assertEqual(value, _str("3.20"))
+
+ def test_type_int(self):
+ Element = self.Element
+ root = Element("{objectified}root")
+ root.none = 5
+ self.assertTrue(isinstance(root.none, objectify.IntElement))
+ self.assertEqual(5, root.none.__index__())
+
+ def test_data_element_int(self):
+ value = objectify.DataElement(5)
+ self.assertTrue(isinstance(value, objectify.IntElement))
+ self.assertEqual(value, 5)
+
+ def test_data_element_int_hash(self):
+ value = objectify.DataElement(123)
+ self.assertEqual(hash(value), hash(123))
+
+ def test_type_float(self):
+ Element = self.Element
+ SubElement = self.etree.SubElement
+ root = Element("{objectified}root")
+ root.none = 5.5
+ self.assertTrue(isinstance(root.none, objectify.FloatElement))
+
+ def test_data_element_float(self):
+ value = objectify.DataElement(5.5)
+ self.assertTrue(isinstance(value, objectify.FloatElement))
+ self.assertEqual(value, 5.5)
+
+ def test_data_element_float_hash(self):
+ value = objectify.DataElement(5.5)
+ self.assertEqual(hash(value), hash(5.5))
+
+ def test_type_float_precision(self):
+ # test not losing precision by shortened float str() value
+ # repr(2.305064300557): '2.305064300557'
+ # str(2.305064300557): '2.30506430056'
+ # "%57.54f" % 2.305064300557:
+ # ' 2.305064300556999956626214043353684246540069580078125000'
+ Element = self.Element
+ root = Element("{objectified}root")
+ s = "2.305064300557"
+ root.f = float(s)
+ self.assertTrue(isinstance(root.f, objectify.FloatElement))
+ self.assertEqual(root.f.text, s)
+ self.assertEqual(root.f.pyval, float(s))
+
+ def test_type_float_instantiation_precision(self):
+ # test precision preservation for FloatElement instantiation
+ s = "2.305064300557"
+ self.assertEqual(objectify.FloatElement(s), float(s))
+
+ def test_type_float_precision_consistency(self):
+ # test consistent FloatElement values for the different instantiation
+ # possibilities
+ Element = self.Element
+ root = Element("{objectified}root")
+ s = "2.305064300557"
+ f = float(s)
+ float_elem = objectify.FloatElement(s)
+ float_data_elem = objectify.DataElement(f)
+ root.float_child = float(f)
+ self.assertTrue(f == float_elem == float_data_elem == root.float_child)
+
+ def test_data_element_float_precision(self):
+ # test not losing precision by shortened float str() value
+ f = 2305064300557.0
+ value = objectify.DataElement(f)
+ self.assertTrue(isinstance(value, objectify.FloatElement))
+ self.assertEqual(value, f)
+
+ def test_data_element_float_hash_repr(self):
+ # test not losing precision by shortened float str() value
+ f = 2305064300557.0
+ value = objectify.DataElement(f)
+ self.assertEqual(hash(value), hash(f))
+
+ def test_data_element_xsitypes(self):
+ for xsi, objclass in xsitype2objclass.items():
+ # 1 is a valid value for all ObjectifiedDataElement classes
+ pyval = 1
+ value = objectify.DataElement(pyval, _xsi=xsi)
+ self.assertTrue(isinstance(value, objclass),
+ "DataElement(%s, _xsi='%s') returns %s, expected %s"
+ % (pyval, xsi, type(value), objclass))
+
+ def test_data_element_xsitypes_xsdprefixed(self):
+ for xsi, objclass in xsitype2objclass.items():
+ # 1 is a valid value for all ObjectifiedDataElement classes
+ pyval = 1
+ value = objectify.DataElement(pyval, _xsi="xsd:%s" % xsi)
+ self.assertTrue(isinstance(value, objclass),
+ "DataElement(%s, _xsi='%s') returns %s, expected %s"
+ % (pyval, xsi, type(value), objclass))
+
+ def test_data_element_xsitypes_prefixed(self):
+ for xsi, objclass in xsitype2objclass.items():
+ # 1 is a valid value for all ObjectifiedDataElement classes
+ self.assertRaises(ValueError, objectify.DataElement, 1,
+ _xsi="foo:%s" % xsi)
+
+ def test_data_element_pytypes(self):
+ for pytype, objclass in pytype2objclass.items():
+ # 1 is a valid value for all ObjectifiedDataElement classes
+ pyval = 1
+ value = objectify.DataElement(pyval, _pytype=pytype)
+ self.assertTrue(isinstance(value, objclass),
+ "DataElement(%s, _pytype='%s') returns %s, expected %s"
+ % (pyval, pytype, type(value), objclass))
+
+ def test_data_element_pytype_none(self):
+ pyval = 1
+ pytype = "NoneType"
+ objclass = objectify.NoneElement
+ value = objectify.DataElement(pyval, _pytype=pytype)
+ self.assertTrue(isinstance(value, objclass),
+ "DataElement(%s, _pytype='%s') returns %s, expected %s"
+ % (pyval, pytype, type(value), objclass))
+ self.assertEqual(value.text, None)
+ self.assertEqual(value.pyval, None)
+
+ def test_data_element_pytype_none_compat(self):
+ # pre-2.0 lxml called NoneElement "none"
+ pyval = 1
+ pytype = "none"
+ objclass = objectify.NoneElement
+ value = objectify.DataElement(pyval, _pytype=pytype)
+ self.assertTrue(isinstance(value, objclass),
+ "DataElement(%s, _pytype='%s') returns %s, expected %s"
+ % (pyval, pytype, type(value), objclass))
+ self.assertEqual(value.text, None)
+ self.assertEqual(value.pyval, None)
+
+ def test_type_unregistered(self):
+ Element = self.Element
+ SubElement = self.etree.SubElement
+ class MyFloat(float):
+ pass
+ root = Element("{objectified}root")
+ root.myfloat = MyFloat(5.5)
+ self.assertTrue(isinstance(root.myfloat, objectify.FloatElement))
+ self.assertEqual(root.myfloat.get(objectify.PYTYPE_ATTRIBUTE), None)
+
+ def test_data_element_unregistered(self):
+ class MyFloat(float):
+ pass
+ value = objectify.DataElement(MyFloat(5.5))
+ self.assertTrue(isinstance(value, objectify.FloatElement))
+ self.assertEqual(value, 5.5)
+ self.assertEqual(value.get(objectify.PYTYPE_ATTRIBUTE), None)
+
+ def test_schema_types(self):
+ XML = self.XML
+ root = XML('''\
+ <root xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+ <b xsi:type="boolean">true</b>
+ <b xsi:type="boolean">false</b>
+ <b xsi:type="boolean">1</b>
+ <b xsi:type="boolean">0</b>
+
+ <f xsi:type="float">5</f>
+ <f xsi:type="double">5</f>
+
+ <s xsi:type="string">5</s>
+ <s xsi:type="normalizedString">5</s>
+ <s xsi:type="token">5</s>
+ <s xsi:type="language">5</s>
+ <s xsi:type="Name">5</s>
+ <s xsi:type="NCName">5</s>
+ <s xsi:type="ID">5</s>
+ <s xsi:type="IDREF">5</s>
+ <s xsi:type="ENTITY">5</s>
+ <s xsi:type="NMTOKEN">5</s>
+
+ <l xsi:type="integer">5</l>
+ <l xsi:type="nonPositiveInteger">5</l>
+ <l xsi:type="negativeInteger">5</l>
+ <l xsi:type="long">5</l>
+ <l xsi:type="nonNegativeInteger">5</l>
+ <l xsi:type="unsignedLong">5</l>
+ <l xsi:type="unsignedInt">5</l>
+ <l xsi:type="positiveInteger">5</l>
+
+ <i xsi:type="int">5</i>
+ <i xsi:type="short">5</i>
+ <i xsi:type="byte">5</i>
+ <i xsi:type="unsignedShort">5</i>
+ <i xsi:type="unsignedByte">5</i>
+
+ <n xsi:nil="true"/>
+ </root>
+ ''')
+
+ for b in root.b:
+ self.assertTrue(isinstance(b, objectify.BoolElement))
+ self.assertEqual(True, root.b[0])
+ self.assertEqual(False, root.b[1])
+ self.assertEqual(True, root.b[2])
+ self.assertEqual(False, root.b[3])
+
+ for f in root.f:
+ self.assertTrue(isinstance(f, objectify.FloatElement))
+ self.assertEqual(5, f)
+
+ for s in root.s:
+ self.assertTrue(isinstance(s, objectify.StringElement))
+ self.assertEqual("5", s)
+
+ for i in root.i:
+ self.assertTrue(isinstance(i, objectify.IntElement))
+ self.assertEqual(5, i)
+
+ for l in root.l:
+ self.assertTrue(isinstance(l, objectify.IntElement))
+ self.assertEqual(5, l)
+
+ self.assertTrue(isinstance(root.n, objectify.NoneElement))
+ self.assertEqual(None, root.n)
+
+ def test_schema_types_prefixed(self):
+ XML = self.XML
+ root = XML('''\
+ <root xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xmlns:xsd="http://www.w3.org/2001/XMLSchema">
+ <b xsi:type="xsd:boolean">true</b>
+ <b xsi:type="xsd:boolean">false</b>
+ <b xsi:type="xsd:boolean">1</b>
+ <b xsi:type="xsd:boolean">0</b>
+
+ <f xsi:type="xsd:float">5</f>
+ <f xsi:type="xsd:double">5</f>
+
+ <s xsi:type="xsd:string">5</s>
+ <s xsi:type="xsd:normalizedString">5</s>
+ <s xsi:type="xsd:token">5</s>
+ <s xsi:type="xsd:language">5</s>
+ <s xsi:type="xsd:Name">5</s>
+ <s xsi:type="xsd:NCName">5</s>
+ <s xsi:type="xsd:ID">5</s>
+ <s xsi:type="xsd:IDREF">5</s>
+ <s xsi:type="xsd:ENTITY">5</s>
+ <s xsi:type="xsd:NMTOKEN">5</s>
+
+ <l xsi:type="xsd:integer">5</l>
+ <l xsi:type="xsd:nonPositiveInteger">5</l>
+ <l xsi:type="xsd:negativeInteger">5</l>
+ <l xsi:type="xsd:long">5</l>
+ <l xsi:type="xsd:nonNegativeInteger">5</l>
+ <l xsi:type="xsd:unsignedLong">5</l>
+ <l xsi:type="xsd:unsignedInt">5</l>
+ <l xsi:type="xsd:positiveInteger">5</l>
+
+ <i xsi:type="xsd:int">5</i>
+ <i xsi:type="xsd:short">5</i>
+ <i xsi:type="xsd:byte">5</i>
+ <i xsi:type="xsd:unsignedShort">5</i>
+ <i xsi:type="xsd:unsignedByte">5</i>
+
+ <n xsi:nil="true"/>
+ </root>
+ ''')
+
+ for b in root.b:
+ self.assertTrue(isinstance(b, objectify.BoolElement))
+ self.assertEqual(True, root.b[0])
+ self.assertEqual(False, root.b[1])
+ self.assertEqual(True, root.b[2])
+ self.assertEqual(False, root.b[3])
+
+ for f in root.f:
+ self.assertTrue(isinstance(f, objectify.FloatElement))
+ self.assertEqual(5, f)
+
+ for s in root.s:
+ self.assertTrue(isinstance(s, objectify.StringElement))
+ self.assertEqual("5", s)
+
+ for i in root.i:
+ self.assertTrue(isinstance(i, objectify.IntElement))
+ self.assertEqual(5, i)
+
+ for l in root.l:
+ self.assertTrue(isinstance(l, objectify.IntElement))
+ self.assertEqual(5, l)
+
+ self.assertTrue(isinstance(root.n, objectify.NoneElement))
+ self.assertEqual(None, root.n)
+
+ def test_type_str_sequence(self):
+ XML = self.XML
+ root = XML(_bytes('<root><b>why</b><b>try</b></root>'))
+ strs = [ str(s) for s in root.b ]
+ self.assertEqual(["why", "try"],
+ strs)
+
+ def test_type_str_cmp(self):
+ XML = self.XML
+ root = XML(_bytes('<root><b>test</b><b>taste</b><b></b><b/></root>'))
+ self.assertFalse(root.b[0] < root.b[1])
+ self.assertFalse(root.b[0] <= root.b[1])
+ self.assertFalse(root.b[0] == root.b[1])
+
+ self.assertTrue(root.b[0] != root.b[1])
+ self.assertTrue(root.b[0] >= root.b[1])
+ self.assertTrue(root.b[0] > root.b[1])
+
+ self.assertEqual(root.b[0], "test")
+ self.assertEqual("test", root.b[0])
+
+ self.assertEqual("", root.b[2])
+ self.assertEqual(root.b[2], "")
+ self.assertEqual("", root.b[3])
+ self.assertEqual(root.b[3], "")
+ self.assertEqual(root.b[2], root.b[3])
+
+ root.b = "test"
+ self.assertTrue(root.b)
+ root.b = ""
+ self.assertFalse(root.b)
+ self.assertEqual(root.b, "")
+ self.assertEqual("", root.b)
+
+ def test_type_int_cmp(self):
+ XML = self.XML
+ root = XML(_bytes('<root><b>5</b><b>6</b></root>'))
+ self.assertTrue(root.b[0] < root.b[1])
+ self.assertTrue(root.b[0] <= root.b[1])
+ self.assertTrue(root.b[0] != root.b[1])
+
+ self.assertFalse(root.b[0] == root.b[1])
+ self.assertFalse(root.b[0] >= root.b[1])
+ self.assertFalse(root.b[0] > root.b[1])
+
+ self.assertEqual(root.b[0], 5)
+ self.assertEqual(5, root.b[0])
+ self.assertNotEqual(root.b[0], "5")
+
+ root.b = 5
+ self.assertTrue(root.b)
+ root.b = 0
+ self.assertFalse(root.b)
+
+ # float + long share the NumberElement implementation with int
+
+ def test_type_bool_cmp(self):
+ XML = self.XML
+ root = XML(_bytes('<root><b>false</b><b>true</b></root>'))
+ self.assertTrue(root.b[0] < root.b[1])
+ self.assertTrue(root.b[0] <= root.b[1])
+ self.assertTrue(root.b[0] != root.b[1])
+
+ self.assertFalse(root.b[0] == root.b[1])
+ self.assertFalse(root.b[0] >= root.b[1])
+ self.assertFalse(root.b[0] > root.b[1])
+
+ self.assertFalse(root.b[0])
+ self.assertTrue(root.b[1])
+
+ self.assertEqual(root.b[0], False)
+ self.assertEqual(False, root.b[0])
+ self.assertTrue(root.b[0] < 5)
+ self.assertTrue(5 > root.b[0])
+
+ root.b = True
+ self.assertTrue(root.b)
+ root.b = False
+ self.assertFalse(root.b)
+
+ def test_type_none_cmp(self):
+ XML = self.XML
+ root = XML(_bytes("""
+ <root xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+ <b xsi:nil="true"></b><b xsi:nil="true"/>
+ </root>"""))
+ self.assertTrue(root.b[0] == root.b[1])
+ self.assertFalse(root.b[0])
+ self.assertEqual(root.b[0], None)
+ self.assertEqual(None, root.b[0])
+
+ # doesn't work in Py3:
+
+ #for comparison in ["abc", 5, 7.3, True, [], ()]:
+ # none = root.b[1]
+ # self.assertTrue(none < comparison, "%s (%s) should be < %s" %
+ # (none, type(none), comparison) )
+ # self.assertTrue(comparison > none, "%s should be > %s (%s)" %
+ # (comparison, none, type(none)) )
+
+ def test_dataelement_xsi(self):
+ el = objectify.DataElement(1, _xsi="string")
+ self.assertEqual(
+ el.get(XML_SCHEMA_INSTANCE_TYPE_ATTR),
+ 'xsd:string')
+
+ def test_dataelement_xsi_nsmap(self):
+ el = objectify.DataElement(1, _xsi="string",
+ nsmap={'schema': XML_SCHEMA_NS})
+ self.assertEqual(
+ el.get(XML_SCHEMA_INSTANCE_TYPE_ATTR),
+ 'schema:string')
+
+ def test_dataelement_xsi_prefix_error(self):
+ self.assertRaises(ValueError, objectify.DataElement, 1,
+ _xsi="foo:string")
+
+ def test_pytype_annotation(self):
+ XML = self.XML
+ root = XML(_bytes('''\
+ <a xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xmlns:py="http://codespeak.net/lxml/objectify/pytype">
+ <b>5</b>
+ <b>test</b>
+ <c>1.1</c>
+ <c>\uF8D2</c>
+ <x>true</x>
+ <n xsi:nil="true" />
+ <n></n>
+ <b xsi:type="double">5</b>
+ <b xsi:type="float">5</b>
+ <s xsi:type="string">23</s>
+ <s py:pytype="str">42</s>
+ <f py:pytype="float">300</f>
+ <l py:pytype="long">2</l>
+ <t py:pytype="TREE"></t>
+ </a>
+ '''))
+ objectify.annotate(root)
+
+ child_types = [ c.get(objectify.PYTYPE_ATTRIBUTE)
+ for c in root.iterchildren() ]
+ self.assertEqual("int", child_types[ 0])
+ self.assertEqual("str", child_types[ 1])
+ self.assertEqual("float", child_types[ 2])
+ self.assertEqual("str", child_types[ 3])
+ self.assertEqual("bool", child_types[ 4])
+ self.assertEqual("NoneType", child_types[ 5])
+ self.assertEqual(None, child_types[ 6])
+ self.assertEqual("float", child_types[ 7])
+ self.assertEqual("float", child_types[ 8])
+ self.assertEqual("str", child_types[ 9])
+ self.assertEqual("int", child_types[10])
+ self.assertEqual("int", child_types[11])
+ self.assertEqual("int", child_types[12])
+ self.assertEqual(None, child_types[13])
+
+ self.assertEqual("true", root.n.get(XML_SCHEMA_NIL_ATTR))
+
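+ # Complementary API, as a comment-only note (assumption: documented lxml
+ # behaviour, not exercised in this test): objectify.deannotate(root,
+ # pytype=True, xsi=True) strips the attributes that objectify.annotate()
+ # adds, and etree.cleanup_namespaces(root) drops the unused declarations.
+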
+ def test_pytype_annotation_empty(self):
+ XML = self.XML
+ root = XML(_bytes('''\
+ <a xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xmlns:py="http://codespeak.net/lxml/objectify/pytype">
+ <n></n>
+ </a>
+ '''))
+ objectify.annotate(root)
+
+ child_types = [ c.get(objectify.PYTYPE_ATTRIBUTE)
+ for c in root.iterchildren() ]
+ self.assertEqual(None, child_types[0])
+
+ objectify.annotate(root, empty_pytype="str")
+
+ child_types = [ c.get(objectify.PYTYPE_ATTRIBUTE)
+ for c in root.iterchildren() ]
+ self.assertEqual("str", child_types[0])
+
+ def test_pytype_annotation_use_old(self):
+ XML = self.XML
+ root = XML(_bytes('''\
+ <a xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xmlns:py="http://codespeak.net/lxml/objectify/pytype">
+ <b>5</b>
+ <b>test</b>
+ <c>1.1</c>
+ <c>\uF8D2</c>
+ <x>true</x>
+ <n xsi:nil="true" />
+ <n></n>
+ <b xsi:type="double">5</b>
+ <b xsi:type="float">5</b>
+ <s xsi:type="string">23</s>
+ <s py:pytype="str">42</s>
+ <f py:pytype="float">300</f>
+ <l py:pytype="long">2</l>
+ <t py:pytype="TREE"></t>
+ </a>
+ '''))
+ objectify.annotate(root, ignore_old=False)
+
+ child_types = [ c.get(objectify.PYTYPE_ATTRIBUTE)
+ for c in root.iterchildren() ]
+ self.assertEqual("int", child_types[ 0])
+ self.assertEqual("str", child_types[ 1])
+ self.assertEqual("float", child_types[ 2])
+ self.assertEqual("str", child_types[ 3])
+ self.assertEqual("bool", child_types[ 4])
+ self.assertEqual("NoneType", child_types[ 5])
+ self.assertEqual(None, child_types[ 6])
+ self.assertEqual("float", child_types[ 7])
+ self.assertEqual("float", child_types[ 8])
+ self.assertEqual("str", child_types[ 9])
+ self.assertEqual("str", child_types[10])
+ self.assertEqual("float", child_types[11])
+ self.assertEqual("int", child_types[12])
+ self.assertEqual(TREE_PYTYPE, child_types[13])
+
+ self.assertEqual("true", root.n.get(XML_SCHEMA_NIL_ATTR))
+
+ def test_pytype_xsitype_annotation(self):
+ XML = self.XML
+ root = XML(_bytes('''\
+ <a xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xmlns:py="http://codespeak.net/lxml/objectify/pytype">
+ <b>5</b>
+ <b>test</b>
+ <c>1.1</c>
+ <c>\uF8D2</c>
+ <x>true</x>
+ <n xsi:nil="true" />
+ <n></n>
+ <b xsi:type="double">5</b>
+ <b xsi:type="float">5</b>
+ <s xsi:type="string">23</s>
+ <s py:pytype="str">42</s>
+ <f py:pytype="float">300</f>
+ <l py:pytype="long">2</l>
+ <t py:pytype="TREE"></t>
+ </a>
+ '''))
+ objectify.annotate(root, ignore_old=False, ignore_xsi=False,
+ annotate_xsi=1, annotate_pytype=1)
+
+ # check py annotations
+ child_types = [ c.get(objectify.PYTYPE_ATTRIBUTE)
+ for c in root.iterchildren() ]
+ self.assertEqual("int", child_types[ 0])
+ self.assertEqual("str", child_types[ 1])
+ self.assertEqual("float", child_types[ 2])
+ self.assertEqual("str", child_types[ 3])
+ self.assertEqual("bool", child_types[ 4])
+ self.assertEqual("NoneType", child_types[ 5])
+ self.assertEqual(None, child_types[ 6])
+ self.assertEqual("float", child_types[ 7])
+ self.assertEqual("float", child_types[ 8])
+ self.assertEqual("str", child_types[ 9])
+ self.assertEqual("str", child_types[10])
+ self.assertEqual("float", child_types[11])
+ self.assertEqual("int", child_types[12])
+ self.assertEqual(TREE_PYTYPE, child_types[13])
+
+ self.assertEqual("true", root.n.get(XML_SCHEMA_NIL_ATTR))
+
+ child_xsitypes = [ c.get(XML_SCHEMA_INSTANCE_TYPE_ATTR)
+ for c in root.iterchildren() ]
+
+ # check xsi annotations
+ child_types = [ c.get(XML_SCHEMA_INSTANCE_TYPE_ATTR)
+ for c in root.iterchildren() ]
+ self.assertEqual("xsd:integer", child_types[ 0])
+ self.assertEqual("xsd:string", child_types[ 1])
+ self.assertEqual("xsd:double", child_types[ 2])
+ self.assertEqual("xsd:string", child_types[ 3])
+ self.assertEqual("xsd:boolean", child_types[ 4])
+ self.assertEqual(None, child_types[ 5])
+ self.assertEqual(None, child_types[ 6])
+ self.assertEqual("xsd:double", child_types[ 7])
+ self.assertEqual("xsd:float", child_types[ 8])
+ self.assertEqual("xsd:string", child_types[ 9])
+ self.assertEqual("xsd:string", child_types[10])
+ self.assertEqual("xsd:double", child_types[11])
+ self.assertEqual("xsd:integer", child_types[12])
+ self.assertEqual(None, child_types[13])
+
+ self.assertEqual("true", root.n.get(XML_SCHEMA_NIL_ATTR))
+
+ def test_xsiannotate_use_old(self):
+ XML = self.XML
+ root = XML(_bytes('''\
+ <a xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xmlns:py="http://codespeak.net/lxml/objectify/pytype">
+ <b>5</b>
+ <b>test</b>
+ <c>1.1</c>
+ <c>\uF8D2</c>
+ <x>true</x>
+ <n xsi:nil="true" />
+ <n></n>
+ <b xsi:type="double">5</b>
+ <b xsi:type="float">5</b>
+ <s xsi:type="string">23</s>
+ <s py:pytype="str">42</s>
+ <f py:pytype="float">300</f>
+ <l py:pytype="long">2</l>
+ <t py:pytype="TREE"></t>
+ </a>
+ '''))
+ objectify.xsiannotate(root, ignore_old=False)
+
+ child_types = [ c.get(XML_SCHEMA_INSTANCE_TYPE_ATTR)
+ for c in root.iterchildren() ]
+ self.assertEqual("xsd:integer", child_types[ 0])
+ self.assertEqual("xsd:string", child_types[ 1])
+ self.assertEqual("xsd:double", child_types[ 2])
+ self.assertEqual("xsd:string", child_types[ 3])
+ self.assertEqual("xsd:boolean", child_types[ 4])
+ self.assertEqual(None, child_types[ 5])
+ self.assertEqual(None, child_types[ 6])
+ self.assertEqual("xsd:double", child_types[ 7])
+ self.assertEqual("xsd:float", child_types[ 8])
+ self.assertEqual("xsd:string", child_types[ 9])
+ self.assertEqual("xsd:string", child_types[10])
+ self.assertEqual("xsd:double", child_types[11])
+ self.assertEqual("xsd:integer", child_types[12])
+ self.assertEqual(None, child_types[13])
+
+ def test_pyannotate_ignore_old(self):
+ XML = self.XML
+ root = XML(_bytes('''\
+ <a xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xmlns:py="http://codespeak.net/lxml/objectify/pytype">
+ <b>5</b>
+ <b>test</b>
+ <c>1.1</c>
+ <c>\uF8D2</c>
+ <x>true</x>
+ <n xsi:nil="true" />
+ <n></n>
+ <b xsi:type="double">5</b>
+ <b xsi:type="float">5</b>
+ <s xsi:type="string">23</s>
+ <s py:pytype="str">42</s>
+ <f py:pytype="float">300</f>
+ <l py:pytype="long">2</l>
+ <t py:pytype="TREE"></t>
+ </a>
+ '''))
+ objectify.pyannotate(root, ignore_old=True)
+
+ child_types = [ c.get(objectify.PYTYPE_ATTRIBUTE)
+ for c in root.iterchildren() ]
+ self.assertEqual("int", child_types[ 0])
+ self.assertEqual("str", child_types[ 1])
+ self.assertEqual("float", child_types[ 2])
+ self.assertEqual("str", child_types[ 3])
+ self.assertEqual("bool", child_types[ 4])
+ self.assertEqual("NoneType", child_types[ 5])
+ self.assertEqual(None, child_types[ 6])
+ self.assertEqual("float", child_types[ 7])
+ self.assertEqual("float", child_types[ 8])
+ self.assertEqual("str", child_types[ 9])
+ self.assertEqual("int", child_types[10])
+ self.assertEqual("int", child_types[11])
+ self.assertEqual("int", child_types[12])
+ self.assertEqual(None, child_types[13])
+
+ self.assertEqual("true", root.n.get(XML_SCHEMA_NIL_ATTR))
+
+ def test_pyannotate_empty(self):
+ XML = self.XML
+ root = XML('''\
+ <a xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xmlns:py="http://codespeak.net/lxml/objectify/pytype">
+ <n></n>
+ </a>
+ ''')
+ objectify.pyannotate(root)
+
+ child_types = [ c.get(objectify.PYTYPE_ATTRIBUTE)
+ for c in root.iterchildren() ]
+ self.assertEqual(None, child_types[0])
+
+ objectify.annotate(root, empty_pytype="str")
+
+ child_types = [ c.get(objectify.PYTYPE_ATTRIBUTE)
+ for c in root.iterchildren() ]
+ self.assertEqual("str", child_types[0])
+
+ def test_pyannotate_use_old(self):
+ XML = self.XML
+ root = XML('''\
+ <a xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xmlns:py="http://codespeak.net/lxml/objectify/pytype">
+ <b>5</b>
+ <b>test</b>
+ <c>1.1</c>
+ <c>\uF8D2</c>
+ <x>true</x>
+ <n xsi:nil="true" />
+ <n></n>
+ <b xsi:type="double">5</b>
+ <b xsi:type="float">5</b>
+ <s xsi:type="string">23</s>
+ <s py:pytype="str">42</s>
+ <f py:pytype="float">300</f>
+ <l py:pytype="long">2</l>
+ <t py:pytype="TREE"></t>
+ </a>
+ ''')
+ objectify.pyannotate(root)
+
+ child_types = [ c.get(objectify.PYTYPE_ATTRIBUTE)
+ for c in root.iterchildren() ]
+ self.assertEqual("int", child_types[ 0])
+ self.assertEqual("str", child_types[ 1])
+ self.assertEqual("float", child_types[ 2])
+ self.assertEqual("str", child_types[ 3])
+ self.assertEqual("bool", child_types[ 4])
+ self.assertEqual("NoneType", child_types[ 5])
+ self.assertEqual(None, child_types[ 6])
+ self.assertEqual("float", child_types[ 7])
+ self.assertEqual("float", child_types[ 8])
+ self.assertEqual("str", child_types[ 9])
+ self.assertEqual("str", child_types[10])
+ self.assertEqual("float", child_types[11])
+ self.assertEqual("int", child_types[12])
+ self.assertEqual(TREE_PYTYPE, child_types[13])
+
+ self.assertEqual("true", root.n.get(XML_SCHEMA_NIL_ATTR))
+
+ def test_xsiannotate_ignore_old(self):
+ XML = self.XML
+ root = XML(_bytes('''\
+ <a xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xmlns:py="http://codespeak.net/lxml/objectify/pytype">
+ <b>5</b>
+ <b>test</b>
+ <c>1.1</c>
+ <c>\uF8D2</c>
+ <x>true</x>
+ <n xsi:nil="true" />
+ <n></n>
+ <b xsi:type="double">5</b>
+ <b xsi:type="float">5</b>
+ <s xsi:type="string">23</s>
+ <s py:pytype="str">42</s>
+ <f py:pytype="float">300</f>
+ <l py:pytype="long">2</l>
+ <t py:pytype="TREE"></t>
+ </a>
+ '''))
+ objectify.xsiannotate(root, ignore_old=True)
+
+ child_types = [ c.get(XML_SCHEMA_INSTANCE_TYPE_ATTR)
+ for c in root.iterchildren() ]
+ self.assertEqual("xsd:integer", child_types[ 0])
+ self.assertEqual("xsd:string", child_types[ 1])
+ self.assertEqual("xsd:double", child_types[ 2])
+ self.assertEqual("xsd:string", child_types[ 3])
+ self.assertEqual("xsd:boolean", child_types[ 4])
+ self.assertEqual(None, child_types[ 5])
+ self.assertEqual(None, child_types[ 6])
+ self.assertEqual("xsd:integer", child_types[ 7])
+ self.assertEqual("xsd:integer", child_types[ 8])
+ self.assertEqual("xsd:integer", child_types[ 9])
+ self.assertEqual("xsd:string", child_types[10])
+ self.assertEqual("xsd:double", child_types[11])
+ self.assertEqual("xsd:integer", child_types[12])
+ self.assertEqual(None, child_types[13])
+
+ self.assertEqual("true", root.n.get(XML_SCHEMA_NIL_ATTR))
+
+ def test_deannotate(self):
+ XML = self.XML
+ root = XML(_bytes('''\
+ <a xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xmlns:py="http://codespeak.net/lxml/objectify/pytype">
+ <b>5</b>
+ <b>test</b>
+ <c>1.1</c>
+ <c>\uF8D2</c>
+ <x>true</x>
+ <n xsi:nil="true" />
+ <n></n>
+ <b xsi:type="double">5</b>
+ <b xsi:type="float">5</b>
+ <s xsi:type="string">23</s>
+ <s py:pytype="str">42</s>
+ <f py:pytype="float">300</f>
+ <l py:pytype="long">2</l>
+ <t py:pytype="TREE"></t>
+ </a>
+ '''))
+ objectify.deannotate(root)
+
+ for c in root.getiterator():
+ self.assertEqual(None, c.get(XML_SCHEMA_INSTANCE_TYPE_ATTR))
+ self.assertEqual(None, c.get(objectify.PYTYPE_ATTRIBUTE))
+
+ self.assertEqual("true", root.n.get(XML_SCHEMA_NIL_ATTR))
+
+ def test_xsinil_deannotate(self):
+ XML = self.XML
+ root = XML(_bytes('''\
+ <a xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xmlns:py="http://codespeak.net/lxml/objectify/pytype">
+ <b>5</b>
+ <b>test</b>
+ <c>1.1</c>
+ <c>\uF8D2</c>
+ <x>true</x>
+ <n xsi:nil="true" />
+ <n></n>
+ <b xsi:type="double">5</b>
+ <b xsi:type="float">5</b>
+ <s xsi:type="string">23</s>
+ <s py:pytype="str">42</s>
+ <f py:pytype="float">300</f>
+ <l py:pytype="long">2</l>
+ <t py:pytype="TREE"></t>
+ </a>
+ '''))
+ objectify.annotate(
+ root, ignore_old=False, ignore_xsi=False, annotate_xsi=True,
+ empty_pytype='str', empty_type='string')
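+ # strip only the xsi:nil attributes; keep the pytype and xsi:type annotations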
+ objectify.deannotate(root, pytype=False, xsi=False, xsi_nil=True)
+
+ child_types = [ c.get(XML_SCHEMA_INSTANCE_TYPE_ATTR)
+ for c in root.iterchildren() ]
+ self.assertEqual("xsd:integer", child_types[ 0])
+ self.assertEqual("xsd:string", child_types[ 1])
+ self.assertEqual("xsd:double", child_types[ 2])
+ self.assertEqual("xsd:string", child_types[ 3])
+ self.assertEqual("xsd:boolean", child_types[ 4])
+ self.assertEqual(None, child_types[ 5])
+ self.assertEqual("xsd:string", child_types[ 6])
+ self.assertEqual("xsd:double", child_types[ 7])
+ self.assertEqual("xsd:float", child_types[ 8])
+ self.assertEqual("xsd:string", child_types[ 9])
+ self.assertEqual("xsd:string", child_types[10])
+ self.assertEqual("xsd:double", child_types[11])
+ self.assertEqual("xsd:integer", child_types[12])
+ self.assertEqual(None, child_types[13])
+
+ self.assertEqual(None, root.n.get(XML_SCHEMA_NIL_ATTR))
+
+ for c in root.iterchildren():
+ self.assertNotEqual(None, c.get(objectify.PYTYPE_ATTRIBUTE))
+ # these have no equivalent in xsi:type
+ if (c.get(objectify.PYTYPE_ATTRIBUTE) not in [TREE_PYTYPE,
+ "NoneType"]):
+ self.assertNotEqual(
+ None, c.get(XML_SCHEMA_INSTANCE_TYPE_ATTR))
+
+ def test_xsitype_deannotate(self):
+ XML = self.XML
+ root = XML(_bytes('''\
+ <a xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xmlns:py="http://codespeak.net/lxml/objectify/pytype"
+ xmlns:xsd="http://www.w3.org/2001/XMLSchema">
+ <b>5</b>
+ <b>test</b>
+ <c>1.1</c>
+ <c>\uF8D2</c>
+ <x>true</x>
+ <n xsi:nil="true" />
+ <n></n>
+ <b xsi:type="xsd:double">5</b>
+ <b xsi:type="xsd:float">5</b>
+ <s xsi:type="xsd:string">23</s>
+ <s py:pytype="str">42</s>
+ <f py:pytype="float">300</f>
+ <l py:pytype="long">2</l>
+ <t py:pytype="TREE"></t>
+ </a>
+ '''))
+ objectify.annotate(root)
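+ # remove only the xsi:type annotations; the py:pytype attributes stay in place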
+ objectify.deannotate(root, pytype=False)
+
+ child_types = [ c.get(objectify.PYTYPE_ATTRIBUTE)
+ for c in root.iterchildren() ]
+ self.assertEqual("int", child_types[ 0])
+ self.assertEqual("str", child_types[ 1])
+ self.assertEqual("float", child_types[ 2])
+ self.assertEqual("str", child_types[ 3])
+ self.assertEqual("bool", child_types[ 4])
+ self.assertEqual("NoneType", child_types[ 5])
+ self.assertEqual(None, child_types[ 6])
+ self.assertEqual("float", child_types[ 7])
+ self.assertEqual("float", child_types[ 8])
+ self.assertEqual("str", child_types[ 9])
+ self.assertEqual("int", child_types[10])
+ self.assertEqual("int", child_types[11])
+ self.assertEqual("int", child_types[12])
+ self.assertEqual(None, child_types[13])
+
+ self.assertEqual("true", root.n.get(XML_SCHEMA_NIL_ATTR))
+
+ for c in root.getiterator():
+ self.assertEqual(None, c.get(XML_SCHEMA_INSTANCE_TYPE_ATTR))
+
+ def test_pytype_deannotate(self):
+ XML = self.XML
+ root = XML(_bytes('''\
+ <a xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xmlns:py="http://codespeak.net/lxml/objectify/pytype"
+ xmlns:xsd="http://www.w3.org/2001/XMLSchema">
+ <b xsi:type="xsd:int">5</b>
+ <b xsi:type="xsd:string">test</b>
+ <c xsi:type="xsd:float">1.1</c>
+ <c xsi:type="xsd:string">\uF8D2</c>
+ <x xsi:type="xsd:boolean">true</x>
+ <n xsi:nil="true" />
+ <n></n>
+ <b xsi:type="xsd:double">5</b>
+ <b xsi:type="xsd:float">5</b>
+ <s xsi:type="xsd:string">23</s>
+ <s xsi:type="xsd:string">42</s>
+ <f xsi:type="xsd:float">300</f>
+ <l xsi:type="xsd:long">2</l>
+ <t py:pytype="TREE"></t>
+ </a>
+ '''))
+ objectify.annotate(root)
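+ # remove only the py:pytype annotations; the xsi:type attributes stay in place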
+ objectify.deannotate(root, xsi=False)
+
+ child_types = [ c.get(XML_SCHEMA_INSTANCE_TYPE_ATTR)
+ for c in root.iterchildren() ]
+ self.assertEqual("xsd:int", child_types[ 0])
+ self.assertEqual("xsd:string", child_types[ 1])
+ self.assertEqual("xsd:float", child_types[ 2])
+ self.assertEqual("xsd:string", child_types[ 3])
+ self.assertEqual("xsd:boolean", child_types[ 4])
+ self.assertEqual(None, child_types[ 5])
+ self.assertEqual(None, child_types[ 6])
+ self.assertEqual("xsd:double", child_types[ 7])
+ self.assertEqual("xsd:float", child_types[ 8])
+ self.assertEqual("xsd:string", child_types[ 9])
+ self.assertEqual("xsd:string", child_types[10])
+ self.assertEqual("xsd:float", child_types[11])
+ self.assertEqual("xsd:long", child_types[12])
+ self.assertEqual(None, child_types[13])
+
+ self.assertEqual("true", root.n.get(XML_SCHEMA_NIL_ATTR))
+
+ for c in root.getiterator():
+ self.assertEqual(None, c.get(objectify.PYTYPE_ATTRIBUTE))
+
+ def test_change_pytype_attribute(self):
+ XML = self.XML
+
+ xml = _bytes('''\
+ <a xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+ <b>5</b>
+ <b>test</b>
+ <c>1.1</c>
+ <c>\uF8D2</c>
+ <x>true</x>
+ <n xsi:nil="true" />
+ <n></n>
+ <b xsi:type="double">5</b>
+ </a>
+ ''')
+
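+ # PYTYPE_ATTRIBUTE is a Clark-notation name of the form '{namespace}name'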
+ pytype_ns, pytype_name = objectify.PYTYPE_ATTRIBUTE[1:].split('}')
+ objectify.set_pytype_attribute_tag("{TEST}test")
+
+ root = XML(xml)
+ objectify.annotate(root)
+
+ attribs = root.xpath("//@py:%s" % pytype_name,
+ namespaces={"py" : pytype_ns})
+ self.assertEqual(0, len(attribs))
+ attribs = root.xpath("//@py:test",
+ namespaces={"py" : "TEST"})
+ self.assertEqual(7, len(attribs))
+
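+ # calling it without arguments restores the default pytype attribute tag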
+ objectify.set_pytype_attribute_tag()
+ pytype_ns, pytype_name = objectify.PYTYPE_ATTRIBUTE[1:].split('}')
+
+ self.assertNotEqual("test", pytype_ns.lower())
+ self.assertNotEqual("test", pytype_name.lower())
+
+ root = XML(xml)
+ attribs = root.xpath("//@py:%s" % pytype_name,
+ namespaces={"py" : pytype_ns})
+ self.assertEqual(0, len(attribs))
+
+ objectify.annotate(root)
+ attribs = root.xpath("//@py:%s" % pytype_name,
+ namespaces={"py" : pytype_ns})
+ self.assertEqual(7, len(attribs))
+
+ def test_registered_types(self):
+ orig_types = objectify.getRegisteredTypes()
+ orig_types[0].unregister()
+ self.assertEqual(orig_types[1:], objectify.getRegisteredTypes())
+
+ class NewType(objectify.ObjectifiedDataElement):
+ pass
+
+ def checkMyType(s):
+ return True
+
+ pytype = objectify.PyType("mytype", checkMyType, NewType)
+ self.assertTrue(pytype not in objectify.getRegisteredTypes())
+ pytype.register()
+ self.assertTrue(pytype in objectify.getRegisteredTypes())
+ pytype.unregister()
+ self.assertTrue(pytype not in objectify.getRegisteredTypes())
+
+ pytype.register(before = [objectify.getRegisteredTypes()[0].name])
+ self.assertEqual(pytype, objectify.getRegisteredTypes()[0])
+ pytype.unregister()
+
+ pytype.register(after = [objectify.getRegisteredTypes()[0].name])
+ self.assertNotEqual(pytype, objectify.getRegisteredTypes()[0])
+ pytype.unregister()
+
+ self.assertRaises(ValueError, pytype.register,
+ before = [objectify.getRegisteredTypes()[0].name],
+ after = [objectify.getRegisteredTypes()[1].name])
+
+ def test_registered_type_stringify(self):
+ from datetime import datetime
+ def parse_date(value):
+ if len(value) != 14:
+ raise ValueError(value)
+ Y = int(value[0:4])
+ M = int(value[4:6])
+ D = int(value[6:8])
+ h = int(value[8:10])
+ m = int(value[10:12])
+ s = int(value[12:14])
+ return datetime(Y, M, D, h, m, s)
+
+ def stringify_date(date):
+ return date.strftime("%Y%m%d%H%M%S")
+
+ class DatetimeElement(objectify.ObjectifiedDataElement):
+ def pyval(self):
+ return parse_date(self.text)
+ pyval = property(pyval)
+
+ datetime_type = objectify.PyType(
+ "datetime", parse_date, DatetimeElement, stringify_date)
+ datetime_type.xmlSchemaTypes = "dateTime"
+ datetime_type.register()
+
+ NAMESPACE = "http://foo.net/xmlns"
+ NAMESPACE_MAP = {'ns': NAMESPACE}
+
+ r = objectify.Element("{%s}root" % NAMESPACE, nsmap=NAMESPACE_MAP)
+ time = datetime.now()
+ r.date = time
+
+ self.assertTrue(isinstance(r.date, DatetimeElement))
+ self.assertTrue(isinstance(r.date.pyval, datetime))
+
+ self.assertEqual(r.date.pyval, parse_date(stringify_date(time)))
+ self.assertEqual(r.date.text, stringify_date(time))
+
+ r.date = objectify.E.date(time)
+
+ self.assertTrue(isinstance(r.date, DatetimeElement))
+ self.assertTrue(isinstance(r.date.pyval, datetime))
+
+ self.assertEqual(r.date.pyval, parse_date(stringify_date(time)))
+ self.assertEqual(r.date.text, stringify_date(time))
+
+ date = objectify.DataElement(time)
+
+ self.assertTrue(isinstance(date, DatetimeElement))
+ self.assertTrue(isinstance(date.pyval, datetime))
+
+ self.assertEqual(date.pyval, parse_date(stringify_date(time)))
+ self.assertEqual(date.text, stringify_date(time))
+
+ def test_object_path(self):
+ root = self.XML(xml_str)
+ path = objectify.ObjectPath( "root.c1.c2" )
+ self.assertEqual(root.c1.c2.text, path.find(root).text)
+ self.assertEqual(root.c1.c2.text, path(root).text)
+
+ def test_object_path_list(self):
+ root = self.XML(xml_str)
+ path = objectify.ObjectPath( ['root', 'c1', 'c2'] )
+ self.assertEqual(root.c1.c2.text, path.find(root).text)
+ self.assertEqual(root.c1.c2.text, path(root).text)
+
+ def test_object_path_fail(self):
+ root = self.XML(xml_str)
+ path = objectify.ObjectPath( "root.c1.c99" )
+ self.assertRaises(AttributeError, path, root)
+
+ def test_object_path_default_absolute(self):
+ root = self.XML(xml_str)
+ path = objectify.ObjectPath( "root.c1.c99" )
+ self.assertEqual(None, path(root, None))
+ path = objectify.ObjectPath( "root.c99.c2" )
+ self.assertEqual(None, path(root, None))
+ path = objectify.ObjectPath( "notroot.c99.c2" )
+ self.assertEqual(None, path(root, None))
+
+ def test_object_path_default_relative(self):
+ root = self.XML(xml_str)
+ path = objectify.ObjectPath( ".c1.c99" )
+ self.assertEqual(None, path(root, None))
+ path = objectify.ObjectPath( ".c99.c2" )
+ self.assertEqual(None, path(root, None))
+
+ def test_object_path_syntax(self):
+ root = self.XML(xml_str)
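+ # whitespace around path components and indices is ignored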
+ path = objectify.ObjectPath("root . {objectified}c1. c2")
+ self.assertEqual(root.c1.c2.text, path(root).text)
+
+ path = objectify.ObjectPath(" root.{objectified} c1.c2 [ 0 ] ")
+ self.assertEqual(root.c1.c2.text, path(root).text)
+
+ def test_object_path_fail_parse_empty(self):
+ self.assertRaises(ValueError, objectify.ObjectPath, "")
+
+ def test_object_path_fail_parse_empty_list(self):
+ self.assertRaises(ValueError, objectify.ObjectPath, [])
+
+ def test_object_path_hasattr(self):
+ root = self.XML(xml_str)
+ path = objectify.ObjectPath( "root" )
+ self.assertTrue(path.hasattr(root))
+ path = objectify.ObjectPath( "root.c1" )
+ self.assertTrue(path.hasattr(root))
+ path = objectify.ObjectPath( "root.c1.c2" )
+ self.assertTrue(path.hasattr(root))
+ path = objectify.ObjectPath( "root.c1.{otherNS}c2" )
+ self.assertTrue(path.hasattr(root))
+ path = objectify.ObjectPath( "root.c1.c2[1]" )
+ self.assertTrue(path.hasattr(root))
+ path = objectify.ObjectPath( "root.c1.c2[2]" )
+ self.assertTrue(path.hasattr(root))
+ path = objectify.ObjectPath( "root.c1.c2[3]" )
+ self.assertFalse(path.hasattr(root))
+ path = objectify.ObjectPath( "root.c1[1].c2" )
+ self.assertFalse(path.hasattr(root))
+
+ def test_object_path_dot(self):
+ root = self.XML(xml_str)
+ path = objectify.ObjectPath( "." )
+ self.assertEqual(root.c1.c2.text, path(root).c1.c2.text)
+
+ def test_object_path_dot_list(self):
+ root = self.XML(xml_str)
+ path = objectify.ObjectPath( [''] )
+ self.assertEqual(root.c1.c2.text, path(root).c1.c2.text)
+
+ def test_object_path_dot_root(self):
+ root = self.XML(xml_str)
+ path = objectify.ObjectPath( ".c1.c2" )
+ self.assertEqual(root.c1.c2.text, path(root).text)
+
+ def test_object_path_dot_root_list(self):
+ root = self.XML(xml_str)
+ path = objectify.ObjectPath( ['', 'c1', 'c2'] )
+ self.assertEqual(root.c1.c2.text, path(root).text)
+
+ def test_object_path_index(self):
+ root = self.XML(xml_str)
+ path = objectify.ObjectPath( "root.c1[0].c2[0]" )
+ self.assertEqual(root.c1.c2.text, path(root).text)
+
+ path = objectify.ObjectPath( "root.c1[0].c2" )
+ self.assertEqual(root.c1.c2.text, path(root).text)
+
+ path = objectify.ObjectPath( "root.c1[0].c2[1]" )
+ self.assertEqual(root.c1.c2[1].text, path(root).text)
+
+ path = objectify.ObjectPath( "root.c1.c2[2]" )
+ self.assertEqual(root.c1.c2[2].text, path(root).text)
+
+ path = objectify.ObjectPath( "root.c1.c2[-1]" )
+ self.assertEqual(root.c1.c2[-1].text, path(root).text)
+
+ path = objectify.ObjectPath( "root.c1.c2[-3]" )
+ self.assertEqual(root.c1.c2[-3].text, path(root).text)
+
+ def test_object_path_index_list(self):
+ root = self.XML(xml_str)
+ path = objectify.ObjectPath( ['root', 'c1[0]', 'c2[0]'] )
+ self.assertEqual(root.c1.c2.text, path(root).text)
+
+ path = objectify.ObjectPath( ['root', 'c1[0]', 'c2[2]'] )
+ self.assertEqual(root.c1.c2[2].text, path(root).text)
+
+ path = objectify.ObjectPath( ['root', 'c1', 'c2[2]'] )
+ self.assertEqual(root.c1.c2[2].text, path(root).text)
+
+ path = objectify.ObjectPath( ['root', 'c1', 'c2[-1]'] )
+ self.assertEqual(root.c1.c2[-1].text, path(root).text)
+
+ path = objectify.ObjectPath( ['root', 'c1', 'c2[-3]'] )
+ self.assertEqual(root.c1.c2[-3].text, path(root).text)
+
+ def test_object_path_index_fail_parse(self):
+ self.assertRaises(ValueError, objectify.ObjectPath,
+ "root.c1[0].c2[-1-2]")
+ self.assertRaises(ValueError, objectify.ObjectPath,
+ ['root', 'c1[0]', 'c2[-1-2]'])
+
+ self.assertRaises(ValueError, objectify.ObjectPath,
+ "root[2].c1.c2")
+ self.assertRaises(ValueError, objectify.ObjectPath,
+ ['root[2]', 'c1', 'c2'])
+
+ self.assertRaises(ValueError, objectify.ObjectPath,
+ [])
+ self.assertRaises(ValueError, objectify.ObjectPath,
+ ['', '', ''])
+
+ def test_object_path_index_fail_lookup(self):
+ root = self.XML(xml_str)
+ path = objectify.ObjectPath("root.c1[9999].c2")
+ self.assertRaises(AttributeError, path, root)
+
+ path = objectify.ObjectPath("root.c1[0].c2[9999]")
+ self.assertRaises(AttributeError, path, root)
+
+ path = objectify.ObjectPath(".c1[9999].c2[0]")
+ self.assertRaises(AttributeError, path, root)
+
+ path = objectify.ObjectPath("root.c1[-2].c2")
+ self.assertRaises(AttributeError, path, root)
+
+ path = objectify.ObjectPath("root.c1[0].c2[-4]")
+ self.assertRaises(AttributeError, path, root)
+
+ def test_object_path_ns(self):
+ root = self.XML(xml_str)
+ path = objectify.ObjectPath( "{objectified}root.c1.c2" )
+ self.assertEqual(root.c1.c2.text, path.find(root).text)
+ path = objectify.ObjectPath( "{objectified}root.{objectified}c1.c2" )
+ self.assertEqual(root.c1.c2.text, path.find(root).text)
+ path = objectify.ObjectPath( "root.{objectified}c1.{objectified}c2" )
+ self.assertEqual(root.c1.c2.text, path.find(root).text)
+ path = objectify.ObjectPath( "root.c1.{objectified}c2" )
+ self.assertEqual(root.c1.c2.text, path.find(root).text)
+ path = objectify.ObjectPath( "root.c1.{otherNS}c2" )
+ self.assertEqual(getattr(root.c1, '{otherNS}c2').text,
+ path.find(root).text)
+
+ def test_object_path_ns_list(self):
+ root = self.XML(xml_str)
+ path = objectify.ObjectPath( ['{objectified}root', 'c1', 'c2'] )
+ self.assertEqual(root.c1.c2.text, path.find(root).text)
+ path = objectify.ObjectPath( ['{objectified}root', '{objectified}c1', 'c2'] )
+ self.assertEqual(root.c1.c2.text, path.find(root).text)
+ path = objectify.ObjectPath( ['root', '{objectified}c1', '{objectified}c2'] )
+ self.assertEqual(root.c1.c2.text, path.find(root).text)
+ path = objectify.ObjectPath( ['root', '{objectified}c1', '{objectified}c2[2]'] )
+ self.assertEqual(root.c1.c2[2].text, path.find(root).text)
+ path = objectify.ObjectPath( ['root', 'c1', '{objectified}c2'] )
+ self.assertEqual(root.c1.c2.text, path.find(root).text)
+ path = objectify.ObjectPath( ['root', 'c1', '{objectified}c2[2]'] )
+ self.assertEqual(root.c1.c2[2].text, path.find(root).text)
+ path = objectify.ObjectPath( ['root', 'c1', '{otherNS}c2'] )
+ self.assertEqual(getattr(root.c1, '{otherNS}c2').text,
+ path.find(root).text)
+
+ def test_object_path_set(self):
+ root = self.XML(xml_str)
+ path = objectify.ObjectPath( "root.c1.c2" )
+ self.assertEqual(root.c1.c2.text, path.find(root).text)
+ self.assertEqual("1", root.c1.c2[1].text)
+
+ new_value = "my new value"
+ path.setattr(root, new_value)
+
+ self.assertEqual(new_value, root.c1.c2.text)
+ self.assertEqual(new_value, path(root).text)
+ self.assertEqual("1", root.c1.c2[1].text)
+
+ def test_object_path_set_element(self):
+ root = self.XML(xml_str)
+ path = objectify.ObjectPath( "root.c1.c2" )
+ self.assertEqual(root.c1.c2.text, path.find(root).text)
+ self.assertEqual("1", root.c1.c2[1].text)
+
+ new_el = self.Element("{objectified}test")
+ etree.SubElement(new_el, "{objectified}sub", myattr="ATTR").a = "TEST"
+ path.setattr(root, new_el.sub)
+
+ self.assertEqual("ATTR", root.c1.c2.get("myattr"))
+ self.assertEqual("TEST", root.c1.c2.a.text)
+ self.assertEqual("TEST", path(root).a.text)
+ self.assertEqual("1", root.c1.c2[1].text)
+
+ def test_object_path_set_create(self):
+ root = self.XML(xml_str)
+ path = objectify.ObjectPath( "root.c1.c99" )
+ self.assertRaises(AttributeError, path.find, root)
+
+ new_value = "my new value"
+ path.setattr(root, new_value)
+
+ self.assertEqual(1, len(root.c1.c99))
+ self.assertEqual(new_value, root.c1.c99.text)
+ self.assertEqual(new_value, path(root).text)
+
+ def test_object_path_set_create_element(self):
+ root = self.XML(xml_str)
+ path = objectify.ObjectPath( "root.c1.c99" )
+ self.assertRaises(AttributeError, path.find, root)
+
+ new_el = self.Element("{objectified}test")
+ etree.SubElement(new_el, "{objectified}sub", myattr="ATTR").a = "TEST"
+ path.setattr(root, new_el.sub)
+
+ self.assertEqual(1, len(root.c1.c99))
+ self.assertEqual("ATTR", root.c1.c99.get("myattr"))
+ self.assertEqual("TEST", root.c1.c99.a.text)
+ self.assertEqual("TEST", path(root).a.text)
+
+ def test_object_path_set_create_list(self):
+ root = self.XML(xml_str)
+ path = objectify.ObjectPath( "root.c1.c99" )
+ self.assertRaises(AttributeError, path.find, root)
+
+ new_el = self.Element("{objectified}test")
+ new_el.a = ["TEST1", "TEST2"]
+ new_el.a[0].set("myattr", "ATTR1")
+ new_el.a[1].set("myattr", "ATTR2")
+
+ path.setattr(root, list(new_el.a))
+
+ self.assertEqual(2, len(root.c1.c99))
+ self.assertEqual("ATTR1", root.c1.c99[0].get("myattr"))
+ self.assertEqual("TEST1", root.c1.c99[0].text)
+ self.assertEqual("ATTR2", root.c1.c99[1].get("myattr"))
+ self.assertEqual("TEST2", root.c1.c99[1].text)
+ self.assertEqual("TEST1", path(root).text)
+
+ def test_object_path_addattr(self):
+ root = self.XML(xml_str)
+ path = objectify.ObjectPath( "root.c1.c2" )
+ self.assertEqual(3, len(root.c1.c2))
+ path.addattr(root, "test")
+ self.assertEqual(4, len(root.c1.c2))
+ self.assertEqual(["0", "1", "2", "test"],
+ [el.text for el in root.c1.c2])
+
+ def test_object_path_addattr_element(self):
+ root = self.XML(xml_str)
+ path = objectify.ObjectPath( "root.c1.c2" )
+ self.assertEqual(3, len(root.c1.c2))
+
+ new_el = self.Element("{objectified}test")
+ etree.SubElement(new_el, "{objectified}sub").a = "TEST"
+
+ path.addattr(root, new_el.sub)
+ self.assertEqual(4, len(root.c1.c2))
+ self.assertEqual("TEST", root.c1.c2[3].a.text)
+ self.assertEqual(["0", "1", "2"],
+ [el.text for el in root.c1.c2[:3]])
+
+ def test_object_path_addattr_create(self):
+ root = self.XML(xml_str)
+ path = objectify.ObjectPath( "root.c1.c99" )
+ self.assertRaises(AttributeError, path.find, root)
+
+ new_value = "my new value"
+ path.addattr(root, new_value)
+
+ self.assertEqual(1, len(root.c1.c99))
+ self.assertEqual(new_value, root.c1.c99.text)
+ self.assertEqual(new_value, path(root).text)
+
+ def test_object_path_addattr_create_element(self):
+ root = self.XML(xml_str)
+ path = objectify.ObjectPath( "root.c1.c99" )
+ self.assertRaises(AttributeError, path.find, root)
+
+ new_el = self.Element("{objectified}test")
+ etree.SubElement(new_el, "{objectified}sub", myattr="ATTR").a = "TEST"
+
+ path.addattr(root, new_el.sub)
+ self.assertEqual(1, len(root.c1.c99))
+ self.assertEqual("TEST", root.c1.c99.a.text)
+ self.assertEqual("TEST", path(root).a.text)
+ self.assertEqual("ATTR", root.c1.c99.get("myattr"))
+
+ def test_object_path_addattr_create_list(self):
+ root = self.XML(xml_str)
+ path = objectify.ObjectPath( "root.c1.c99" )
+ self.assertRaises(AttributeError, path.find, root)
+
+ new_el = self.Element("{objectified}test")
+ new_el.a = ["TEST1", "TEST2"]
+
+ self.assertEqual(2, len(new_el.a))
+
+ path.addattr(root, list(new_el.a))
+ self.assertEqual(2, len(root.c1.c99))
+ self.assertEqual("TEST1", root.c1.c99.text)
+ self.assertEqual("TEST2", path(root)[1].text)
+
+ def test_descendant_paths(self):
+ root = self.XML(xml_str)
+ self.assertEqual(
+ ['{objectified}root', '{objectified}root.c1',
+ '{objectified}root.c1.c2',
+ '{objectified}root.c1.c2[1]', '{objectified}root.c1.c2[2]',
+ '{objectified}root.c1.{otherNS}c2', '{objectified}root.c1.{}c2'],
+ root.descendantpaths())
+
+ def test_descendant_paths_child(self):
+ root = self.XML(xml_str)
+ self.assertEqual(
+ ['{objectified}c1', '{objectified}c1.c2',
+ '{objectified}c1.c2[1]', '{objectified}c1.c2[2]',
+ '{objectified}c1.{otherNS}c2', '{objectified}c1.{}c2'],
+ root.c1.descendantpaths())
+
+ def test_descendant_paths_prefix(self):
+ root = self.XML(xml_str)
+ self.assertEqual(
+ ['root.{objectified}c1', 'root.{objectified}c1.c2',
+ 'root.{objectified}c1.c2[1]', 'root.{objectified}c1.c2[2]',
+ 'root.{objectified}c1.{otherNS}c2',
+ 'root.{objectified}c1.{}c2'],
+ root.c1.descendantpaths('root'))
+
+ def test_pickle(self):
+ import pickle
+
+ root = self.XML(xml_str)
+ out = BytesIO()
+ pickle.dump(root, out)
+
+ new_root = pickle.loads(out.getvalue())
+ self.assertEqual(
+ etree.tostring(new_root),
+ etree.tostring(root))
+
+ def test_pickle_elementtree(self):
+ import pickle
+
+ tree = etree.ElementTree(self.XML(xml_str + "<?my pi?>"))
+ out = BytesIO()
+ pickle.dump(tree, out)
+
+ new_tree = pickle.loads(out.getvalue())
+ self.assertTrue(isinstance(new_tree, etree._ElementTree))
+ self.assertEqual(
+ etree.tostring(new_tree),
+ etree.tostring(tree))
+
+ def test_pickle_intelement(self):
+ self._test_pickle('<x>42</x>')
+ self._test_pickle(objectify.DataElement(42))
+
+ def test_pickle_floatelement(self):
+ self._test_pickle('<x>42.0</x>')
+ self._test_pickle(objectify.DataElement(42.0))
+
+ def test_pickle_strelement(self):
+ self._test_pickle('<x>Pickle me!</x>')
+ self._test_pickle(objectify.DataElement('Pickle me!'))
+
+ def test_pickle_boolelement(self):
+ self._test_pickle('<x>true</x>')
+ self._test_pickle('<x>false</x>')
+ self._test_pickle(objectify.DataElement(True))
+ self._test_pickle(objectify.DataElement(False))
+
+ def test_pickle_noneelement(self):
+ self._test_pickle('''
+<x xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:nil="true"/>''')
+ self._test_pickle(objectify.DataElement(None))
+
+ def _test_pickle(self, stringOrElt):
+ import pickle
+ if isinstance(stringOrElt, (etree._Element, etree._ElementTree)):
+ elt = stringOrElt
+ else:
+ elt = self.XML(stringOrElt)
+ out = BytesIO()
+ pickle.dump(elt, out)
+
+ new_elt = pickle.loads(out.getvalue())
+ self.assertEqual(
+ etree.tostring(new_elt),
+ etree.tostring(elt))
+
+ # E-Factory tests: the root element is always looked up as a plain
+ # ObjectifiedElement (it carries no annotations), so the type checks
+ # below use sub-elements
+ def test_efactory_int(self):
+ E = objectify.E
+ root = E.root(E.val(23))
+ self.assertTrue(isinstance(root.val, objectify.IntElement))
+
+ def test_efactory_float(self):
+ E = objectify.E
+ root = E.root(E.val(233.23))
+ self.assertTrue(isinstance(root.val, objectify.FloatElement))
+
+ def test_efactory_str(self):
+ E = objectify.E
+ root = E.root(E.val("what?"))
+ self.assertTrue(isinstance(root.val, objectify.StringElement))
+
+ def test_efactory_unicode(self):
+ E = objectify.E
+ root = E.root(E.val(_str("blöödy häll", encoding="ISO-8859-1")))
+ self.assertTrue(isinstance(root.val, objectify.StringElement))
+
+ def test_efactory_bool(self):
+ E = objectify.E
+ root = E.root(E.val(True))
+ self.assertTrue(isinstance(root.val, objectify.BoolElement))
+
+ def test_efactory_none(self):
+ E = objectify.E
+ root = E.root(E.val(None))
+ self.assertTrue(isinstance(root.val, objectify.NoneElement))
+
+ def test_efactory_value_concatenation(self):
+ E = objectify.E
+ root = E.root(E.val(1, "foo", 2.0, "bar ", True, None))
+ self.assertTrue(isinstance(root.val, objectify.StringElement))
+
+ def test_efactory_attrib(self):
+ E = objectify.E
+ root = E.root(foo="bar")
+ self.assertEqual(root.get("foo"), "bar")
+
+ def test_efactory_nested(self):
+ E = objectify.E
+ DataElement = objectify.DataElement
+ root = E.root("text", E.sub(E.subsub()), "tail", DataElement(1),
+ DataElement(2.0))
+ self.assertTrue(isinstance(root, objectify.ObjectifiedElement))
+ self.assertEqual(root.text, "text")
+ self.assertTrue(isinstance(root.sub, objectify.ObjectifiedElement))
+ self.assertEqual(root.sub.tail, "tail")
+ self.assertTrue(isinstance(root.sub.subsub, objectify.StringElement))
+ self.assertEqual(len(root.value), 2)
+ self.assertTrue(isinstance(root.value[0], objectify.IntElement))
+ self.assertTrue(isinstance(root.value[1], objectify.FloatElement))
+
+ def test_efactory_subtype(self):
+ class Attribute(objectify.ObjectifiedDataElement):
+ def __init__(self):
+ objectify.ObjectifiedDataElement.__init__(self)
+ self.set("datatype", "TYPE")
+ self.set("range", "0.,1.")
+
+ attr = Attribute()
+ self.assertEqual(attr.text, None)
+ self.assertEqual(attr.get("datatype"), "TYPE")
+ self.assertEqual(attr.get("range"), "0.,1.")
+
+ def test_XML_base_url_docinfo(self):
+ root = objectify.XML(_bytes("<root/>"), base_url="http://no/such/url")
+ docinfo = root.getroottree().docinfo
+ self.assertEqual(docinfo.URL, "http://no/such/url")
+
+ def test_XML_set_base_url_docinfo(self):
+ root = objectify.XML(_bytes("<root/>"), base_url="http://no/such/url")
+ docinfo = root.getroottree().docinfo
+ self.assertEqual(docinfo.URL, "http://no/such/url")
+ docinfo.URL = "https://secret/url"
+ self.assertEqual(docinfo.URL, "https://secret/url")
+
+ def test_parse_stringio_base_url(self):
+ tree = objectify.parse(BytesIO("<root/>"), base_url="http://no/such/url")
+ docinfo = tree.docinfo
+ self.assertEqual(docinfo.URL, "http://no/such/url")
+
+ def test_parse_base_url_docinfo(self):
+ tree = objectify.parse(fileInTestDir('include/test_xinclude.xml'),
+ base_url="http://no/such/url")
+ docinfo = tree.docinfo
+ self.assertEqual(docinfo.URL, "http://no/such/url")
+
+ def test_xml_base(self):
+ root = objectify.XML(_bytes("<root/>"), base_url="http://no/such/url")
+ self.assertEqual(root.base, "http://no/such/url")
+ self.assertEqual(
+ root.get('{http://www.w3.org/XML/1998/namespace}base'), None)
+ root.base = "https://secret/url"
+ self.assertEqual(root.base, "https://secret/url")
+ self.assertEqual(
+ root.get('{http://www.w3.org/XML/1998/namespace}base'),
+ "https://secret/url")
+
+ def test_xml_base_attribute(self):
+ root = objectify.XML(_bytes("<root/>"), base_url="http://no/such/url")
+ self.assertEqual(root.base, "http://no/such/url")
+ self.assertEqual(
+ root.get('{http://www.w3.org/XML/1998/namespace}base'), None)
+ root.set('{http://www.w3.org/XML/1998/namespace}base',
+ "https://secret/url")
+ self.assertEqual(root.base, "https://secret/url")
+ self.assertEqual(
+ root.get('{http://www.w3.org/XML/1998/namespace}base'),
+ "https://secret/url")
+
+ def test_standard_lookup(self):
+ XML = self.XML
+
+ xml = _bytes('''\
+ <root xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
+ <i>5</i>
+ <i>-5</i>
+ <l>4294967296</l>
+ <l>-4294967296</l>
+ <f>1.1</f>
+ <b>true</b>
+ <b>false</b>
+ <s>Strange things happen, where strings collide</s>
+ <s>True</s>
+ <s>False</s>
+ <s>t</s>
+ <s>f</s>
+ <s></s>
+ <s>None</s>
+ <n xsi:nil="true" />
+ </root>
+ ''')
+ root = XML(xml)
+
+ for i in root.i:
+ self.assertTrue(isinstance(i, objectify.IntElement))
+ for l in root.l:
+ self.assertTrue(isinstance(l, objectify.IntElement))
+ for f in root.f:
+ self.assertTrue(isinstance(f, objectify.FloatElement))
+ for b in root.b:
+ self.assertTrue(isinstance(b, objectify.BoolElement))
+ self.assertEqual(True, root.b[0])
+ self.assertEqual(False, root.b[1])
+ for s in root.s:
+ self.assertTrue(isinstance(s, objectify.StringElement))
+ self.assertTrue(isinstance(root.n, objectify.NoneElement))
+ self.assertEqual(None, root.n)
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([unittest.makeSuite(ObjectifyTestCase)])
+ suite.addTests(doctest.DocTestSuite(objectify))
+ suite.addTests([make_doctest('../../../doc/objectify.txt')])
+ return suite
+
+if __name__ == '__main__':
+ print('to test use test.py %s' % __file__)
diff --git a/src/lxml/tests/test_pyclasslookup.py b/src/lxml/tests/test_pyclasslookup.py
new file mode 100644
index 0000000..d650870
--- /dev/null
+++ b/src/lxml/tests/test_pyclasslookup.py
@@ -0,0 +1,351 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests specific to the Python-based class lookup.
+"""
+
+from __future__ import absolute_import
+
+import unittest
+
+from .common_imports import etree, HelperTestCase, _bytes
+
+from lxml.etree import PythonElementClassLookup
+
+
+xml_str = _bytes('''\
+<obj:root xmlns:obj="objectified" xmlns:other="otherNS">
+ <obj:c1 a1="A1" a2="A2" other:a3="A3">
+ <obj:c2>0</obj:c2>
+ <obj:c2>1</obj:c2>
+ <obj:c2>2</obj:c2>
+ <other:c2>3</other:c2>
+ <c2>3</c2>
+ </obj:c1>
+</obj:root>''')
+
+
+class PyClassLookupTestCase(HelperTestCase):
+ """Test cases for the lxml.pyclasslookup class lookup mechanism.
+ """
+ etree = etree
+ parser = etree.XMLParser()
+ Element = parser.makeelement
+
+ def tearDown(self):
+ self.parser.set_element_class_lookup(None)
+ super(PyClassLookupTestCase, self).tearDown()
+
+ def _setClassLookup(self, lookup_function):
+ class Lookup(PythonElementClassLookup):
+ def lookup(self, *args):
+ return lookup_function(*args)
+ self.parser.set_element_class_lookup( Lookup() )
+
+ def _buildElementClass(self):
+ class LocalElement(etree.ElementBase):
+ pass
+ return LocalElement
+
+ def XML(self, xml):
+ return self.etree.XML(xml, self.parser)
+
+ # --- Test cases
+
+ def test_lookup(self):
+ el_class = self._buildElementClass()
+ el_class.i = 1
+ def lookup(*args):
+ if el_class.i == 1:
+ el_class.i = 2
+ return el_class
+ self._setClassLookup(lookup)
+ root = self.XML(xml_str)
+ self.assertEqual(2, el_class.i)
+
+ def test_lookup_keep_ref_assertion(self):
+ el_class = self._buildElementClass()
+ el_class.EL = None
+ def lookup(doc, el):
+ if el_class.EL is None:
+ el_class.EL = el
+ return el_class
+ self._setClassLookup(lookup)
+ root = self.XML(xml_str)
+ self.assertNotEqual(None, el_class.EL)
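+ # the proxy passed to lookup() is only valid during the call itself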
+ self.assertRaises(ReferenceError, el_class.EL.getchildren)
+
+ def test_lookup_tag(self):
+ el_class = self._buildElementClass()
+ el_class.TAG = None
+ def lookup(doc, el):
+ if el_class.TAG is None:
+ el_class.TAG = el.tag
+ return el_class
+ self._setClassLookup(lookup)
+ root = self.XML(xml_str)
+ self.assertNotEqual(None, root.TAG)
+ self.assertEqual(root.tag, root.TAG)
+
+ def test_lookup_text(self):
+ el_class = self._buildElementClass()
+ el_class.TEXT = None
+ def lookup(doc, el):
+ if el_class.TEXT is None:
+ el_class.TEXT = el.text
+ return el_class
+ self._setClassLookup(lookup)
+ root = self.XML(xml_str)
+ self.assertNotEqual(None, root.TEXT)
+ self.assertEqual(root.text, root.TEXT)
+
+ def test_lookup_tail(self):
+ el_class = self._buildElementClass()
+ el_class.TAIL = None
+ def lookup(doc, el):
+ if el_class.TAIL is None:
+ el_class.TAIL = el.tail
+ return el_class
+ self._setClassLookup(lookup)
+ root = self.XML(xml_str)
+ self.assertEqual(root.tail, root.TAIL)
+
+ def test_lookup_attrib(self):
+ el_class = self._buildElementClass()
+ el_class.ATTRIB = None
+ def lookup(doc, el):
+ if el_class.ATTRIB is None:
+ el_class.ATTRIB = el[0].attrib
+ return el_class
+ self._setClassLookup(lookup)
+ root = self.XML(xml_str)
+ items1 = list(root[0].attrib.items())
+ items1.sort()
+ items2 = list(root.ATTRIB.items())
+ items2.sort()
+ self.assertEqual(items1, items2)
+
+ def test_lookup_prefix(self):
+ el_class = self._buildElementClass()
+ el_class.PREFIX = None
+ def lookup(doc, el):
+ if el_class.PREFIX is None:
+ el_class.PREFIX = el.prefix
+ return el_class
+ self._setClassLookup(lookup)
+ root = self.XML(xml_str)
+ self.assertEqual(root.prefix, root.PREFIX)
+
+ def test_lookup_sourceline(self):
+ el_class = self._buildElementClass()
+ el_class.LINE = None
+ def lookup(doc, el):
+ if el_class.LINE is None:
+ el_class.LINE = el.sourceline
+ return el_class
+ self._setClassLookup(lookup)
+ root = self.XML(xml_str)
+ self.assertEqual(root.sourceline, root.LINE)
+
+ def test_lookup_getitem(self):
+ el_class = self._buildElementClass()
+ el_class.CHILD_TAG = None
+ def lookup(doc, el):
+ el_class.CHILD_TAG = el[0].tag
+ return el_class
+ self._setClassLookup(lookup)
+ root = self.XML(xml_str)
+ child_tag = root.CHILD_TAG
+ self.assertNotEqual(None, child_tag)
+ self.assertEqual(root[0].tag, child_tag)
+
+ def test_lookup_getitem_neg(self):
+ el_class = self._buildElementClass()
+ el_class.CHILD_TAG = None
+ def lookup(doc, el):
+ if el_class.CHILD_TAG is None:
+ el_class.CHILD_TAG = el[-1].tag
+ return el_class
+ self._setClassLookup(lookup)
+ root = self.XML(xml_str)
+ child_tag = root.CHILD_TAG
+ self.assertNotEqual(None, child_tag)
+ self.assertEqual(root[-1].tag, child_tag)
+
+ def test_lookup_getslice(self):
+ el_class = self._buildElementClass()
+ el_class.CHILD_TAGS = None
+ def lookup(doc, el):
+ if el_class.CHILD_TAGS is None:
+ el_class.CHILD_TAGS = [ c.tag for c in el[1:-1] ]
+ return el_class
+ self._setClassLookup(lookup)
+ root = self.XML(xml_str)
+ child_tags = root.CHILD_TAGS
+ self.assertNotEqual(None, child_tags)
+ self.assertEqual([ c.tag for c in root[1:-1] ],
+ child_tags)
+
+ def test_lookup_len(self):
+ el_class = self._buildElementClass()
+ el_class.LEN = None
+ def lookup(doc, el):
+ if el_class.LEN is None:
+ el_class.LEN = len(el)
+ return el_class
+ self._setClassLookup(lookup)
+ root = self.XML(xml_str)
+ self.assertEqual(1, el_class.LEN)
+
+ def test_lookup_bool(self):
+ el_class = self._buildElementClass()
+ el_class.TRUE = None
+ def lookup(doc, el):
+ if el_class.TRUE is None:
+ el_class.TRUE = bool(el)
+ return el_class
+ self._setClassLookup(lookup)
+ root = self.XML(xml_str)
+ self.assertTrue(el_class.TRUE)
+
+ def test_lookup_get(self):
+ el_class = self._buildElementClass()
+ el_class.VAL = None
+ def lookup(doc, el):
+ if el_class.VAL is None:
+ el_class.VAL = el[0].get('a1')
+ return el_class
+ self._setClassLookup(lookup)
+ root = self.XML(xml_str)
+ self.assertNotEqual(None, el_class.VAL)
+ self.assertEqual(root[0].get('a1'), el_class.VAL)
+
+ def test_lookup_get_default(self):
+ el_class = self._buildElementClass()
+ default = str(id(el_class))
+ el_class.VAL = None
+ def lookup(doc, el):
+ if el_class.VAL is None:
+ el_class.VAL = el[0].get('unknownattribute', default)
+ return el_class
+ self._setClassLookup(lookup)
+ root = self.XML(xml_str)
+ self.assertEqual(default, el_class.VAL)
+
+ def test_lookup_getchildren(self):
+ el_class = self._buildElementClass()
+ el_class.CHILD_TAGS = None
+ def lookup(doc, el):
+ if el_class.CHILD_TAGS is None:
+ el_class.CHILD_TAGS = [ c.tag for c in el.getchildren() ]
+ return el_class
+ self._setClassLookup(lookup)
+ root = self.XML(xml_str)
+ child_tags = root.CHILD_TAGS
+ self.assertNotEqual(None, child_tags)
+ self.assertEqual([ c.tag for c in root.getchildren() ],
+ child_tags)
+
+ def test_lookup_iter_children(self):
+ el_class = self._buildElementClass()
+ el_class.CHILD_TAGS = None
+ def lookup(doc, el):
+ if el_class.CHILD_TAGS is None:
+ el_class.CHILD_TAGS = [ c.tag for c in el ]
+ return el_class
+ self._setClassLookup(lookup)
+ root = self.XML(xml_str)
+ child_tags = root.CHILD_TAGS
+ self.assertNotEqual(None, child_tags)
+ self.assertEqual([ c.tag for c in root.getchildren() ],
+ child_tags)
+
+ def test_lookup_iterchildren(self):
+ el_class = self._buildElementClass()
+ el_class.CHILD_TAGS = None
+ def lookup(doc, el):
+ if el_class.CHILD_TAGS is None:
+ el_class.CHILD_TAGS = [ c.tag for c in el.iterchildren() ]
+ return el_class
+ self._setClassLookup(lookup)
+ root = self.XML(xml_str)
+ child_tags = root.CHILD_TAGS
+ self.assertNotEqual(None, child_tags)
+ self.assertEqual([ c.tag for c in root.getchildren() ],
+ child_tags)
+
+ def test_lookup_iterchildren_tag(self):
+ el_class = self._buildElementClass()
+ el_class.CHILD_TAGS = None
+ def lookup(doc, el):
+ if not el_class.CHILD_TAGS:
+ el_class.CHILD_TAGS = [
+ c.tag for c in el.iterchildren(tag='{objectified}c2') ]
+ return el_class
+ self._setClassLookup(lookup)
+
+ root = self.XML(xml_str)
+ child_tags = root.CHILD_TAGS
+ self.assertNotEqual(None, child_tags)
+ self.assertEqual([], child_tags)
+
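+ # accessing the first child triggers another lookup call, this time for c1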
+ c1 = root[0]
+ child_tags = root.CHILD_TAGS
+ self.assertNotEqual(None, child_tags)
+ self.assertNotEqual([], child_tags)
+ self.assertEqual(
+ [ c.tag for c in root[0].iterchildren(tag='{objectified}c2') ],
+ child_tags)
+
+ def test_lookup_getparent(self):
+ el_class = self._buildElementClass()
+ el_class.PARENT = None
+ def lookup(doc, el):
+ if el_class.PARENT is None:
+ el_class.PARENT = el[0].getparent().tag
+ return el_class
+ self._setClassLookup(lookup)
+ root = self.XML(xml_str)
+ self.assertEqual(root.tag, root.PARENT)
+
+ def test_lookup_getnext(self):
+ el_class = self._buildElementClass()
+ el_class.NEXT = None
+ def lookup(doc, el):
+ if el_class.NEXT is None:
+ el_class.NEXT = el[0][1].getnext().tag
+ return el_class
+ self._setClassLookup(lookup)
+ root = self.XML(xml_str)
+ self.assertNotEqual(None, el_class.NEXT)
+ self.assertEqual(root[0][1].getnext().tag, el_class.NEXT)
+
+ def test_lookup_getprevious(self):
+ el_class = self._buildElementClass()
+ el_class.PREV = None
+ def lookup(doc, el):
+ if el_class.PREV is None:
+ el_class.PREV = el[0][1].getprevious().tag
+ return el_class
+ self._setClassLookup(lookup)
+ root = self.XML(xml_str)
+ self.assertNotEqual(None, el_class.PREV)
+ self.assertEqual(root[0][1].getprevious().tag, el_class.PREV)
+
+ def test_comments_fallback(self):
+ def return_none(*args):
+ return None
+
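+ # returning None makes lxml fall back to the default element class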
+ self._setClassLookup(return_none)
+ el = self.XML('<a><!-- hello world --></a>')
+ self.assertEqual(el[0].tag, self.etree.Comment)
+ self.assertEqual(el[0].text, " hello world ")
+
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([unittest.makeSuite(PyClassLookupTestCase)])
+ return suite
+
+if __name__ == '__main__':
+ print('to test use test.py %s' % __file__)
diff --git a/src/lxml/tests/test_relaxng.py b/src/lxml/tests/test_relaxng.py
new file mode 100644
index 0000000..3c589c1
--- /dev/null
+++ b/src/lxml/tests/test_relaxng.py
@@ -0,0 +1,260 @@
+# -*- coding: utf-8 -*-
+
+"""
+Test cases related to RelaxNG parsing and validation
+"""
+
+from __future__ import absolute_import
+
+import unittest
+
+from .common_imports import (
+ etree, BytesIO, _bytes, HelperTestCase, fileInTestDir, make_doctest, skipif
+)
+
+try:
+ import rnc2rng
+except ImportError:
+ rnc2rng = None
+
+
+class ETreeRelaxNGTestCase(HelperTestCase):
+ def test_relaxng(self):
+ tree_valid = self.parse('<a><b></b></a>')
+ tree_invalid = self.parse('<a><c></c></a>')
+ schema = self.parse('''\
+<element name="a" xmlns="http://relaxng.org/ns/structure/1.0">
+ <zeroOrMore>
+ <element name="b">
+ <text />
+ </element>
+ </zeroOrMore>
+</element>
+''')
+ schema = etree.RelaxNG(schema)
+ self.assertTrue(schema.validate(tree_valid))
+ self.assertFalse(schema.error_log.filter_from_errors())
+
+ self.assertFalse(schema.validate(tree_invalid))
+ self.assertTrue(schema.error_log.filter_from_errors())
+
+ self.assertTrue(schema.validate(tree_valid)) # repeat valid
+ self.assertFalse(schema.error_log.filter_from_errors()) # repeat valid
+
+ def test_relaxng_stringio(self):
+ tree_valid = self.parse('<a><b></b></a>')
+ tree_invalid = self.parse('<a><c></c></a>')
+ schema_file = BytesIO('''\
+<element name="a" xmlns="http://relaxng.org/ns/structure/1.0">
+ <zeroOrMore>
+ <element name="b">
+ <text />
+ </element>
+ </zeroOrMore>
+</element>
+''')
+ schema = etree.RelaxNG(file=schema_file)
+ self.assertTrue(schema.validate(tree_valid))
+ self.assertFalse(schema.validate(tree_invalid))
+
+ def test_relaxng_elementtree_error(self):
+ self.assertRaises(ValueError, etree.RelaxNG, etree.ElementTree())
+
+ def test_relaxng_error(self):
+ tree_invalid = self.parse('<a><c></c></a>')
+ schema = self.parse('''\
+<element name="a" xmlns="http://relaxng.org/ns/structure/1.0">
+ <zeroOrMore>
+ <element name="b">
+ <text />
+ </element>
+ </zeroOrMore>
+</element>
+''')
+ schema = etree.RelaxNG(schema)
+ self.assertFalse(schema.validate(tree_invalid))
+ errors = schema.error_log
+ self.assertTrue([log for log in errors
+ if log.level_name == "ERROR"])
+ self.assertTrue([log for log in errors
+ if "not expect" in log.message])
+
+ def test_relaxng_generic_error(self):
+ tree_invalid = self.parse('''\
+ <test>
+ <reference id="my-ref">This is my unique ref.</reference>
+ <data ref="my-ref">Valid data</data>
+ <data ref="myref">Invalid data</data>
+ </test>
+ ''')
+ schema = self.parse('''\
+ <grammar datatypeLibrary="http://www.w3.org/2001/XMLSchema-datatypes"
+ xmlns="http://relaxng.org/ns/structure/1.0">
+ <define name="by-ref">
+ <data type="IDREF"/>
+ </define>
+ <start>
+ <element name="test">
+ <zeroOrMore>
+ <element name="reference">
+ <attribute name="id">
+ <data type="ID"/>
+ </attribute>
+ <text/>
+ </element>
+ </zeroOrMore>
+ <zeroOrMore>
+ <element name="data">
+ <attribute name="ref">
+ <data type="IDREF"/>
+ </attribute>
+ <text/>
+ </element>
+ </zeroOrMore>
+ </element>
+ </start>
+ </grammar>
+ ''')
+
+ schema = etree.RelaxNG(schema)
+ self.assertFalse(schema.validate(tree_invalid))
+ errors = schema.error_log
+ self.assertTrue(errors)
+ self.assertTrue([log for log in errors if "IDREF" in log.message])
+ self.assertTrue([log for log in errors if "myref" in log.message])
+
+ def test_relaxng_invalid_schema(self):
+ schema = self.parse('''\
+<element name="a" xmlns="http://relaxng.org/ns/structure/1.0">
+ <zeroOrMore>
+ <element name="b" />
+ </zeroOrMore>
+</element>
+''')
+ self.assertRaises(etree.RelaxNGParseError,
+ etree.RelaxNG, schema)
+
+ def test_relaxng_invalid_schema2(self):
+ schema = self.parse('''\
+<grammar xmlns="http://relaxng.org/ns/structure/1.0" />
+''')
+ self.assertRaises(etree.RelaxNGParseError,
+ etree.RelaxNG, schema)
+
+ def test_relaxng_invalid_schema3(self):
+ schema = self.parse('''\
+<grammar xmlns="http://relaxng.org/ns/structure/1.0">
+ <define name="test">
+ <element name="test"/>
+ </define>
+</grammar>
+''')
+ self.assertRaises(etree.RelaxNGParseError,
+ etree.RelaxNG, schema)
+
+ def test_relaxng_invalid_schema4(self):
+ # invalid schema: must raise a parse error instead of segfaulting
+ schema = self.parse('''\
+<element name="a" xmlns="mynamespace" />
+''')
+ self.assertRaises(etree.RelaxNGParseError,
+ etree.RelaxNG, schema)
+
+ def test_relaxng_include(self):
+ # this only works if the schema is accessed through a file path or
+ # an open file object
+ f = open(fileInTestDir('test1.rng'), 'rb')
+ try:
+ schema = etree.RelaxNG(file=f)
+ finally:
+ f.close()
+
+ def test_relaxng_shortcut(self):
+ tree_valid = self.parse('<a><b></b></a>')
+ tree_invalid = self.parse('<a><c></c></a>')
+ schema = self.parse('''\
+<element name="a" xmlns="http://relaxng.org/ns/structure/1.0">
+ <zeroOrMore>
+ <element name="b">
+ <text />
+ </element>
+ </zeroOrMore>
+</element>
+''')
+ self.assertTrue(tree_valid.relaxng(schema))
+ self.assertFalse(tree_invalid.relaxng(schema))
+
+ def test_multiple_elementrees(self):
+ tree = self.parse('<a><b>B</b><c>C</c></a>')
+ schema = etree.RelaxNG( self.parse('''\
+<element name="a" xmlns="http://relaxng.org/ns/structure/1.0">
+ <element name="b">
+ <text />
+ </element>
+ <element name="c">
+ <text />
+ </element>
+</element>
+''') )
+ self.assertTrue(schema.validate(tree))
+ self.assertFalse(schema.error_log.filter_from_errors())
+
+ self.assertTrue(schema.validate(tree)) # repeat valid
+ self.assertFalse(schema.error_log.filter_from_errors()) # repeat valid
+
+ schema = etree.RelaxNG( self.parse('''\
+<element name="b" xmlns="http://relaxng.org/ns/structure/1.0">
+ <text />
+</element>
+''') )
+ c_tree = etree.ElementTree(tree.getroot()[1])
+ self.assertEqual(self._rootstring(c_tree), _bytes('<c>C</c>'))
+ self.assertFalse(schema.validate(c_tree))
+ self.assertTrue(schema.error_log.filter_from_errors())
+
+ b_tree = etree.ElementTree(tree.getroot()[0])
+ self.assertEqual(self._rootstring(b_tree), _bytes('<b>B</b>'))
+ self.assertTrue(schema.validate(b_tree))
+ self.assertFalse(schema.error_log.filter_from_errors())
+
+
+class RelaxNGCompactTestCase(HelperTestCase):
+
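+ # the compact syntax (RNC) tests require the optional rnc2rng package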
+ pytestmark = skipif('rnc2rng is None')
+
+ def test_relaxng_compact(self):
+ tree_valid = self.parse('<a><b>B</b><c>C</c></a>')
+ tree_invalid = self.parse('<a><b></b></a>')
+ schema = etree.RelaxNG(file=fileInTestDir('test.rnc'))
+ self.assertTrue(schema.validate(tree_valid))
+ self.assertFalse(schema.validate(tree_invalid))
+
+ def test_relaxng_compact_file_obj(self):
+ with open(fileInTestDir('test.rnc'), 'r') as f:
+ schema = etree.RelaxNG(file=f)
+
+ tree_valid = self.parse('<a><b>B</b><c>C</c></a>')
+ tree_invalid = self.parse('<a><b></b></a>')
+ self.assertTrue(schema.validate(tree_valid))
+ self.assertFalse(schema.validate(tree_invalid))
+
+ def test_relaxng_compact_str(self):
+ tree_valid = self.parse('<a><b>B</b></a>')
+ tree_invalid = self.parse('<a><b>X</b></a>')
+ rnc_str = 'element a { element b { "B" } }'
+ schema = etree.RelaxNG.from_rnc_string(rnc_str)
+ self.assertTrue(schema.validate(tree_valid))
+ self.assertFalse(schema.validate(tree_invalid))
+
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([unittest.makeSuite(ETreeRelaxNGTestCase)])
+ suite.addTests(
+ [make_doctest('../../../doc/validation.txt')])
+ if rnc2rng is not None:
+ suite.addTests([unittest.makeSuite(RelaxNGCompactTestCase)])
+ return suite
+
+if __name__ == '__main__':
+ print('to test use test.py %s' % __file__)
diff --git a/src/lxml/tests/test_sax.py b/src/lxml/tests/test_sax.py
new file mode 100644
index 0000000..2ed1e51
--- /dev/null
+++ b/src/lxml/tests/test_sax.py
@@ -0,0 +1,416 @@
+# -*- coding: utf-8 -*-
+
+"""
+Test cases related to SAX I/O
+"""
+
+from __future__ import absolute_import
+
+import unittest
+from xml.dom import pulldom
+from xml.sax.handler import ContentHandler
+
+from .common_imports import HelperTestCase, make_doctest, BytesIO, _bytes
+from lxml import sax
+
+
+class ETreeSaxTestCase(HelperTestCase):
+
+ def test_etree_sax_simple(self):
+ tree = self.parse('<a>ab<b/>ba</a>')
+ xml_out = self._saxify_serialize(tree)
+ self.assertEqual(_bytes('<a>ab<b/>ba</a>'),
+ xml_out)
+
+ def test_etree_sax_double(self):
+ tree = self.parse('<a>ab<b>bb</b>ba</a>')
+ xml_out = self._saxify_serialize(tree)
+ self.assertEqual(_bytes('<a>ab<b>bb</b>ba</a>'),
+ xml_out)
+
+ def test_etree_sax_comment(self):
+ tree = self.parse('<a>ab<!-- TEST -->ba</a>')
+ xml_out = self._saxify_serialize(tree)
+ self.assertEqual(_bytes('<a>abba</a>'),
+ xml_out)
+
+ def test_etree_sax_pi(self):
+ tree = self.parse('<a>ab<?this and that?>ba</a>')
+ xml_out = self._saxify_serialize(tree)
+ self.assertEqual(_bytes('<a>ab<?this and that?>ba</a>'),
+ xml_out)
+
+ def test_etree_sax_comment_root(self):
+ tree = self.parse('<!-- TEST --><a>ab</a>')
+ xml_out = self._saxify_serialize(tree)
+ self.assertEqual(_bytes('<a>ab</a>'),
+ xml_out)
+
+ def test_etree_sax_pi_root(self):
+ tree = self.parse('<?this and that?><a>ab</a>')
+ xml_out = self._saxify_serialize(tree)
+ self.assertEqual(_bytes('<?this and that?><a>ab</a>'),
+ xml_out)
+
+ def test_etree_sax_attributes(self):
+ tree = self.parse('<a aa="5">ab<b b="5"/>ba</a>')
+ xml_out = self._saxify_serialize(tree)
+ self.assertEqual(_bytes('<a aa="5">ab<b b="5"/>ba</a>'),
+ xml_out)
+
+ def test_etree_sax_ns1(self):
+ tree = self.parse('<a xmlns="bla">ab<b>bb</b>ba</a>')
+ new_tree = self._saxify_unsaxify(tree)
+ root = new_tree.getroot()
+ self.assertEqual('{bla}a',
+ root.tag)
+ self.assertEqual('{bla}b',
+ root[0].tag)
+
+ def test_etree_sax_ns2(self):
+ tree = self.parse('<a xmlns="blaA">ab<b:b xmlns:b="blaB">bb</b:b>ba</a>')
+ new_tree = self._saxify_unsaxify(tree)
+ root = new_tree.getroot()
+ self.assertEqual('{blaA}a',
+ root.tag)
+ self.assertEqual('{blaB}b',
+ root[0].tag)
+
+ def test_sax_to_pulldom(self):
+ tree = self.parse('<a xmlns="blaA">ab<b:b xmlns:b="blaB">bb</b:b>ba</a>')
+ handler = pulldom.SAX2DOM()
+ sax.saxify(tree, handler)
+ dom = handler.document
+
+ self.assertEqual('a',
+ dom.firstChild.localName)
+ self.assertEqual('blaA',
+ dom.firstChild.namespaceURI)
+ self.assertEqual(None,
+ dom.firstChild.prefix)
+
+ children = dom.firstChild.childNodes
+ self.assertEqual('ab',
+ children[0].nodeValue)
+ self.assertEqual('blaB',
+ children[1].namespaceURI)
+ self.assertEqual('ba',
+ children[2].nodeValue)
+
+ def test_sax_to_pulldom_multiple_namespaces(self):
+ tree = self.parse('<a xmlns="blaA" xmlns:a="blaA"></a>')
+ handler = pulldom.SAX2DOM()
+ sax.saxify(tree, handler)
+ dom = handler.document
+
+ # With multiple prefix definitions, the node should keep the one
+ # that was actually used, even if the others also are valid.
+ self.assertEqual('a',
+ dom.firstChild.localName)
+ self.assertEqual('blaA',
+ dom.firstChild.namespaceURI)
+ self.assertEqual(None,
+ dom.firstChild.prefix)
+
+ tree = self.parse('<a:a xmlns="blaA" xmlns:a="blaA"></a:a>')
+ handler = pulldom.SAX2DOM()
+ sax.saxify(tree, handler)
+ dom = handler.document
+
+ self.assertEqual('a',
+ dom.firstChild.localName)
+ self.assertEqual('blaA',
+ dom.firstChild.namespaceURI)
+ self.assertEqual('a',
+ dom.firstChild.prefix)
+
+ def test_element_sax(self):
+ tree = self.parse('<a><b/></a>')
+ a = tree.getroot()
+ b = a[0]
+
+ xml_out = self._saxify_serialize(a)
+ self.assertEqual(_bytes('<a><b/></a>'),
+ xml_out)
+
+ xml_out = self._saxify_serialize(b)
+ self.assertEqual(_bytes('<b/>'),
+ xml_out)
+
+ def test_element_sax_ns(self):
+ tree = self.parse('<a:a xmlns:a="blaA"><b/></a:a>')
+ a = tree.getroot()
+ b = a[0]
+
+ new_tree = self._saxify_unsaxify(a)
+ root = new_tree.getroot()
+ self.assertEqual('{blaA}a',
+ root.tag)
+ self.assertEqual('b',
+ root[0].tag)
+
+ new_tree = self._saxify_unsaxify(b)
+ root = new_tree.getroot()
+ self.assertEqual('b',
+ root.tag)
+ self.assertEqual(0,
+ len(root))
+
+ def test_etree_sax_handler_default_ns(self):
+ handler = sax.ElementTreeContentHandler()
+ handler.startDocument()
+ handler.startPrefixMapping(None, 'blaA')
+ handler.startElementNS(('blaA', 'a'), 'a', {})
+ handler.startPrefixMapping(None, 'blaB')
+ handler.startElementNS(('blaB', 'b'), 'b', {})
+ handler.endElementNS( ('blaB', 'b'), 'b')
+ handler.endPrefixMapping(None)
+ handler.startElementNS(('blaA', 'c'), 'c', {})
+ handler.endElementNS( ('blaA', 'c'), 'c')
+ handler.endElementNS( ('blaA', 'a'), 'a')
+ handler.endPrefixMapping(None)
+ handler.endDocument()
+
+ new_tree = handler.etree
+ root = new_tree.getroot()
+ self.assertEqual('{blaA}a',
+ root.tag)
+ self.assertEqual('{blaB}b',
+ root[0].tag)
+ self.assertEqual('{blaA}c',
+ root[1].tag)
+
+ def test_etree_sax_handler_default_ns_None(self):
+ handler = sax.ElementTreeContentHandler()
+ handler.startDocument()
+ handler.startPrefixMapping(None, 'blaA')
+ handler.startElementNS((None, 'a'), 'a', {})
+ handler.startPrefixMapping(None, 'blaB')
+ handler.startElementNS((None, 'b'), 'b', {})
+ handler.endElementNS( (None, 'b'), 'b')
+ handler.endPrefixMapping(None)
+ handler.startElementNS((None, 'c'), 'c', {})
+ handler.endElementNS( (None, 'c'), 'c')
+ handler.endElementNS( (None, 'a'), 'a')
+ handler.endPrefixMapping(None)
+ handler.endDocument()
+
+ new_tree = handler.etree
+ root = new_tree.getroot()
+ self.assertEqual('{blaA}a',
+ root.tag)
+ self.assertEqual('{blaB}b',
+ root[0].tag)
+ self.assertEqual('{blaA}c',
+ root[1].tag)
+
+ def test_etree_sax_redefine_ns(self):
+ handler = sax.ElementTreeContentHandler()
+ handler.startDocument()
+ handler.startPrefixMapping('ns', 'blaA')
+ handler.startElementNS(('blaA', 'a'), 'ns:a', {})
+ handler.startPrefixMapping('ns', 'blaB')
+ handler.startElementNS(('blaB', 'b'), 'ns:b', {})
+ handler.endElementNS( ('blaB', 'b'), 'ns:b')
+ handler.endPrefixMapping('ns')
+ handler.startElementNS(('blaA', 'c'), 'ns:c', {})
+ handler.endElementNS( ('blaA', 'c'), 'ns:c')
+ handler.endElementNS( ('blaA', 'a'), 'ns:a')
+ handler.endPrefixMapping('ns')
+ handler.endDocument()
+
+ new_tree = handler.etree
+ root = new_tree.getroot()
+ self.assertEqual('{blaA}a',
+ root.tag)
+ self.assertEqual('{blaB}b',
+ root[0].tag)
+ self.assertEqual('{blaA}c',
+ root[1].tag)
+
+ def test_etree_sax_no_ns(self):
+ handler = sax.ElementTreeContentHandler()
+ handler.startDocument()
+ handler.startElement('a', {})
+ handler.startElement('b', {})
+ handler.endElement('b')
+ handler.startElement('c') # with empty attributes
+ handler.endElement('c')
+ handler.endElement('a')
+ handler.endDocument()
+
+ new_tree = handler.etree
+ root = new_tree.getroot()
+ self.assertEqual('a', root.tag)
+ self.assertEqual('b', root[0].tag)
+ self.assertEqual('c', root[1].tag)
+
+ def test_etree_sax_no_ns_attributes(self):
+ handler = sax.ElementTreeContentHandler()
+ handler.startDocument()
+ handler.startElement('a', {"attr_a1": "a1"})
+ handler.startElement('b', {"attr_b1": "b1"})
+ handler.endElement('b')
+ handler.endElement('a')
+ handler.endDocument()
+
+ new_tree = handler.etree
+ root = new_tree.getroot()
+ self.assertEqual('a', root.tag)
+ self.assertEqual('b', root[0].tag)
+ self.assertEqual('a1', root.attrib["attr_a1"])
+ self.assertEqual('b1', root[0].attrib["attr_b1"])
+
+ def test_etree_sax_ns_attributes(self):
+ handler = sax.ElementTreeContentHandler()
+ handler.startDocument()
+
+ self.assertRaises(ValueError,
+ handler.startElement,
+ 'a', {"blaA:attr_a1": "a1"}
+ )
+
+ def test_etree_sax_error(self):
+ handler = sax.ElementTreeContentHandler()
+ handler.startDocument()
+ handler.startElement('a')
+ self.assertRaises(sax.SaxError, handler.endElement, 'b')
+
+ def test_etree_sax_error2(self):
+ handler = sax.ElementTreeContentHandler()
+ handler.startDocument()
+ handler.startElement('a')
+ handler.startElement('b')
+ self.assertRaises(sax.SaxError, handler.endElement, 'a')
+
+ def _saxify_unsaxify(self, saxifiable):
+ handler = sax.ElementTreeContentHandler()
+ sax.ElementTreeProducer(saxifiable, handler).saxify()
+ return handler.etree
+
+ def _saxify_serialize(self, tree):
+ new_tree = self._saxify_unsaxify(tree)
+ f = BytesIO()
+ new_tree.write(f)
+ return f.getvalue().replace(_bytes('\n'), _bytes(''))
+
+
+class SimpleContentHandler(ContentHandler, object):
+ """A SAX content handler that just stores the events"""
+
+ def __init__(self):
+ self.sax_events = []
+ super(SimpleContentHandler, self).__init__()
+
+ def startDocument(self):
+ self.sax_events.append(('startDocument',))
+
+ def endDocument(self):
+ self.sax_events.append(('endDocument',))
+
+ def startPrefixMapping(self, prefix, uri):
+ self.sax_events.append(('startPrefixMapping', prefix, uri))
+
+ def endPrefixMapping(self, prefix):
+ self.sax_events.append(('endPrefixMapping', prefix))
+
+ def startElement(self, name, attrs):
+ self.sax_events.append(('startElement', name, dict(attrs)))
+
+ def endElement(self, name):
+ self.sax_events.append(('endElement', name))
+
+ def startElementNS(self, name, qname, attrs):
+ self.sax_events.append(('startElementNS', name, qname, attrs._qnames))
+
+ def endElementNS(self, name, qname):
+ self.sax_events.append(('endElementNS', name, qname))
+
+ def characters(self, content):
+ self.sax_events.append(('characters', content))
+
+ def ignorableWhitespace(self, whitespace):
+ self.sax_events.append(('ignorableWhitespace', whitespace))
+
+ def processingInstruction(self, target, data):
+ self.sax_events.append(('processingInstruction', target, data))
+
+ def skippedEntity(self, name):
+ self.sax_events.append(('skippedEntity', name))
+
+
+class NSPrefixSaxTestCase(HelperTestCase):
+ """Testing that namespaces generate the right SAX events"""
+
+ def _saxify(self, tree):
+ handler = SimpleContentHandler()
+ sax.ElementTreeProducer(tree, handler).saxify()
+ return handler.sax_events
+
+ def test_element_sax_ns_prefix(self):
+ # The prefix name should be preserved if the URI is unique
+ tree = self.parse('<a:a xmlns:a="blaA" xmlns:c="blaC">'
+ '<d a:attr="value" c:attr="value" /></a:a>')
+ a = tree.getroot()
+
+ self.assertEqual(
+ [('startElementNS', ('blaA', 'a'), 'a:a', {}),
+ ('startElementNS', (None, 'd'), 'd',
+ {('blaA', 'attr'): 'a:attr', ('blaC', 'attr'): 'c:attr'}),
+ ('endElementNS', (None, 'd'), 'd'),
+ ('endElementNS', ('blaA', 'a'), 'a:a'),
+ ],
+ self._saxify(a)[3:7])
+
+ def test_element_sax_default_ns_prefix(self):
+ # The default namespace should not get a generated prefix either
+ tree = self.parse('<a xmlns="blaA"><b attr="value" /></a>')
+ a = tree.getroot()
+
+ self.assertEqual(
+ [('startDocument',),
+ # NS prefix should be None:
+ ('startPrefixMapping', None, 'blaA'),
+ ('startElementNS', ('blaA', 'a'), 'a', {}),
+ # Attribute prefix should be None:
+ ('startElementNS', ('blaA', 'b'), 'b', {(None, 'attr'): 'attr'}),
+ ('endElementNS', ('blaA', 'b'), 'b'),
+ ('endElementNS', ('blaA', 'a'), 'a'),
+ # Prefix should be None again:
+ ('endPrefixMapping', None),
+ ('endDocument',)],
+ self._saxify(a))
+
+ # Attributes are the exception: if there is both a default namespace
+ # and a prefixed namespace with the same URI, attributes use the prefix
+ tree = self.parse('<a xmlns="bla" xmlns:a="bla">'
+ '<b a:attr="value" /></a>')
+ a = tree.getroot()
+
+ self.assertEqual(
+ ('startElementNS', ('bla', 'b'), 'b', {('bla', 'attr'): 'a:attr'}),
+ self._saxify(a)[4])
+
+ def test_element_sax_twin_ns_prefix(self):
+ # Make an element with a doubly registered URI
+ tree = self.parse('<a xmlns:b="bla" xmlns:c="bla">'
+ '<d c:attr="attr" /></a>')
+ a = tree.getroot()
+
+ self.assertEqual(
+ # It should get the b prefix in this case
+ ('startElementNS', (None, 'd'), 'd', {('bla', 'attr'): 'b:attr'}),
+ self._saxify(a)[4])
+
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([unittest.makeSuite(ETreeSaxTestCase)])
+ suite.addTests([unittest.makeSuite(NSPrefixSaxTestCase)])
+ suite.addTests(
+ [make_doctest('../../../doc/sax.txt')])
+ return suite
+
+
+if __name__ == '__main__':
+ print('to test use test.py %s' % __file__)
diff --git a/src/lxml/tests/test_schematron.py b/src/lxml/tests/test_schematron.py
new file mode 100644
index 0000000..2096346
--- /dev/null
+++ b/src/lxml/tests/test_schematron.py
@@ -0,0 +1,82 @@
+# -*- coding: utf-8 -*-
+
+"""
+Test cases related to Schematron parsing and validation
+"""
+
+from __future__ import absolute_import
+
+import unittest
+
+from .common_imports import etree, HelperTestCase, make_doctest
+
+
+class ETreeSchematronTestCase(HelperTestCase):
+ def test_schematron(self):
+ tree_valid = self.parse('<AAA><BBB/><CCC/></AAA>')
+ tree_invalid = self.parse('<AAA><BBB/><CCC/><DDD/></AAA>')
+ schema = self.parse('''\
+<schema xmlns="http://purl.oclc.org/dsdl/schematron" >
+ <pattern name="Open model">
+ <rule context="AAA">
+ <assert test="BBB"> BBB element is not present</assert>
+ <assert test="CCC"> CCC element is not present</assert>
+ </rule>
+ </pattern>
+ <pattern name="Closed model">
+ <rule context="AAA">
+ <assert test="BBB"> BBB element is not present</assert>
+ <assert test="CCC"> CCC element is not present</assert>
+ <assert test="count(BBB|CCC) = count (*)">There is an extra element</assert>
+ </rule>
+ </pattern>
+</schema>
+''')
+ schema = etree.Schematron(schema)
+ self.assertTrue(schema.validate(tree_valid))
+ self.assertFalse(schema.error_log.filter_from_errors())
+
+ self.assertFalse(schema.validate(tree_invalid))
+ self.assertTrue(schema.error_log.filter_from_errors())
+
+ self.assertTrue(schema.validate(tree_valid)) # repeat valid
+ self.assertFalse(schema.error_log.filter_from_errors()) # repeat valid
+
+ def test_schematron_elementtree_error(self):
+ self.assertRaises(ValueError, etree.Schematron, etree.ElementTree())
+
+ def test_schematron_invalid_schema(self):
+ schema = self.parse('''\
+<schema xmlns="http://purl.oclc.org/dsdl/schematron" >
+ <pattern name="Open model">
+ </pattern>
+</schema>
+''')
+ self.assertRaises(etree.SchematronParseError,
+ etree.Schematron, schema)
+
+ def test_schematron_invalid_schema_empty(self):
+ schema = self.parse('''\
+<schema xmlns="http://purl.oclc.org/dsdl/schematron" />
+''')
+ self.assertRaises(etree.SchematronParseError,
+ etree.Schematron, schema)
+
+ def test_schematron_invalid_schema_namespace(self):
+ # used to segfault
+ schema = self.parse('''\
+<schema xmlns="mynamespace" />
+''')
+ self.assertRaises(etree.SchematronParseError,
+ etree.Schematron, schema)
+
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([unittest.makeSuite(ETreeSchematronTestCase)])
+ suite.addTests(
+ [make_doctest('../../../doc/validation.txt')])
+ return suite
+
+if __name__ == '__main__':
+ print('to test use test.py %s' % __file__)
diff --git a/src/lxml/tests/test_threading.py b/src/lxml/tests/test_threading.py
new file mode 100644
index 0000000..2a16858
--- /dev/null
+++ b/src/lxml/tests/test_threading.py
@@ -0,0 +1,590 @@
+# -*- coding: utf-8 -*-
+
+"""
+Tests for thread usage in lxml.etree.
+"""
+
+from __future__ import absolute_import
+
+import re
+import sys
+import unittest
+import threading
+
+from .common_imports import etree, HelperTestCase, BytesIO, _bytes
+
+try:
+ from Queue import Queue
+except ImportError:
+ from queue import Queue # Py3
+
+
+class ThreadingTestCase(HelperTestCase):
+ """Threading tests"""
+ etree = etree
+
+ def _run_thread(self, func):
+ thread = threading.Thread(target=func)
+ thread.start()
+ thread.join()
+
+ def _run_threads(self, count, func, main_func=None):
+ sync = threading.Event()
+ lock = threading.Lock()
+ counter = dict(started=0, finished=0, failed=0)
+
+ def sync_start(func):
+ with lock:
+ started = counter['started'] + 1
+ counter['started'] = started
+ if started < count + (main_func is not None):
+ sync.wait(4) # wait until the other threads have started up
+ assert sync.is_set()
+ sync.set() # all waiting => go!
+ try:
+ func()
+ except:
+ with lock:
+ counter['failed'] += 1
+ raise
+ else:
+ with lock:
+ counter['finished'] += 1
+
+ threads = [threading.Thread(target=sync_start, args=(func,)) for _ in range(count)]
+ for thread in threads:
+ thread.start()
+ if main_func is not None:
+ sync_start(main_func)
+ for thread in threads:
+ thread.join()
+
+ self.assertEqual(0, counter['failed'])
+ self.assertEqual(counter['finished'], counter['started'])
+
+ def test_subtree_copy_thread(self):
+ tostring = self.etree.tostring
+ XML = self.etree.XML
+ xml = _bytes("<root><threadtag/></root>")
+ main_root = XML(_bytes("<root/>"))
+
+ def run_thread():
+ thread_root = XML(xml)
+ main_root.append(thread_root[0])
+ del thread_root
+
+ self._run_thread(run_thread)
+ self.assertEqual(xml, tostring(main_root))
+
+ def test_main_xslt_in_thread(self):
+ XML = self.etree.XML
+ style = XML(_bytes('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="*">
+ <foo><xsl:copy><xsl:value-of select="/a/b/text()" /></xsl:copy></foo>
+ </xsl:template>
+</xsl:stylesheet>'''))
+ st = etree.XSLT(style)
+
+ result = []
+
+ def run_thread():
+ root = XML(_bytes('<a><b>B</b><c>C</c></a>'))
+ result.append( st(root) )
+
+ self._run_thread(run_thread)
+ self.assertEqual('''\
+<?xml version="1.0"?>
+<foo><a>B</a></foo>
+''',
+ str(result[0]))
+
+ def test_thread_xslt(self):
+ XML = self.etree.XML
+ tostring = self.etree.tostring
+ root = XML(_bytes('<a><b>B</b><c>C</c></a>'))
+
+ def run_thread():
+ style = XML(_bytes('''\
+ <xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="*">
+ <foo><xsl:copy><xsl:value-of select="/a/b/text()" /></xsl:copy></foo>
+ </xsl:template>
+ </xsl:stylesheet>'''))
+ st = etree.XSLT(style)
+ root.append( st(root).getroot() )
+
+ self._run_thread(run_thread)
+ self.assertEqual(_bytes('<a><b>B</b><c>C</c><foo><a>B</a></foo></a>'),
+ tostring(root))
+
+ def test_thread_xslt_parsing_error_log(self):
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="tag" />
+ <!-- extend time for parsing + transform -->
+''' + '\n'.join('<xsl:template match="tag%x" />' % i for i in range(200)) + '''
+ <xsl:UnExpectedElement />
+</xsl:stylesheet>''')
+ self.assertRaises(etree.XSLTParseError,
+ etree.XSLT, style)
+
+ error_logs = []
+
+ def run_thread():
+ try:
+ etree.XSLT(style)
+ except etree.XSLTParseError as e:
+ error_logs.append(e.error_log)
+ else:
+ self.assertFalse(True, "XSLT parsing should have failed but didn't")
+
+ self._run_threads(16, run_thread)
+
+ self.assertEqual(16, len(error_logs))
+ last_log = None
+ for log in error_logs:
+ self.assertTrue(len(log))
+ if last_log is not None:
+ self.assertEqual(len(last_log), len(log))
+ self.assertTrue(len(log) >= 2, len(log))
+ for error in log:
+ self.assertTrue(':ERROR:XSLT:' in str(error), str(error))
+ self.assertTrue(any('UnExpectedElement' in str(error) for error in log), log)
+ last_log = log
+
+ def test_thread_xslt_apply_error_log(self):
+ tree = self.parse('<tagFF/>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template name="tag0">
+ <xsl:message terminate="yes">FAIL</xsl:message>
+ </xsl:template>
+ <!-- extend time for parsing + transform -->
+''' + '\n'.join('<xsl:template match="tag%X" name="tag%x"> <xsl:call-template name="tag%x" /> </xsl:template>' % (i, i, i-1)
+ for i in range(1, 256)) + '''
+</xsl:stylesheet>''')
+ self.assertRaises(etree.XSLTApplyError,
+ etree.XSLT(style), tree)
+
+ error_logs = []
+
+ def run_thread():
+ transform = etree.XSLT(style)
+ try:
+ transform(tree)
+ except etree.XSLTApplyError:
+ error_logs.append(transform.error_log)
+ else:
+ self.assertFalse(True, "XSLT parsing should have failed but didn't")
+
+ self._run_threads(16, run_thread)
+
+ self.assertEqual(16, len(error_logs))
+ last_log = None
+ for log in error_logs:
+ self.assertTrue(len(log))
+ if last_log is not None:
+ self.assertEqual(len(last_log), len(log))
+ self.assertEqual(1, len(log))
+ for error in log:
+ self.assertTrue(':ERROR:XSLT:' in str(error))
+ last_log = log
+
+ def test_thread_xslt_attr_replace(self):
+ # this is the only case in XSLT where the result tree can be
+ # modified in-place
+ XML = self.etree.XML
+ tostring = self.etree.tostring
+ style = self.etree.XSLT(XML(_bytes('''\
+ <xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="*">
+ <root class="abc">
+ <xsl:copy-of select="@class" />
+ <xsl:attribute name="class">xyz</xsl:attribute>
+ </root>
+ </xsl:template>
+ </xsl:stylesheet>''')))
+
+ result = []
+ def run_thread():
+ root = XML(_bytes('<ROOT class="ABC" />'))
+ result.append( style(root).getroot() )
+
+ self._run_thread(run_thread)
+ self.assertEqual(_bytes('<root class="xyz"/>'),
+ tostring(result[0]))
+
+ def test_thread_create_xslt(self):
+ XML = self.etree.XML
+ tostring = self.etree.tostring
+ root = XML(_bytes('<a><b>B</b><c>C</c></a>'))
+
+ stylesheets = []
+
+ def run_thread():
+ style = XML(_bytes('''\
+ <xsl:stylesheet
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ version="1.0">
+ <xsl:output method="xml" />
+ <xsl:template match="/">
+ <div id="test">
+ <xsl:apply-templates/>
+ </div>
+ </xsl:template>
+ </xsl:stylesheet>'''))
+ stylesheets.append( etree.XSLT(style) )
+
+ self._run_thread(run_thread)
+
+ st = stylesheets[0]
+ result = tostring( st(root) )
+
+ self.assertEqual(_bytes('<div id="test">BC</div>'),
+ result)
+
+ def test_thread_error_log(self):
+ XML = self.etree.XML
+ expected_error = [self.etree.ErrorTypes.ERR_TAG_NAME_MISMATCH]
+ children = "<a>test</a>" * 100
+
+ def parse_error_test(thread_no):
+ tag = "tag%d" % thread_no
+ xml = "<%s>%s</%s>" % (tag, children, tag.upper())
+ parser = self.etree.XMLParser()
+ for _ in range(10):
+ errors = None
+ try:
+ XML(xml, parser)
+ except self.etree.ParseError:
+ e = sys.exc_info()[1]
+ errors = e.error_log.filter_types(expected_error)
+ self.assertTrue(errors, "Expected error not found")
+ for error in errors:
+ self.assertTrue(
+ tag in error.message and tag.upper() in error.message,
+ "%s and %s not found in '%s'" % (
+ tag, tag.upper(), error.message))
+
+ self.etree.clear_error_log()
+ threads = []
+ for thread_no in range(1, 10):
+ t = threading.Thread(target=parse_error_test,
+ args=(thread_no,))
+ threads.append(t)
+ t.start()
+
+ parse_error_test(0)
+
+ for t in threads:
+ t.join()
+
+ def test_thread_mix(self):
+ XML = self.etree.XML
+ Element = self.etree.Element
+ SubElement = self.etree.SubElement
+ tostring = self.etree.tostring
+ xml = _bytes('<a><b>B</b><c xmlns="test">C</c></a>')
+ root = XML(xml)
+ fragment = XML(_bytes("<other><tags/></other>"))
+
+ result = self.etree.Element("{myns}root", att = "someval")
+
+ def run_XML():
+ thread_root = XML(xml)
+ result.append(thread_root[0])
+ result.append(thread_root[-1])
+
+ def run_parse():
+ thread_root = self.etree.parse(BytesIO(xml)).getroot()
+ result.append(thread_root[0])
+ result.append(thread_root[-1])
+
+ def run_move_main():
+ result.append(fragment[0])
+
+ def run_build():
+ result.append(
+ Element("{myns}foo", attrib={'{test}attr':'val'}))
+ SubElement(result, "{otherns}tasty")
+
+ def run_xslt():
+ style = XML(_bytes('''\
+ <xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="*">
+ <xsl:copy><foo><xsl:value-of select="/a/b/text()" /></foo></xsl:copy>
+ </xsl:template>
+ </xsl:stylesheet>'''))
+ st = etree.XSLT(style)
+ result.append( st(root).getroot() )
+
+ for test in (run_XML, run_parse, run_move_main, run_xslt, run_build):
+ tostring(result)
+ self._run_thread(test)
+
+ self.assertEqual(
+ _bytes('<ns0:root xmlns:ns0="myns" att="someval"><b>B</b>'
+ '<c xmlns="test">C</c><b>B</b><c xmlns="test">C</c><tags/>'
+ '<a><foo>B</foo></a>'
+ '<ns0:foo xmlns:ns1="test" ns1:attr="val"/>'
+ '<ns1:tasty xmlns:ns1="otherns"/></ns0:root>'),
+ tostring(result))
+
+ def strip_first():
+ root = Element("newroot")
+ root.append(result[0])
+
+ while len(result):
+ self._run_thread(strip_first)
+
+ self.assertEqual(
+ _bytes('<ns0:root xmlns:ns0="myns" att="someval"/>'),
+ tostring(result))
+
+ def test_concurrent_attribute_names_in_dicts(self):
+ SubElement = self.etree.SubElement
+ names = list('abcdefghijklmnop')
+ runs_per_name = range(50)
+ result_matches = re.compile(
+ br'<thread_root>'
+ br'(?:<[a-p]{5} thread_attr_[a-p]="value" thread_attr2_[a-p]="value2"\s?/>)+'
+ br'</thread_root>').match
+
+ def testrun():
+ for _ in range(3):
+ root = self.etree.Element('thread_root')
+ for name in names:
+ tag_name = name * 5
+ new = []
+ for _ in runs_per_name:
+ el = SubElement(root, tag_name, {'thread_attr_' + name: 'value'})
+ new.append(el)
+ for el in new:
+ el.set('thread_attr2_' + name, 'value2')
+ s = etree.tostring(root)
+ self.assertTrue(result_matches(s))
+
+ # first, run only in sub-threads
+ self._run_threads(10, testrun)
+
+ # then, additionally include the main thread (and its parent dict)
+ self._run_threads(10, testrun, main_func=testrun)
+
+ def test_concurrent_proxies(self):
+ XML = self.etree.XML
+ root = XML(_bytes('<root><a>A</a><b xmlns="test">B</b><c/></root>'))
+ child_count = len(root)
+ def testrun():
+ for i in range(10000):
+ el = root[i%child_count]
+ del el
+ self._run_threads(10, testrun)
+
+ def test_concurrent_class_lookup(self):
+ XML = self.etree.XML
+
+ class TestElement(etree.ElementBase):
+ pass
+
+ class MyLookup(etree.CustomElementClassLookup):
+ repeat = range(100)
+ def lookup(self, t, d, ns, name):
+ count = 0
+ for i in self.repeat:
+ # allow other threads to run
+ count += 1
+ return TestElement
+
+ parser = self.etree.XMLParser()
+ parser.set_element_class_lookup(MyLookup())
+
+ root = XML(_bytes('<root><a>A</a><b xmlns="test">B</b><c/></root>'),
+ parser)
+
+ child_count = len(root)
+ def testrun():
+ for i in range(1000):
+ el = root[i%child_count]
+ del el
+ self._run_threads(10, testrun)
+
+
+class ThreadPipelineTestCase(HelperTestCase):
+ """Threading tests based on a thread worker pipeline.
+ """
+ etree = etree
+ item_count = 40
+
+ class Worker(threading.Thread):
+ def __init__(self, in_queue, in_count, **kwargs):
+ threading.Thread.__init__(self)
+ self.in_queue = in_queue
+ self.in_count = in_count
+ self.out_queue = Queue(in_count)
+ self.__dict__.update(kwargs)
+
+ def run(self):
+ get, put = self.in_queue.get, self.out_queue.put
+ handle = self.handle
+ for _ in range(self.in_count):
+ put(handle(get()))
+
+ def handle(self, data):
+ raise NotImplementedError()
+
+ class ParseWorker(Worker):
+ def handle(self, xml, _fromstring=etree.fromstring):
+ return _fromstring(xml)
+
+ class RotateWorker(Worker):
+ def handle(self, element):
+ first = element[0]
+ element[:] = element[1:]
+ element.append(first)
+ return element
+
+ class ReverseWorker(Worker):
+ def handle(self, element):
+ element[:] = element[::-1]
+ return element
+
+ class ParseAndExtendWorker(Worker):
+ def handle(self, element, _fromstring=etree.fromstring):
+ element.extend(_fromstring(self.xml))
+ return element
+
+ class ParseAndInjectWorker(Worker):
+ def handle(self, element, _fromstring=etree.fromstring):
+ root = _fromstring(self.xml)
+ root.extend(element)
+ return root
+
+ class Validate(Worker):
+ def handle(self, element):
+ element.getroottree().docinfo.internalDTD.assertValid(element)
+ return element
+
+ class SerialiseWorker(Worker):
+ def handle(self, element):
+ return etree.tostring(element)
+
+ xml = (b'''\
+<!DOCTYPE threadtest [
+ <!ELEMENT threadtest (thread-tag1,thread-tag2)+>
+ <!ATTLIST threadtest
+ version CDATA "1.0"
+ >
+ <!ELEMENT thread-tag1 EMPTY>
+ <!ELEMENT thread-tag2 (div)>
+ <!ELEMENT div (threaded)>
+ <!ATTLIST div
+ huhu CDATA #IMPLIED
+ >
+ <!ELEMENT threaded EMPTY>
+ <!ATTLIST threaded
+ host CDATA #REQUIRED
+ >
+]>
+<threadtest version="123">
+''' + (b'''
+ <thread-tag1 />
+ <thread-tag2>
+ <div huhu="true">
+ <threaded host="here" />
+ </div>
+ </thread-tag2>
+''') * 20 + b'''
+</threadtest>''')
+
+ def _build_pipeline(self, item_count, *classes, **kwargs):
+ in_queue = Queue(item_count)
+ start = last = classes[0](in_queue, item_count, **kwargs)
+ start.setDaemon(True)
+ for worker_class in classes[1:]:
+ last = worker_class(last.out_queue, item_count, **kwargs)
+ last.setDaemon(True)
+ last.start()
+ return in_queue, start, last
+
+ def test_thread_pipeline_thread_parse(self):
+ item_count = self.item_count
+ xml = self.xml.replace(b'thread', b'THREAD') # use fresh tag names
+
+ # build and start the pipeline
+ in_queue, start, last = self._build_pipeline(
+ item_count,
+ self.ParseWorker,
+ self.RotateWorker,
+ self.ReverseWorker,
+ self.ParseAndExtendWorker,
+ self.Validate,
+ self.ParseAndInjectWorker,
+ self.SerialiseWorker,
+ xml=xml)
+
+ # fill the queue
+ put = start.in_queue.put
+ for _ in range(item_count):
+ put(xml)
+
+ # start the first thread and thus everything
+ start.start()
+ # make sure the last thread has terminated
+ last.join(60) # time out after 60 seconds
+ self.assertEqual(item_count, last.out_queue.qsize())
+ # read the results
+ get = last.out_queue.get
+ results = [get() for _ in range(item_count)]
+
+ comparison = results[0]
+ for i, result in enumerate(results[1:]):
+ self.assertEqual(comparison, result)
+
+ def test_thread_pipeline_global_parse(self):
+ item_count = self.item_count
+ xml = self.xml.replace(b'thread', b'GLOBAL') # use fresh tag names
+ XML = self.etree.XML
+ # build and start the pipeline
+ in_queue, start, last = self._build_pipeline(
+ item_count,
+ self.RotateWorker,
+ self.ReverseWorker,
+ self.ParseAndExtendWorker,
+ self.Validate,
+ self.SerialiseWorker,
+ xml=xml)
+
+ # fill the queue
+ put = start.in_queue.put
+ for _ in range(item_count):
+ put(XML(xml))
+
+ # start the first thread and thus everything
+ start.start()
+ # make sure the last thread has terminated
+ last.join(60) # time out after 60 seconds
+ self.assertEqual(item_count, last.out_queue.qsize())
+ # read the results
+ get = last.out_queue.get
+ results = [get() for _ in range(item_count)]
+
+ comparison = results[0]
+ for i, result in enumerate(results[1:]):
+ self.assertEqual(comparison, result)
+
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([unittest.makeSuite(ThreadingTestCase)])
+ suite.addTests([unittest.makeSuite(ThreadPipelineTestCase)])
+ return suite
+
+if __name__ == '__main__':
+ print('to test use test.py %s' % __file__)
diff --git a/src/lxml/tests/test_unicode.py b/src/lxml/tests/test_unicode.py
new file mode 100644
index 0000000..03ffcba
--- /dev/null
+++ b/src/lxml/tests/test_unicode.py
@@ -0,0 +1,211 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import
+
+import unittest
+import sys
+
+from .common_imports import StringIO, etree, HelperTestCase, _str, _bytes, _chr
+
+try:
+ unicode
+except NameError:
+ unicode = str
+
+ascii_uni = _bytes('a').decode('utf8')
+
+klingon = _bytes("\\uF8D2").decode("unicode_escape") # not valid for XML names
+
+invalid_tag = _bytes("test").decode('utf8') + klingon
+
+uni = _bytes('\\xc3\\u0680\\u3120').decode("unicode_escape") # some non-ASCII characters
+
+uxml = _bytes("<test><title>test \\xc3\\xa1\\u3120</title><h1>page \\xc3\\xa1\\u3120 title</h1></test>"
+ ).decode("unicode_escape")
+
+
+class UnicodeTestCase(HelperTestCase):
+ def test__str(self):
+ # test the testing framework, namely _str from common_imports
+ self.assertEqual(_str('\x10'), _str('\u0010'))
+ self.assertEqual(_str('\x10'), _str('\U00000010'))
+ self.assertEqual(_str('\u1234'), _str('\U00001234'))
+
+ def test_unicode_xml(self):
+ tree = etree.XML('<p>%s</p>' % uni)
+ self.assertEqual(uni, tree.text)
+
+ def test_wide_unicode_xml(self):
+ if sys.maxunicode < 1114111:
+ return # skip test
+ tree = etree.XML(_bytes('<p>\\U00026007</p>').decode('unicode_escape'))
+ self.assertEqual(1, len(tree.text))
+ self.assertEqual(_bytes('\\U00026007').decode('unicode_escape'),
+ tree.text)
+
+ def test_unicode_xml_broken(self):
+ uxml = ('<?xml version="1.0" encoding="UTF-8"?>' +
+ '<p>%s</p>' % uni)
+ self.assertRaises(ValueError, etree.XML, uxml)
+
+ def test_unicode_tag(self):
+ el = etree.Element(uni)
+ self.assertEqual(uni, el.tag)
+
+ def test_unicode_tag_invalid(self):
+ # sadly, Klingon is not well-formed
+ self.assertRaises(ValueError, etree.Element, invalid_tag)
+
+ def test_unicode_nstag(self):
+ tag = "{http://abc/}%s" % uni
+ el = etree.Element(tag)
+ self.assertEqual(tag, el.tag)
+
+ def test_unicode_ns_invalid(self):
+ # namespace URIs must conform to RFC 3986
+ tag = "{http://%s/}abc" % uni
+ self.assertRaises(ValueError, etree.Element, tag)
+
+ def test_unicode_nstag_invalid(self):
+ # sadly, Klingon is not well-formed
+ tag = "{http://abc/}%s" % invalid_tag
+ self.assertRaises(ValueError, etree.Element, tag)
+
+ def test_unicode_qname(self):
+ qname = etree.QName(uni, uni)
+ tag = "{%s}%s" % (uni, uni)
+ self.assertEqual(qname.text, tag)
+ self.assertEqual(unicode(qname), tag)
+
+ def test_unicode_qname_invalid(self):
+ self.assertRaises(ValueError, etree.QName, invalid_tag)
+
+ def test_unicode_attr(self):
+ el = etree.Element('foo', {'bar': uni})
+ self.assertEqual(uni, el.attrib['bar'])
+
+ def test_unicode_comment(self):
+ el = etree.Comment(uni)
+ self.assertEqual(uni, el.text)
+
+ def test_unicode_repr1(self):
+ x = etree.Element(_str('å'))
+ # must not raise UnicodeEncodeError
+ repr(x)
+
+ def test_unicode_repr2(self):
+ x = etree.Comment(_str('ö'))
+ repr(x)
+
+ def test_unicode_repr3(self):
+ x = etree.ProcessingInstruction(_str('Å'), _str('\u0131'))
+ repr(x)
+
+ def test_unicode_repr4(self):
+ x = etree.Entity(_str('ä'))
+ repr(x)
+
+ def test_unicode_text(self):
+ e = etree.Element('e')
+
+ def settext(text):
+ e.text = text
+
+ self.assertRaises(ValueError, settext, _str('ab\ufffe'))
+ self.assertRaises(ValueError, settext, _str('ö\ffff'))
+ self.assertRaises(ValueError, settext, _str('\u0123\ud800'))
+ self.assertRaises(ValueError, settext, _str('x\ud8ff'))
+ self.assertRaises(ValueError, settext, _str('\U00010000\udfff'))
+ self.assertRaises(ValueError, settext, _str('abd\x00def'))
+ # should not raise
+ settext(_str('\ud7ff\ue000\U00010000\U0010FFFFäöas'))
+
+ for char_val in range(0xD800, 0xDFFF+1):
+ self.assertRaises(ValueError, settext, 'abc' + _chr(char_val))
+ self.assertRaises(ValueError, settext, _chr(char_val))
+ self.assertRaises(ValueError, settext, _chr(char_val) + 'abc')
+
+ self.assertRaises(ValueError, settext, _bytes('\xe4'))
+ self.assertRaises(ValueError, settext, _bytes('\x80'))
+ self.assertRaises(ValueError, settext, _bytes('\xff'))
+ self.assertRaises(ValueError, settext, _bytes('\x08'))
+ self.assertRaises(ValueError, settext, _bytes('\x19'))
+ self.assertRaises(ValueError, settext, _bytes('\x20\x00'))
+ # should not raise
+ settext(_bytes('\x09\x0A\x0D\x20\x60\x7f'))
+
+ def test_uniname(self):
+ Element = etree.Element
+ def el(name):
+ return Element(name)
+
+ self.assertRaises(ValueError, el, ':')
+ self.assertRaises(ValueError, el, '0a')
+ self.assertRaises(ValueError, el, _str('\u203f'))
+ # should not raise
+ el(_str('\u0132'))
+
+
+
+ def test_unicode_parse_stringio(self):
+ el = etree.parse(StringIO('<p>%s</p>' % uni)).getroot()
+ self.assertEqual(uni, el.text)
+
+## def test_parse_fileobject_unicode(self):
+## # parse unicode from unnamed file object (not supported by ElementTree)
+## f = SillyFileLike(uxml)
+## root = etree.parse(f).getroot()
+## self.assertEqual(unicode(etree.tostring(root, 'UTF-8'), 'UTF-8'),
+## uxml)
+
+
+class EncodingsTestCase(HelperTestCase):
+ def test_illegal_utf8(self):
+ data = _bytes('<test>\x80\x80\x80</test>', encoding='iso8859-1')
+ self.assertRaises(etree.XMLSyntaxError, etree.fromstring, data)
+
+ def test_illegal_utf8_recover(self):
+ data = _bytes('<test>\x80\x80\x80</test>', encoding='iso8859-1')
+ parser = etree.XMLParser(recover=True)
+ self.assertRaises(etree.XMLSyntaxError, etree.fromstring, data, parser)
+
+ def _test_encoding(self, encoding, xml_encoding_name=None):
+ foo = """<?xml version='1.0' encoding='%s'?>\n<tag attrib='123'></tag>""" % (
+ xml_encoding_name or encoding)
+ root = etree.fromstring(foo.encode(encoding))
+ self.assertEqual('tag', root.tag)
+
+ doc_encoding = root.getroottree().docinfo.encoding
+ self.assertEqual(
+ doc_encoding.lower().rstrip('lbe'),
+ (xml_encoding_name or encoding).lower().rstrip('lbe'))
+
+ def test_utf8_fromstring(self):
+ self._test_encoding('utf-8')
+
+ def test_utf8sig_fromstring(self):
+ self._test_encoding('utf_8_sig', 'utf-8')
+
+ def test_utf16_fromstring(self):
+ self._test_encoding('utf-16')
+
+ def test_utf16LE_fromstring(self):
+ self._test_encoding('utf-16le', 'utf-16')
+
+ def test_utf16BE_fromstring(self):
+ self._test_encoding('utf-16be', 'utf-16')
+
+ def test_utf32_fromstring(self):
+ self._test_encoding('utf-32', 'utf-32')
+
+ def test_utf32LE_fromstring(self):
+ self._test_encoding('utf-32le', 'utf-32')
+
+ def test_utf32BE_fromstring(self):
+ self._test_encoding('utf-32be', 'utf-32')
+
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([unittest.makeSuite(UnicodeTestCase)])
+ suite.addTests([unittest.makeSuite(EncodingsTestCase)])
+ return suite
diff --git a/src/lxml/tests/test_xmlschema.py b/src/lxml/tests/test_xmlschema.py
new file mode 100644
index 0000000..c5653c1
--- /dev/null
+++ b/src/lxml/tests/test_xmlschema.py
@@ -0,0 +1,505 @@
+# -*- coding: utf-8 -*-
+
+"""
+Test cases related to XML Schema parsing and validation
+"""
+
+from __future__ import absolute_import
+
+import unittest
+
+from .common_imports import etree, BytesIO, HelperTestCase, fileInTestDir, make_doctest
+
+
+class ETreeXMLSchemaTestCase(HelperTestCase):
+ def test_xmlschema(self):
+ tree_valid = self.parse('<a><b></b></a>')
+ tree_invalid = self.parse('<a><c></c></a>')
+ schema = self.parse('''
+<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">
+ <xsd:element name="a" type="AType"/>
+ <xsd:complexType name="AType">
+ <xsd:sequence>
+ <xsd:element name="b" type="xsd:string" />
+ </xsd:sequence>
+ </xsd:complexType>
+</xsd:schema>
+''')
+ schema = etree.XMLSchema(schema)
+ self.assertTrue(schema.validate(tree_valid))
+ self.assertFalse(schema.validate(tree_invalid))
+ self.assertTrue(schema.validate(tree_valid)) # retry valid
+ self.assertFalse(schema.validate(tree_invalid)) # retry invalid
+
+ def test_xmlschema_error_log(self):
+ tree_valid = self.parse('<a><b></b></a>')
+ tree_invalid = self.parse('<a><c></c></a>')
+ schema = self.parse('''
+<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">
+ <xsd:element name="a" type="AType"/>
+ <xsd:complexType name="AType">
+ <xsd:sequence>
+ <xsd:element name="b" type="xsd:string" />
+ </xsd:sequence>
+ </xsd:complexType>
+</xsd:schema>
+''')
+ schema = etree.XMLSchema(schema)
+ self.assertTrue(schema.validate(tree_valid))
+ self.assertFalse(schema.error_log.filter_from_errors())
+
+ self.assertFalse(schema.validate(tree_invalid))
+ self.assertTrue(schema.error_log.filter_from_errors())
+ self.assertTrue(schema.error_log.filter_types(
+ etree.ErrorTypes.SCHEMAV_ELEMENT_CONTENT))
+
+ self.assertTrue(schema.validate(tree_valid))
+ self.assertFalse(schema.error_log.filter_from_errors())
+
+ self.assertFalse(schema.validate(tree_invalid))
+ self.assertTrue(schema.error_log.filter_from_errors())
+ self.assertTrue(schema.error_log.filter_types(
+ etree.ErrorTypes.SCHEMAV_ELEMENT_CONTENT))
+
+ def test_xmlschema_error_log_path(self):
+ """We don't have a guarantee that there will always be a path
+ for a _LogEntry object (or even a node for which to determine
+ a path), but at least when this test was created schema validation
+ errors always got a node and an XPath value. If that ever changes,
+ we can modify this test to something like::
+
+ self.assertTrue(error_path is None or tree_path == error_path)
+
+ That way, we can at least verify that if we did get a path value
+ it wasn't bogus.
+ """
+ tree = self.parse('<a><b>42</b><b>dada</b></a>')
+ schema = self.parse('''
+<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">
+ <xsd:element name="a" type="AType"/>
+ <xsd:complexType name="AType">
+ <xsd:sequence>
+ <xsd:element name="b" type="xsd:integer" maxOccurs="2"/>
+ </xsd:sequence>
+ </xsd:complexType>
+</xsd:schema>
+''')
+ schema = etree.XMLSchema(schema)
+ schema.validate(tree)
+ tree_path = tree.getpath(tree.findall('b')[1])
+ error_path = schema.error_log[0].path
+ self.assertTrue(tree_path == error_path)
+
+ def test_xmlschema_default_attributes(self):
+ schema = self.parse('''
+<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">
+ <xsd:element name="a" type="AType"/>
+ <xsd:complexType name="AType">
+ <xsd:sequence minOccurs="4" maxOccurs="4">
+ <xsd:element name="b" type="BType" />
+ </xsd:sequence>
+ </xsd:complexType>
+ <xsd:complexType name="BType">
+ <xsd:attribute name="hardy" type="xsd:string" default="hey" />
+ </xsd:complexType>
+</xsd:schema>
+''')
+ schema = etree.XMLSchema(schema, attribute_defaults=True)
+
+ tree = self.parse('<a><b hardy="ho"/><b/><b hardy="ho"/><b/></a>')
+
+ root = tree.getroot()
+ self.assertEqual('ho', root[0].get('hardy'))
+ self.assertEqual(None, root[1].get('hardy'))
+ self.assertEqual('ho', root[2].get('hardy'))
+ self.assertEqual(None, root[3].get('hardy'))
+
+ self.assertTrue(schema(tree))
+
+ root = tree.getroot()
+ self.assertEqual('ho', root[0].get('hardy'))
+ self.assertEqual('hey', root[1].get('hardy'))
+ self.assertEqual('ho', root[2].get('hardy'))
+ self.assertEqual('hey', root[3].get('hardy'))
+
+ def test_xmlschema_parse(self):
+ schema = self.parse('''
+<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">
+ <xsd:element name="a" type="AType"/>
+ <xsd:complexType name="AType">
+ <xsd:sequence>
+ <xsd:element name="b" type="xsd:string" />
+ </xsd:sequence>
+ </xsd:complexType>
+</xsd:schema>
+''')
+ schema = etree.XMLSchema(schema)
+ parser = etree.XMLParser(schema=schema)
+
+ tree_valid = self.parse('<a><b></b></a>', parser=parser)
+ self.assertEqual('a', tree_valid.getroot().tag)
+
+ self.assertRaises(etree.XMLSyntaxError,
+ self.parse, '<a><c></c></a>', parser=parser)
+
+ def test_xmlschema_parse_default_attributes(self):
+ # does not work as of libxml2 2.7.3
+ schema = self.parse('''
+<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">
+ <xsd:element name="a" type="AType"/>
+ <xsd:complexType name="AType">
+ <xsd:sequence minOccurs="4" maxOccurs="4">
+ <xsd:element name="b" type="BType" />
+ </xsd:sequence>
+ </xsd:complexType>
+ <xsd:complexType name="BType">
+ <xsd:attribute name="hardy" type="xsd:string" default="hey" />
+ </xsd:complexType>
+</xsd:schema>
+''')
+ schema = etree.XMLSchema(schema)
+ parser = etree.XMLParser(schema=schema, attribute_defaults=True)
+
+ tree_valid = self.parse('<a><b hardy="ho"/><b/><b hardy="ho"/><b/></a>',
+ parser=parser)
+ root = tree_valid.getroot()
+ self.assertEqual('ho', root[0].get('hardy'))
+ self.assertEqual('hey', root[1].get('hardy'))
+ self.assertEqual('ho', root[2].get('hardy'))
+ self.assertEqual('hey', root[3].get('hardy'))
+
+ def test_xmlschema_parse_default_attributes_schema_config(self):
+ # does not work as of libxml2 2.7.3
+ schema = self.parse('''
+<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">
+ <xsd:element name="a" type="AType"/>
+ <xsd:complexType name="AType">
+ <xsd:sequence minOccurs="4" maxOccurs="4">
+ <xsd:element name="b" type="BType" />
+ </xsd:sequence>
+ </xsd:complexType>
+ <xsd:complexType name="BType">
+ <xsd:attribute name="hardy" type="xsd:string" default="hey" />
+ </xsd:complexType>
+</xsd:schema>
+''')
+ schema = etree.XMLSchema(schema, attribute_defaults=True)
+ parser = etree.XMLParser(schema=schema)
+
+ tree_valid = self.parse('<a><b hardy="ho"/><b/><b hardy="ho"/><b/></a>',
+ parser=parser)
+ root = tree_valid.getroot()
+ self.assertEqual('ho', root[0].get('hardy'))
+ self.assertEqual('hey', root[1].get('hardy'))
+ self.assertEqual('ho', root[2].get('hardy'))
+ self.assertEqual('hey', root[3].get('hardy'))
+
+ def test_xmlschema_parse_fixed_attributes(self):
+ # does not work as of libxml2 2.7.3
+ schema = self.parse('''
+<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">
+ <xsd:element name="a" type="AType"/>
+ <xsd:complexType name="AType">
+ <xsd:sequence minOccurs="3" maxOccurs="3">
+ <xsd:element name="b" type="BType" />
+ </xsd:sequence>
+ </xsd:complexType>
+ <xsd:complexType name="BType">
+ <xsd:attribute name="hardy" type="xsd:string" fixed="hey" />
+ </xsd:complexType>
+</xsd:schema>
+''')
+ schema = etree.XMLSchema(schema)
+ parser = etree.XMLParser(schema=schema, attribute_defaults=True)
+
+ tree_valid = self.parse('<a><b/><b hardy="hey"/><b/></a>',
+ parser=parser)
+ root = tree_valid.getroot()
+ self.assertEqual('hey', root[0].get('hardy'))
+ self.assertEqual('hey', root[1].get('hardy'))
+ self.assertEqual('hey', root[2].get('hardy'))
+
+ def test_xmlschema_stringio(self):
+ schema_file = BytesIO('''
+<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">
+ <xsd:element name="a" type="AType"/>
+ <xsd:complexType name="AType">
+ <xsd:sequence>
+ <xsd:element name="b" type="xsd:string" />
+ </xsd:sequence>
+ </xsd:complexType>
+</xsd:schema>
+''')
+ schema = etree.XMLSchema(file=schema_file)
+ parser = etree.XMLParser(schema=schema)
+
+ tree_valid = self.parse('<a><b></b></a>', parser=parser)
+ self.assertEqual('a', tree_valid.getroot().tag)
+
+ self.assertRaises(etree.XMLSyntaxError,
+ self.parse, '<a><c></c></a>', parser=parser)
+
+ def test_xmlschema_iterparse(self):
+ schema = self.parse('''
+<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">
+ <xsd:element name="a" type="AType"/>
+ <xsd:complexType name="AType">
+ <xsd:sequence>
+ <xsd:element name="b" type="xsd:string" />
+ </xsd:sequence>
+ </xsd:complexType>
+</xsd:schema>
+''')
+ schema = etree.XMLSchema(schema)
+ xml = BytesIO('<a><b></b></a>')
+ events = [ (event, el.tag)
+ for (event, el) in etree.iterparse(xml, schema=schema) ]
+
+ self.assertEqual([('end', 'b'), ('end', 'a')],
+ events)
+
+ def test_xmlschema_iterparse_incomplete(self):
+ schema = self.parse('''
+<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">
+ <xsd:element name="a" type="AType"/>
+ <xsd:complexType name="AType">
+ <xsd:sequence>
+ <xsd:element name="b" type="xsd:string" />
+ </xsd:sequence>
+ </xsd:complexType>
+</xsd:schema>
+''')
+ schema = etree.XMLSchema(schema)
+ xml = BytesIO('<a><b></b></a>')
+ event, element = next(iter(etree.iterparse(xml, schema=schema)))
+ self.assertEqual('end', event)
+ self.assertEqual('b', element.tag)
+
+ def test_xmlschema_iterparse_fail(self):
+ schema = self.parse('''
+<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">
+ <xsd:element name="a" type="AType"/>
+ <xsd:complexType name="AType">
+ <xsd:sequence>
+ <xsd:element name="b" type="xsd:string" />
+ </xsd:sequence>
+ </xsd:complexType>
+</xsd:schema>
+''')
+ schema = etree.XMLSchema(schema)
+ self.assertRaises(
+ etree.XMLSyntaxError,
+ list, etree.iterparse(BytesIO('<a><c></c></a>'), schema=schema))
+
+ def test_xmlschema_elementtree_error(self):
+ self.assertRaises(ValueError, etree.XMLSchema, etree.ElementTree())
+
+ def test_xmlschema_comment_error(self):
+ self.assertRaises(ValueError, etree.XMLSchema, etree.Comment('TEST'))
+
+ def test_xmlschema_illegal_validation_error(self):
+ schema = self.parse('''
+<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">
+ <xsd:element name="a" type="xsd:string"/>
+</xsd:schema>
+''')
+ schema = etree.XMLSchema(schema)
+
+ root = etree.Element('a')
+ root.text = 'TEST'
+ self.assertTrue(schema(root))
+
+ self.assertRaises(ValueError, schema, etree.Comment('TEST'))
+ self.assertRaises(ValueError, schema, etree.PI('a', 'text'))
+ self.assertRaises(ValueError, schema, etree.Entity('text'))
+
+ def test_xmlschema_invalid_schema1(self):
+ schema = self.parse('''\
+<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">
+ <element name="a" type="AType"/>
+ <xsd:complexType name="AType">
+ <xsd:sequence>
+ <xsd:element name="b" type="xsd:string" />
+ </xsd:sequence>
+ </xsd:complexType>
+</xsd:schema>
+''')
+ self.assertRaises(etree.XMLSchemaParseError,
+ etree.XMLSchema, schema)
+
+ def test_xmlschema_invalid_schema2(self):
+ schema = self.parse('<test/>')
+ self.assertRaises(etree.XMLSchemaParseError,
+ etree.XMLSchema, schema)
+
+ def test_xmlschema_file(self):
+ # this will only work if we access the file through a path or
+ # a file object.
+ f = open(fileInTestDir('test.xsd'), 'rb')
+ try:
+ schema = etree.XMLSchema(file=f)
+ finally:
+ f.close()
+ tree_valid = self.parse('<a><b></b></a>')
+ self.assertTrue(schema.validate(tree_valid))
+
+ def test_xmlschema_import_file(self):
+ # this will only work if we access the file through a path or
+ # a file object.
+ schema = etree.XMLSchema(file=fileInTestDir('test_import.xsd'))
+ tree_valid = self.parse(
+ '<a:x xmlns:a="http://codespeak.net/lxml/schema/ns1"><b></b></a:x>')
+ self.assertTrue(schema.validate(tree_valid))
+
+ def test_xmlschema_shortcut(self):
+ tree_valid = self.parse('<a><b></b></a>')
+ tree_invalid = self.parse('<a><c></c></a>')
+ schema = self.parse('''\
+<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">
+ <xsd:element name="a" type="AType"/>
+ <xsd:complexType name="AType">
+ <xsd:sequence>
+ <xsd:element name="b" type="xsd:string" />
+ </xsd:sequence>
+ </xsd:complexType>
+</xsd:schema>
+''')
+ self.assertTrue(tree_valid.xmlschema(schema))
+ self.assertFalse(tree_invalid.xmlschema(schema))
+
+ def test_create_from_partial_doc(self):
+ # this used to crash because the schema part was not properly copied out
+ wsdl = self.parse('''\
+<wsdl:definitions
+ xmlns:wsdl="http://schemas.xmlsoap.org/wsdl/"
+ xmlns:xs="http://www.w3.org/2001/XMLSchema">
+ <wsdl:types>
+ <xs:schema>
+ </xs:schema>
+ </wsdl:types>
+</wsdl:definitions>
+ ''')
+ schema_element = wsdl.find(
+ "{http://schemas.xmlsoap.org/wsdl/}types/"
+ "{http://www.w3.org/2001/XMLSchema}schema"
+ )
+ etree.XMLSchema(schema_element)
+ etree.XMLSchema(schema_element)
+ etree.XMLSchema(schema_element)
+
+
+class ETreeXMLSchemaResolversTestCase(HelperTestCase):
+ resolver_schema_int = BytesIO("""\
+<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema"
+ xmlns:etype="http://codespeak.net/lxml/test/external"
+ targetNamespace="http://codespeak.net/lxml/test/internal">
+ <xsd:import namespace="http://codespeak.net/lxml/test/external" schemaLocation="XXX.xsd" />
+ <xsd:element name="a" type="etype:AType"/>
+</xsd:schema>""")
+
+ resolver_schema_int2 = BytesIO("""\
+<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema"
+ xmlns:etype="http://codespeak.net/lxml/test/external"
+ targetNamespace="http://codespeak.net/lxml/test/internal">
+ <xsd:import namespace="http://codespeak.net/lxml/test/external" schemaLocation="YYY.xsd" />
+ <xsd:element name="a" type="etype:AType"/>
+</xsd:schema>""")
+
+ resolver_schema_ext = """\
+<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema"
+ targetNamespace="http://codespeak.net/lxml/test/external">
+ <xsd:complexType name="AType">
+ <xsd:sequence><xsd:element name="b" type="xsd:string" minOccurs="0" maxOccurs="unbounded" /></xsd:sequence>
+ </xsd:complexType>
+</xsd:schema>"""
+
+ class simple_resolver(etree.Resolver):
+ def __init__(self, schema):
+ self.schema = schema
+
+ def resolve(self, url, id, context):
+ assert url == 'XXX.xsd'
+ return self.resolve_string(self.schema, context)
+
+ # tests:
+
+ def test_xmlschema_resolvers(self):
+ # test that resolvers work with schema.
+ parser = etree.XMLParser()
+ parser.resolvers.add(self.simple_resolver(self.resolver_schema_ext))
+ schema_doc = etree.parse(self.resolver_schema_int, parser = parser)
+ schema = etree.XMLSchema(schema_doc)
+
+ def test_xmlschema_resolvers_root(self):
+ # test that the default resolver will get called if there's no
+ # specific parser resolver.
+ root_resolver = self.simple_resolver(self.resolver_schema_ext)
+ etree.get_default_parser().resolvers.add(root_resolver)
+ schema_doc = etree.parse(self.resolver_schema_int)
+ schema = etree.XMLSchema(schema_doc)
+ etree.get_default_parser().resolvers.remove(root_resolver)
+
+ def test_xmlschema_resolvers_noroot(self):
+ # test that the default resolver will not get called when a
+ # more specific resolver is registered.
+
+ class res_root(etree.Resolver):
+ def resolve(self, url, id, context):
+ assert False
+ return None
+
+ root_resolver = res_root()
+ etree.get_default_parser().resolvers.add(root_resolver)
+
+ parser = etree.XMLParser()
+ parser.resolvers.add(self.simple_resolver(self.resolver_schema_ext))
+
+ schema_doc = etree.parse(self.resolver_schema_int, parser = parser)
+ schema = etree.XMLSchema(schema_doc)
+ etree.get_default_parser().resolvers.remove(root_resolver)
+
+ def test_xmlschema_nested_resolvers(self):
+ # test that resolvers work in a nested fashion.
+
+ resolver_schema = self.resolver_schema_ext
+
+ class res_nested(etree.Resolver):
+ def __init__(self, ext_schema):
+ self.ext_schema = ext_schema
+
+ def resolve(self, url, id, context):
+ assert url == 'YYY.xsd'
+ return self.resolve_string(self.ext_schema, context)
+
+ class res(etree.Resolver):
+ def __init__(self, ext_schema_1, ext_schema_2):
+ self.ext_schema_1 = ext_schema_1
+ self.ext_schema_2 = ext_schema_2
+
+ def resolve(self, url, id, context):
+ assert url == 'XXX.xsd'
+
+ new_parser = etree.XMLParser()
+ new_parser.resolvers.add(res_nested(self.ext_schema_2))
+ new_schema_doc = etree.parse(self.ext_schema_1, parser = new_parser)
+ new_schema = etree.XMLSchema(new_schema_doc)
+
+ return self.resolve_string(resolver_schema, context)
+
+ parser = etree.XMLParser()
+ parser.resolvers.add(res(self.resolver_schema_int2, self.resolver_schema_ext))
+ schema_doc = etree.parse(self.resolver_schema_int, parser = parser)
+ schema = etree.XMLSchema(schema_doc)
+
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([unittest.makeSuite(ETreeXMLSchemaTestCase)])
+ suite.addTests([unittest.makeSuite(ETreeXMLSchemaResolversTestCase)])
+ suite.addTests(
+ [make_doctest('../../../doc/validation.txt')])
+ return suite
+
+
+if __name__ == '__main__':
+ print('to test use test.py %s' % __file__)
diff --git a/src/lxml/tests/test_xpathevaluator.py b/src/lxml/tests/test_xpathevaluator.py
new file mode 100644
index 0000000..13ee97e
--- /dev/null
+++ b/src/lxml/tests/test_xpathevaluator.py
@@ -0,0 +1,748 @@
+# -*- coding: utf-8 -*-
+
+"""
+Test cases related to XPath evaluation and the XPath class
+"""
+
+from __future__ import absolute_import
+
+import unittest, sys
+
+from .common_imports import etree, HelperTestCase, _bytes, BytesIO, doctest, make_doctest
+
+
+class ETreeXPathTestCase(HelperTestCase):
+ """XPath tests etree"""
+
+ def test_xpath_boolean(self):
+ tree = self.parse('<a><b></b><b></b></a>')
+ self.assertTrue(tree.xpath('boolean(/a/b)'))
+ self.assertTrue(not tree.xpath('boolean(/a/c)'))
+
+ def test_xpath_number(self):
+ tree = self.parse('<a>1</a>')
+ self.assertEqual(1.,
+ tree.xpath('number(/a)'))
+ tree = self.parse('<a>A</a>')
+ actual = str(tree.xpath('number(/a)'))
+ expected = ['nan', '1.#qnan', 'nanq']
+ if not actual.lower() in expected:
+ self.fail('Expected a NAN value, got %s' % actual)
+
+ def test_xpath_string(self):
+ tree = self.parse('<a>Foo</a>')
+ self.assertEqual('Foo',
+ tree.xpath('string(/a/text())'))
+
+ def test_xpath_document_root(self):
+ tree = self.parse('<a><b/></a>')
+ self.assertEqual([],
+ tree.xpath('/'))
+
+ def test_xpath_namespace(self):
+ tree = self.parse('<a xmlns="test" xmlns:p="myURI"/>')
+ self.assertTrue((None, "test") in tree.xpath('namespace::*'))
+ self.assertTrue(('p', 'myURI') in tree.xpath('namespace::*'))
+
+ def test_xpath_namespace_empty(self):
+ tree = self.parse('<a/>')
+ self.assertEqual([('xml', 'http://www.w3.org/XML/1998/namespace')],
+ tree.xpath('namespace::*'))
+
+ def test_xpath_list_elements(self):
+ tree = self.parse('<a><b>Foo</b><b>Bar</b></a>')
+ root = tree.getroot()
+ self.assertEqual([root[0], root[1]],
+ tree.xpath('/a/b'))
+
+ def test_xpath_list_nothing(self):
+ tree = self.parse('<a><b/></a>')
+ self.assertEqual([],
+ tree.xpath('/a/c'))
+ # this seems to take a different code path, but should also return nothing
+ self.assertEqual([],
+ tree.xpath('/a/c/text()'))
+
+ def test_xpath_list_text(self):
+ tree = self.parse('<a><b>Foo</b><b>Bar</b></a>')
+ root = tree.getroot()
+ self.assertEqual(['Foo', 'Bar'],
+ tree.xpath('/a/b/text()'))
+
+ def test_xpath_list_text_parent(self):
+ tree = self.parse('<a><b>FooBar</b><b>BarFoo</b></a>')
+ root = tree.getroot()
+ self.assertEqual(['FooBar', 'BarFoo'],
+ tree.xpath('/a/b/text()'))
+ self.assertEqual([root[0], root[1]],
+ [r.getparent() for r in tree.xpath('/a/b/text()')])
+
+ def test_xpath_list_text_parent_no_smart_strings(self):
+ tree = self.parse('<a><b>FooBar</b><b>BarFoo</b></a>')
+ root = tree.getroot()
+ self.assertEqual(['FooBar', 'BarFoo'],
+ tree.xpath('/a/b/text()', smart_strings=True))
+ self.assertEqual([root[0], root[1]],
+ [r.getparent() for r in
+ tree.xpath('/a/b/text()', smart_strings=True)])
+ self.assertEqual([None, None],
+ [r.attrname for r in
+ tree.xpath('/a/b/text()', smart_strings=True)])
+
+ self.assertEqual(['FooBar', 'BarFoo'],
+ tree.xpath('/a/b/text()', smart_strings=False))
+ self.assertEqual([False, False],
+ [hasattr(r, 'getparent') for r in
+ tree.xpath('/a/b/text()', smart_strings=False)])
+ self.assertEqual([None, None],
+ [r.attrname for r in
+ tree.xpath('/a/b/text()', smart_strings=True)])
+
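+ # Note (illustrative, not part of the original tests): string results of
+ # xpath() are "smart strings" by default, i.e. str subclasses that know
+ # their origin via getparent() and attrname; smart_strings=False returns
+ # plain strings instead, e.g. (sketch):
+ #   plain = tree.xpath('/a/b/text()', smart_strings=False)[0]
+ #   hasattr(plain, 'getparent')   # -> False
+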
+ def test_xpath_list_unicode_text_parent(self):
+ xml = _bytes('<a><b>FooBar\\u0680\\u3120</b><b>BarFoo\\u0680\\u3120</b></a>').decode("unicode_escape")
+ tree = self.parse(xml.encode('utf-8'))
+ root = tree.getroot()
+ self.assertEqual([_bytes('FooBar\\u0680\\u3120').decode("unicode_escape"),
+ _bytes('BarFoo\\u0680\\u3120').decode("unicode_escape")],
+ tree.xpath('/a/b/text()'))
+ self.assertEqual([root[0], root[1]],
+ [r.getparent() for r in tree.xpath('/a/b/text()')])
+
+ def test_xpath_list_attribute(self):
+ tree = self.parse('<a b="B" c="C"/>')
+ self.assertEqual(['B'],
+ tree.xpath('/a/@b'))
+
+ def test_xpath_list_attribute_parent(self):
+ tree = self.parse('<a b="BaSdFgHjKl" c="CqWeRtZuI"/>')
+ results = tree.xpath('/a/@c')
+ self.assertEqual(1, len(results))
+ self.assertEqual('CqWeRtZuI', results[0])
+ self.assertEqual(tree.getroot().tag, results[0].getparent().tag)
+
+ def test_xpath_list_attribute_parent_no_smart_strings(self):
+ tree = self.parse('<a b="BaSdFgHjKl" c="CqWeRtZuI"/>')
+
+ results = tree.xpath('/a/@c', smart_strings=True)
+ self.assertEqual(1, len(results))
+ self.assertEqual('CqWeRtZuI', results[0])
+ self.assertEqual('c', results[0].attrname)
+ self.assertEqual(tree.getroot().tag, results[0].getparent().tag)
+
+ results = tree.xpath('/a/@c', smart_strings=False)
+ self.assertEqual(1, len(results))
+ self.assertEqual('CqWeRtZuI', results[0])
+ self.assertEqual(False, hasattr(results[0], 'getparent'))
+ self.assertEqual(False, hasattr(results[0], 'attrname'))
+
+ def test_xpath_text_from_other_document(self):
+ xml_data = '''
+ <table>
+ <item xml:id="k1"><value>v1</value></item>
+ <item xml:id="k2"><value>v2</value></item>
+ </table>
+ '''
+
+ def lookup(dummy, id):
+ return etree.XML(xml_data).xpath('id(%r)' % id)
+ functions = {(None, 'lookup') : lookup}
+
+ root = etree.XML('<dummy/>')
+ values = root.xpath("lookup('k1')/value/text()",
+ extensions=functions)
+ self.assertEqual(['v1'], values)
+ self.assertEqual('value', values[0].getparent().tag)
+
+ def test_xpath_list_comment(self):
+ tree = self.parse('<a><!-- Foo --></a>')
+ self.assertEqual(['<!-- Foo -->'],
+ list(map(repr, tree.xpath('/a/node()'))))
+
+ def test_rel_xpath_boolean(self):
+ root = etree.XML('<a><b><c/></b></a>')
+ el = root[0]
+ self.assertTrue(el.xpath('boolean(c)'))
+ self.assertTrue(not el.xpath('boolean(d)'))
+
+ def test_rel_xpath_list_elements(self):
+ tree = self.parse('<a><c><b>Foo</b><b>Bar</b></c><c><b>Hey</b></c></a>')
+ root = tree.getroot()
+ c = root[0]
+ self.assertEqual([c[0], c[1]],
+ c.xpath('b'))
+ self.assertEqual([c[0], c[1], root[1][0]],
+ c.xpath('//b'))
+
+ def test_xpath_ns(self):
+ tree = self.parse('<a xmlns="uri:a"><b></b></a>')
+ root = tree.getroot()
+ self.assertEqual(
+ [root[0]],
+ tree.xpath('//foo:b', namespaces={'foo': 'uri:a'}))
+ self.assertEqual(
+ [],
+ tree.xpath('//foo:b', namespaces={'foo': 'uri:c'}))
+ self.assertEqual(
+ [root[0]],
+ root.xpath('//baz:b', namespaces={'baz': 'uri:a'}))
+
+ def test_xpath_ns_none(self):
+ tree = self.parse('<a xmlns="uri:a"><b></b></a>')
+ root = tree.getroot()
+ self.assertRaises(
+ TypeError,
+ root.xpath, '//b', namespaces={None: 'uri:a'})
+
+ def test_xpath_ns_empty(self):
+ tree = self.parse('<a xmlns="uri:a"><b></b></a>')
+ root = tree.getroot()
+ self.assertRaises(
+ TypeError,
+ root.xpath, '//b', namespaces={'': 'uri:a'})
+
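+ # Note (illustrative, not from the original tests): XPath 1.0 has no notion
+ # of a default namespace prefix, so lxml rejects None and '' as prefixes in
+ # the namespaces mapping (see the two tests above); the namespace has to be
+ # bound to an explicit prefix instead, e.g.:
+ #   tree.xpath('//x:b', namespaces={'x': 'uri:a'})
+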
+ def test_xpath_error(self):
+ tree = self.parse('<a/>')
+ self.assertRaises(etree.XPathEvalError, tree.xpath, '\\fad')
+
+ def test_xpath_class_error(self):
+ self.assertRaises(SyntaxError, etree.XPath, '\\fad')
+ self.assertRaises(etree.XPathSyntaxError, etree.XPath, '\\fad')
+
+ def test_xpath_prefix_error(self):
+ tree = self.parse('<a/>')
+ self.assertRaises(etree.XPathEvalError, tree.xpath, '/fa:d')
+
+ def test_xpath_class_prefix_error(self):
+ tree = self.parse('<a/>')
+ xpath = etree.XPath("/fa:d")
+ self.assertRaises(etree.XPathEvalError, xpath, tree)
+
+ def test_elementtree_getpath(self):
+ a = etree.Element("a")
+ b = etree.SubElement(a, "b")
+ c = etree.SubElement(a, "c")
+ d1 = etree.SubElement(c, "d")
+ d2 = etree.SubElement(c, "d")
+
+ tree = etree.ElementTree(a)
+ self.assertEqual('/a/c/d',
+ tree.getpath(d2)[:6])
+ self.assertEqual([d2],
+ tree.xpath(tree.getpath(d2)))
+
+ def test_elementtree_getpath_partial(self):
+ a = etree.Element("a")
+ b = etree.SubElement(a, "b")
+ c = etree.SubElement(a, "c")
+ d1 = etree.SubElement(c, "d")
+ d2 = etree.SubElement(c, "d")
+
+ tree = etree.ElementTree(c)
+ self.assertEqual('/c/d',
+ tree.getpath(d2)[:4])
+ self.assertEqual([d2],
+ tree.xpath(tree.getpath(d2)))
+
+ def test_xpath_evaluator(self):
+ tree = self.parse('<a><b><c></c></b></a>')
+ e = etree.XPathEvaluator(tree)
+ root = tree.getroot()
+ self.assertEqual(
+ [root],
+ e('//a'))
+
+ def test_xpath_evaluator_tree(self):
+ tree = self.parse('<a><b><c></c></b></a>')
+ child_tree = etree.ElementTree(tree.getroot()[0])
+ e = etree.XPathEvaluator(child_tree)
+ self.assertEqual(
+ [],
+ e('a'))
+ root = child_tree.getroot()
+ self.assertEqual(
+ [root[0]],
+ e('c'))
+
+ def test_xpath_evaluator_tree_absolute(self):
+ tree = self.parse('<a><b><c></c></b></a>')
+ child_tree = etree.ElementTree(tree.getroot()[0])
+ e = etree.XPathEvaluator(child_tree)
+ self.assertEqual(
+ [],
+ e('/a'))
+ root = child_tree.getroot()
+ self.assertEqual(
+ [root],
+ e('/b'))
+ self.assertEqual(
+ [],
+ e('/c'))
+
+ def test_xpath_evaluator_element(self):
+ tree = self.parse('<a><b><c></c></b></a>')
+ root = tree.getroot()
+ e = etree.XPathEvaluator(root[0])
+ self.assertEqual(
+ [root[0][0]],
+ e('c'))
+
+ def test_xpath_extensions(self):
+ def foo(evaluator, a):
+ return 'hello %s' % a
+ extension = {(None, 'foo'): foo}
+ tree = self.parse('<a><b></b></a>')
+ e = etree.XPathEvaluator(tree, extensions=[extension])
+ self.assertEqual(
+ "hello you", e("foo('you')"))
+
+ def test_xpath_extensions_wrong_args(self):
+ def foo(evaluator, a, b):
+ return "hello %s and %s" % (a, b)
+ extension = {(None, 'foo'): foo}
+ tree = self.parse('<a><b></b></a>')
+ e = etree.XPathEvaluator(tree, extensions=[extension])
+ self.assertRaises(TypeError, e, "foo('you')")
+
+ def test_xpath_extensions_error(self):
+ def foo(evaluator, a):
+ return 1/0
+ extension = {(None, 'foo'): foo}
+ tree = self.parse('<a/>')
+ e = etree.XPathEvaluator(tree, extensions=[extension])
+ self.assertRaises(ZeroDivisionError, e, "foo('test')")
+
+ def test_xpath_extensions_nodes(self):
+ def f(evaluator, arg):
+ r = etree.Element('results')
+ b = etree.SubElement(r, 'result')
+ b.text = 'Hoi'
+ b = etree.SubElement(r, 'result')
+ b.text = 'Dag'
+ return r
+
+ x = self.parse('<a/>')
+ e = etree.XPathEvaluator(x, extensions=[{(None, 'foo'): f}])
+ r = e("foo('World')/result")
+ self.assertEqual(2, len(r))
+ self.assertEqual('Hoi', r[0].text)
+ self.assertEqual('Dag', r[1].text)
+
+ def test_xpath_extensions_nodes_append(self):
+ def f(evaluator, nodes):
+ r = etree.SubElement(nodes[0], 'results')
+ b = etree.SubElement(r, 'result')
+ b.text = 'Hoi'
+ b = etree.SubElement(r, 'result')
+ b.text = 'Dag'
+ return r
+
+ x = self.parse('<a/>')
+ e = etree.XPathEvaluator(x, extensions=[{(None, 'foo'): f}])
+ r = e("foo(/*)/result")
+ self.assertEqual(2, len(r))
+ self.assertEqual('Hoi', r[0].text)
+ self.assertEqual('Dag', r[1].text)
+
+ def test_xpath_extensions_nodes_append2(self):
+ def f(evaluator, nodes):
+ r = etree.Element('results')
+ b = etree.SubElement(r, 'result')
+ b.text = 'Hoi'
+ b = etree.SubElement(r, 'result')
+ b.text = 'Dag'
+ r.append(nodes[0])
+ return r
+
+ x = self.parse('<result>Honk</result>')
+ e = etree.XPathEvaluator(x, extensions=[{(None, 'foo'): f}])
+ r = e("foo(/*)/result")
+ self.assertEqual(3, len(r))
+ self.assertEqual('Hoi', r[0].text)
+ self.assertEqual('Dag', r[1].text)
+ self.assertEqual('Honk', r[2].text)
+
+ def test_xpath_context_node(self):
+ tree = self.parse('<root><a/><b><c/></b></root>')
+
+ check_call = []
+ def check_context(ctxt, nodes):
+ self.assertEqual(len(nodes), 1)
+ check_call.append(nodes[0].tag)
+ self.assertEqual(ctxt.context_node, nodes[0])
+ return True
+
+ find = etree.XPath("//*[p:foo(.)]",
+ namespaces={'p' : 'ns'},
+ extensions=[{('ns', 'foo') : check_context}])
+ find(tree)
+
+ check_call.sort()
+ self.assertEqual(check_call, ["a", "b", "c", "root"])
+
+ def test_xpath_eval_context_propagation(self):
+ tree = self.parse('<root><a/><b><c/></b></root>')
+
+ check_call = {}
+ def check_context(ctxt, nodes):
+ self.assertEqual(len(nodes), 1)
+ tag = nodes[0].tag
+ # eval_context is empty during the "b" call and holds "b" during the "c" call
+ check_call[tag] = ctxt.eval_context.get("b")
+ ctxt.eval_context[tag] = tag
+ return True
+
+ find = etree.XPath("//b[p:foo(.)]/c[p:foo(.)]",
+ namespaces={'p' : 'ns'},
+ extensions=[{('ns', 'foo') : check_context}])
+ result = find(tree)
+
+ self.assertEqual(result, [tree.getroot()[1][0]])
+ self.assertEqual(check_call, {'b':None, 'c':'b'})
+
+ def test_xpath_eval_context_clear(self):
+ tree = self.parse('<root><a/><b><c/></b></root>')
+
+ check_call = {}
+ def check_context(ctxt):
+ check_call["done"] = True
+ # context must be empty for each new evaluation
+ self.assertEqual(len(ctxt.eval_context), 0)
+ ctxt.eval_context["test"] = True
+ return True
+
+ find = etree.XPath("//b[p:foo()]",
+ namespaces={'p' : 'ns'},
+ extensions=[{('ns', 'foo') : check_context}])
+ result = find(tree)
+
+ self.assertEqual(result, [tree.getroot()[1]])
+ self.assertEqual(check_call["done"], True)
+
+ check_call.clear()
+ find = etree.XPath("//b[p:foo()]",
+ namespaces={'p' : 'ns'},
+ extensions=[{('ns', 'foo') : check_context}])
+ result = find(tree)
+
+ self.assertEqual(result, [tree.getroot()[1]])
+ self.assertEqual(check_call["done"], True)
+
+ def test_xpath_variables(self):
+ x = self.parse('<a attr="true"/>')
+ e = etree.XPathEvaluator(x)
+
+ expr = "/a[@attr=$aval]"
+ r = e(expr, aval=1)
+ self.assertEqual(0, len(r))
+
+ r = e(expr, aval="true")
+ self.assertEqual(1, len(r))
+ self.assertEqual("true", r[0].get('attr'))
+
+ r = e(expr, aval=True)
+ self.assertEqual(1, len(r))
+ self.assertEqual("true", r[0].get('attr'))
+
+ def test_xpath_variables_nodeset(self):
+ x = self.parse('<a attr="true"/>')
+ e = etree.XPathEvaluator(x)
+
+ element = etree.Element("test-el")
+ etree.SubElement(element, "test-sub")
+ expr = "$value"
+ r = e(expr, value=element)
+ self.assertEqual(1, len(r))
+ self.assertEqual(element.tag, r[0].tag)
+ self.assertEqual(element[0].tag, r[0][0].tag)
+
+ def test_xpath_extensions_mix(self):
+ x = self.parse('<a attr="true"><test/></a>')
+
+ class LocalException(Exception):
+ pass
+
+ def foo(evaluator, a, varval):
+ etree.Element("DUMMY")
+ if varval == 0:
+ raise LocalException
+ elif varval == 1:
+ return ()
+ elif varval == 2:
+ return None
+ elif varval == 3:
+ return a[0][0]
+ a = a[0]
+ if a.get("attr") == str(varval):
+ return a
+ else:
+ return etree.Element("NODE")
+
+ extension = {(None, 'foo'): foo}
+ e = etree.XPathEvaluator(x, extensions=[extension])
+ del x
+
+ self.assertRaises(LocalException, e, "foo(., 0)")
+ self.assertRaises(LocalException, e, "foo(., $value)", value=0)
+
+ r = e("foo(., $value)", value=1)
+ self.assertEqual(len(r), 0)
+
+ r = e("foo(., 1)")
+ self.assertEqual(len(r), 0)
+
+ r = e("foo(., $value)", value=2)
+ self.assertEqual(len(r), 0)
+
+ r = e("foo(., $value)", value=3)
+ self.assertEqual(len(r), 1)
+ self.assertEqual(r[0].tag, "test")
+
+ r = e("foo(., $value)", value="false")
+ self.assertEqual(len(r), 1)
+ self.assertEqual(r[0].tag, "NODE")
+
+ r = e("foo(., 'false')")
+ self.assertEqual(len(r), 1)
+ self.assertEqual(r[0].tag, "NODE")
+
+ r = e("foo(., 'true')")
+ self.assertEqual(len(r), 1)
+ self.assertEqual(r[0].tag, "a")
+ self.assertEqual(r[0][0].tag, "test")
+
+ r = e("foo(., $value)", value="true")
+ self.assertEqual(len(r), 1)
+ self.assertEqual(r[0].tag, "a")
+
+ self.assertRaises(LocalException, e, "foo(., 0)")
+ self.assertRaises(LocalException, e, "foo(., $value)", value=0)
+
+
+class ETreeXPathClassTestCase(HelperTestCase):
+ "Tests for the XPath class"
+ def test_xpath_compile_doc(self):
+ x = self.parse('<a attr="true"/>')
+
+ expr = etree.XPath("/a[@attr != 'true']")
+ r = expr(x)
+ self.assertEqual(0, len(r))
+
+ expr = etree.XPath("/a[@attr = 'true']")
+ r = expr(x)
+ self.assertEqual(1, len(r))
+
+ expr = etree.XPath( expr.path )
+ r = expr(x)
+ self.assertEqual(1, len(r))
+
+ def test_xpath_compile_element(self):
+ x = self.parse('<a><b/><c/></a>')
+ root = x.getroot()
+
+ expr = etree.XPath("./b")
+ r = expr(root)
+ self.assertEqual(1, len(r))
+ self.assertEqual('b', r[0].tag)
+
+ expr = etree.XPath("./*")
+ r = expr(root)
+ self.assertEqual(2, len(r))
+
+ def test_xpath_compile_vars(self):
+ x = self.parse('<a attr="true"/>')
+
+ expr = etree.XPath("/a[@attr=$aval]")
+ r = expr(x, aval=False)
+ self.assertEqual(0, len(r))
+
+ r = expr(x, aval=True)
+ self.assertEqual(1, len(r))
+
+ def test_xpath_compile_error(self):
+ self.assertRaises(SyntaxError, etree.XPath, '\\fad')
+
+ def test_xpath_elementtree_error(self):
+ self.assertRaises(ValueError, etree.XPath('*'), etree.ElementTree())
+
+
+class ETreeXPathExsltTestCase(HelperTestCase):
+ "Tests for the EXSLT support in XPath (requires libxslt 1.1.25+)"
+
+ NSMAP = dict(
+ date = "http://exslt.org/dates-and-times",
+ math = "http://exslt.org/math",
+ set = "http://exslt.org/sets",
+ str = "http://exslt.org/strings",
+ )
+
+ def test_xpath_exslt_functions_date(self):
+ tree = self.parse('<a><b>2009-11-12</b><b>2008-12-11</b></a>')
+
+ match_dates = tree.xpath('//b[date:year(string()) = 2009]',
+ namespaces=self.NSMAP)
+ self.assertTrue(match_dates, str(match_dates))
+ self.assertEqual(len(match_dates), 1, str(match_dates))
+ self.assertEqual(match_dates[0].text, '2009-11-12')
+
+ def test_xpath_exslt_functions_strings(self):
+ tree = self.parse('<a><b>2009-11-12</b><b>2008-12-11</b></a>')
+
+ aligned_date = tree.xpath(
+ 'str:align(string(//b[1]), "%s", "center")' % ('-'*20),
+ namespaces=self.NSMAP)
+ self.assertTrue(aligned_date, str(aligned_date))
+ self.assertEqual(aligned_date, '-----2009-11-12-----')
+
+
+class ETreeETXPathClassTestCase(HelperTestCase):
+ "Tests for the ETXPath class"
+ def test_xpath_compile_ns(self):
+ x = self.parse('<a><b xmlns="nsa"/><b xmlns="nsb"/></a>')
+
+ expr = etree.ETXPath("/a/{nsa}b")
+ r = expr(x)
+ self.assertEqual(1, len(r))
+ self.assertEqual('{nsa}b', r[0].tag)
+
+ expr = etree.ETXPath("/a/{nsb}b")
+ r = expr(x)
+ self.assertEqual(1, len(r))
+ self.assertEqual('{nsb}b', r[0].tag)
+
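+ # Note (illustrative, not from the original tests): unlike etree.XPath,
+ # ETXPath accepts ElementTree-style {namespace}tag notation directly in the
+ # expression, so no separate namespaces mapping is needed; the sketch
+ #   etree.ETXPath('//{uri:a}b')
+ # is roughly equivalent to
+ #   etree.XPath('//x:b', namespaces={'x': 'uri:a'})
+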
+ # disabled this test as non-ASCII characters in namespace URIs are
+ # not acceptable
+ def _test_xpath_compile_unicode(self):
+ x = self.parse(_bytes('<a><b xmlns="http://nsa/\\uf8d2"/><b xmlns="http://nsb/\\uf8d1"/></a>'
+ ).decode("unicode_escape"))
+
+ expr = etree.ETXPath(_bytes("/a/{http://nsa/\\uf8d2}b").decode("unicode_escape"))
+ r = expr(x)
+ self.assertEqual(1, len(r))
+ self.assertEqual(_bytes('{http://nsa/\\uf8d2}b').decode("unicode_escape"), r[0].tag)
+
+ expr = etree.ETXPath(_bytes("/a/{http://nsb/\\uf8d1}b").decode("unicode_escape"))
+ r = expr(x)
+ self.assertEqual(1, len(r))
+ self.assertEqual(_bytes('{http://nsb/\\uf8d1}b').decode("unicode_escape"), r[0].tag)
+
+SAMPLE_XML = etree.parse(BytesIO("""
+<body>
+ <tag>text</tag>
+ <section>
+ <tag>subtext</tag>
+ </section>
+ <tag />
+ <tag />
+</body>
+"""))
+
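+# Helpers for the xpath() doctest below: tag() and tag_or_value() normalize
+# XPath results for comparison, and the *Test functions are exposed to the
+# evaluator as XPath extension functions through the 'extension' mapping.
+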
+def tag(elem):
+ return elem.tag
+
+def tag_or_value(elem):
+ return getattr(elem, 'tag', elem)
+
+def stringTest(ctxt, s1):
+ return "Hello "+s1
+
+def stringListTest(ctxt, s1):
+ return ["Hello "] + list(s1) + ["!"]
+
+def floatTest(ctxt, f1):
+ return f1+4
+
+def booleanTest(ctxt, b1):
+ return not b1
+
+def setTest(ctxt, st1):
+ return st1[0]
+
+def setTest2(ctxt, st1):
+ return st1[0:2]
+
+def argsTest1(ctxt, s, f, b, st):
+ return ", ".join(map(str, (s, f, b, list(map(tag, st)))))
+
+def argsTest2(ctxt, st1, st2):
+ st1.extend(st2)
+ return st1
+
+def resultTypesTest(ctxt):
+ return [None,None]
+
+def resultTypesTest2(ctxt):
+ return resultTypesTest
+
+uri = "http://www.example.com/"
+
+extension = {(None, 'stringTest'): stringTest,
+ (None, 'stringListTest'): stringListTest,
+ (None, 'floatTest'): floatTest,
+ (None, 'booleanTest'): booleanTest,
+ (None, 'setTest'): setTest,
+ (None, 'setTest2'): setTest2,
+ (None, 'argsTest1'): argsTest1,
+ (None, 'argsTest2'): argsTest2,
+ (None, 'resultTypesTest'): resultTypesTest,
+ (None, 'resultTypesTest2'): resultTypesTest2,}
+
+def xpath():
+ """
+ Test xpath extension functions.
+
+ >>> root = SAMPLE_XML
+ >>> e = etree.XPathEvaluator(root, extensions=[extension])
+ >>> e("stringTest('you')")
+ 'Hello you'
+ >>> e(_bytes("stringTest('\\\\xe9lan')").decode("unicode_escape"))
+ u'Hello \\xe9lan'
+ >>> e("stringTest('you','there')") #doctest: +ELLIPSIS
+ Traceback (most recent call last):
+ ...
+ TypeError: stringTest() takes... 2 ...arguments ...
+ >>> e("floatTest(2)")
+ 6.0
+ >>> e("booleanTest(true())")
+ False
+ >>> list(map(tag, e("setTest(/body/tag)")))
+ ['tag']
+ >>> list(map(tag, e("setTest2(/body/*)")))
+ ['tag', 'section']
+ >>> list(map(tag_or_value, e("stringListTest(/body/tag)")))
+ ['Hello ', 'tag', 'tag', 'tag', '!']
+ >>> e("argsTest1('a',1.5,true(),/body/tag)")
+ "a, 1.5, True, ['tag', 'tag', 'tag']"
+ >>> list(map(tag, e("argsTest2(/body/tag, /body/section)")))
+ ['tag', 'section', 'tag', 'tag']
+ >>> e("resultTypesTest()")
+ Traceback (most recent call last):
+ ...
+ XPathResultError: This is not a supported node-set result: None
+ >>> try:
+ ... e("resultTypesTest2()")
+ ... except etree.XPathResultError:
+ ... print("Got error")
+ Got error
+ """
+
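+# On Python 3 the expected doctest output differs (no u'' string prefix, the
+# exception is reported with its full module path, and the TypeError message
+# mentions "positional arguments"), so the docstring is rewritten below.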
+if sys.version_info[0] >= 3:
+ xpath.__doc__ = xpath.__doc__.replace(" u'", " '")
+ xpath.__doc__ = xpath.__doc__.replace(" XPathResultError",
+ " lxml.etree.XPathResultError")
+ xpath.__doc__ = xpath.__doc__.replace(" exactly 2 arguments",
+ " exactly 2 positional arguments")
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([unittest.makeSuite(ETreeXPathTestCase)])
+ suite.addTests([unittest.makeSuite(ETreeXPathClassTestCase)])
+ if etree.LIBXSLT_COMPILED_VERSION >= (1,1,25):
+ suite.addTests([unittest.makeSuite(ETreeXPathExsltTestCase)])
+ suite.addTests([unittest.makeSuite(ETreeETXPathClassTestCase)])
+ suite.addTests([doctest.DocTestSuite()])
+ suite.addTests(
+ [make_doctest('../../../doc/xpathxslt.txt')])
+ return suite
+
+if __name__ == '__main__':
+ print('to run the tests, use test.py %s' % __file__)
diff --git a/src/lxml/tests/test_xslt.py b/src/lxml/tests/test_xslt.py
new file mode 100644
index 0000000..cde2335
--- /dev/null
+++ b/src/lxml/tests/test_xslt.py
@@ -0,0 +1,2093 @@
+# -*- coding: utf-8 -*-
+
+"""
+Test cases related to XSLT processing
+"""
+
+from __future__ import absolute_import
+
+import io
+import sys
+import copy
+import gzip
+import os.path
+import unittest
+import contextlib
+from textwrap import dedent
+from tempfile import NamedTemporaryFile, mkdtemp
+
+is_python3 = sys.version_info[0] >= 3
+
+try:
+ unicode
+except NameError: # Python 3
+ unicode = str
+
+try:
+ basestring
+except NameError: # Python 3
+ basestring = str
+
+from .common_imports import (
+ etree, BytesIO, HelperTestCase, fileInTestDir, _bytes, make_doctest, skipif
+)
+
+
+class ETreeXSLTTestCase(HelperTestCase):
+ """XSLT tests etree"""
+
+ def test_xslt(self):
+ tree = self.parse('<a><b>B</b><c>C</c></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="*" />
+ <xsl:template match="/">
+ <foo><xsl:value-of select="/a/b/text()" /></foo>
+ </xsl:template>
+</xsl:stylesheet>''')
+
+ st = etree.XSLT(style)
+ res = st(tree)
+ self.assertEqual('''\
+<?xml version="1.0"?>
+<foo>B</foo>
+''',
+ str(res))
+
+ def test_xslt_elementtree_error(self):
+ self.assertRaises(ValueError, etree.XSLT, etree.ElementTree())
+
+ def test_xslt_input_none(self):
+ self.assertRaises(TypeError, etree.XSLT, None)
+
+ def test_xslt_invalid_stylesheet(self):
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:stylesheet />
+</xsl:stylesheet>''')
+
+ self.assertRaises(
+ etree.XSLTParseError, etree.XSLT, style)
+
+ def test_xslt_copy(self):
+ tree = self.parse('<a><b>B</b><c>C</c></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="*" />
+ <xsl:template match="/">
+ <foo><xsl:value-of select="/a/b/text()" /></foo>
+ </xsl:template>
+</xsl:stylesheet>''')
+
+ transform = etree.XSLT(style)
+ res = transform(tree)
+ self.assertEqual('''\
+<?xml version="1.0"?>
+<foo>B</foo>
+''',
+ str(res))
+
+ transform_copy = copy.deepcopy(transform)
+ res = transform_copy(tree)
+ self.assertEqual('''\
+<?xml version="1.0"?>
+<foo>B</foo>
+''',
+ str(res))
+
+ transform = etree.XSLT(style)
+ res = transform(tree)
+ self.assertEqual('''\
+<?xml version="1.0"?>
+<foo>B</foo>
+''',
+ str(res))
+
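+ # Descriptive note for _xslt_setup below: it runs a small stylesheet with a
+ # configurable output encoding and yields a one-element list holding the
+ # transformation result; callers replace data[0] with the decoded text,
+ # which is compared against 'expected' when the context manager exits.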
+ @contextlib.contextmanager
+ def _xslt_setup(
+ self, encoding='UTF-16', expected_encoding=None,
+ expected='<?xml version="1.0" encoding="%(ENCODING)s"?><foo>\\uF8D2</foo>'):
+ tree = self.parse(_bytes('<a><b>\\uF8D2</b><c>\\uF8D2</c></a>'
+ ).decode("unicode_escape"))
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:output encoding="%(ENCODING)s"/>
+ <xsl:template match="/">
+ <foo><xsl:value-of select="/a/b/text()" /></foo>
+ </xsl:template>
+</xsl:stylesheet>''' % {'ENCODING': encoding})
+
+ st = etree.XSLT(style)
+ res = st(tree)
+ expected = _bytes(dedent(expected).strip()).decode("unicode_escape").replace('\n', '') % {
+ 'ENCODING': expected_encoding or encoding,
+ }
+
+ data = [res]
+ yield data
+ self.assertEqual(expected, data[0].replace('\n', ''))
+
+ def test_xslt_utf8(self):
+ with self._xslt_setup(encoding='UTF-8') as res:
+ res[0] = unicode(bytes(res[0]), 'UTF-8')
+ assert 'UTF-8' in res[0]
+
+ def test_xslt_encoding(self):
+ with self._xslt_setup() as res:
+ res[0] = unicode(bytes(res[0]), 'UTF-16')
+ assert 'UTF-16' in res[0]
+
+ def test_xslt_encoding_override(self):
+ with self._xslt_setup(encoding='UTF-8', expected_encoding='UTF-16') as res:
+ f = BytesIO()
+ res[0].write(f, encoding='UTF-16')
+ if is_python3:
+ output = str(f.getvalue(), 'UTF-16')
+ else:
+ output = unicode(str(f.getvalue()), 'UTF-16')
+ res[0] = output.replace("'", '"')
+
+ def test_xslt_write_output_bytesio(self):
+ with self._xslt_setup() as res:
+ f = BytesIO()
+ res[0].write_output(f)
+ res[0] = f.getvalue().decode('UTF-16')
+
+ def test_xslt_write_output_failure(self):
+ class Writer(object):
+ def write(self, data):
+ raise ValueError("FAILED!")
+
+ try:
+ with self._xslt_setup() as res:
+ res[0].write_output(Writer())
+ except ValueError as exc:
+ self.assertTrue("FAILED!" in str(exc), exc)
+ else:
+ self.assertTrue(False, "exception not raised")
+
+ def test_xslt_write_output_file(self):
+ with self._xslt_setup() as res:
+ f = NamedTemporaryFile(delete=False)
+ try:
+ try:
+ res[0].write_output(f)
+ finally:
+ f.close()
+ with io.open(f.name, encoding='UTF-16') as f:
+ res[0] = f.read()
+ finally:
+ os.unlink(f.name)
+
+ def test_xslt_write_output_file_path(self):
+ with self._xslt_setup() as res:
+ f = NamedTemporaryFile(delete=False)
+ try:
+ try:
+ res[0].write_output(f.name, compression=9)
+ finally:
+ f.close()
+ with gzip.GzipFile(f.name) as f:
+ res[0] = f.read().decode("UTF-16")
+ finally:
+ os.unlink(f.name)
+
+ def test_xslt_write_output_file_path_urlescaped(self):
+ # libxml2 should not unescape file paths.
+ with self._xslt_setup() as res:
+ f = NamedTemporaryFile(prefix='tmp%2e', suffix='.xml.gz', delete=False)
+ try:
+ try:
+ res[0].write_output(f.name, compression=3)
+ finally:
+ f.close()
+ with gzip.GzipFile(f.name) as f:
+ res[0] = f.read().decode("UTF-16")
+ finally:
+ os.unlink(f.name)
+
+ def test_xslt_write_output_file_path_urlescaped_plus(self):
+ with self._xslt_setup() as res:
+ f = NamedTemporaryFile(prefix='p+%2e', suffix='.xml.gz', delete=False)
+ try:
+ try:
+ res[0].write_output(f.name, compression=1)
+ finally:
+ f.close()
+ with gzip.GzipFile(f.name) as f:
+ res[0] = f.read().decode("UTF-16")
+ finally:
+ os.unlink(f.name)
+
+ def test_xslt_write_output_file_oserror(self):
+ with self._xslt_setup(expected='') as res:
+ tempdir = mkdtemp()
+ try:
+ res[0].write_output(os.path.join(tempdir, 'missing_subdir', 'out.xml'))
+ except IOError:
+ res[0] = ''
+ else:
+ self.fail("IOError not raised")
+ finally:
+ os.rmdir(tempdir)
+
+ def test_xslt_unicode(self):
+ expected = '''
+ <?xml version="1.0"?>
+ <foo>\\uF8D2</foo>
+ '''
+ with self._xslt_setup(expected=expected) as res:
+ res[0] = unicode(res[0])
+
+ def test_xslt_unicode_standalone(self):
+ tree = self.parse(_bytes('<a><b>\\uF8D2</b><c>\\uF8D2</c></a>'
+ ).decode("unicode_escape"))
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:output encoding="UTF-16" standalone="no"/>
+ <xsl:template match="/">
+ <foo><xsl:value-of select="/a/b/text()" /></foo>
+ </xsl:template>
+</xsl:stylesheet>''')
+
+ st = etree.XSLT(style)
+ res = st(tree)
+ expected = _bytes('''\
+<?xml version="1.0" standalone="no"?>
+<foo>\\uF8D2</foo>
+''').decode("unicode_escape")
+ self.assertEqual(expected,
+ unicode(res))
+
+ def test_xslt_input(self):
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="*" />
+ <xsl:template match="/">
+ <foo><xsl:value-of select="/a/b/text()" /></foo>
+ </xsl:template>
+</xsl:stylesheet>''')
+
+ st = etree.XSLT(style)
+ st = etree.XSLT(style.getroot())
+
+ def test_xslt_input_partial_doc(self):
+ style = self.parse('''\
+<otherroot>
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="*" />
+ <xsl:template match="/">
+ <foo><xsl:value-of select="/a/b/text()" /></foo>
+ </xsl:template>
+</xsl:stylesheet>
+</otherroot>''')
+
+ self.assertRaises(etree.XSLTParseError, etree.XSLT, style)
+ root_node = style.getroot()
+ self.assertRaises(etree.XSLTParseError, etree.XSLT, root_node)
+ st = etree.XSLT(root_node[0])
+
+ def test_xslt_broken(self):
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:foo />
+</xsl:stylesheet>''')
+ self.assertRaises(etree.XSLTParseError,
+ etree.XSLT, style)
+
+ def test_xslt_parsing_error_log(self):
+ tree = self.parse('<a/>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:foo />
+</xsl:stylesheet>''')
+ self.assertRaises(etree.XSLTParseError,
+ etree.XSLT, style)
+ exc = None
+ try:
+ etree.XSLT(style)
+ except etree.XSLTParseError as e:
+ exc = e
+ else:
+ self.assertFalse(True, "XSLT processing should have failed but didn't")
+ self.assertTrue(exc is not None)
+ self.assertTrue(len(exc.error_log))
+ for error in exc.error_log:
+ self.assertTrue(':ERROR:XSLT:' in str(error))
+
+ def test_xslt_apply_error_log(self):
+ tree = self.parse('<a/>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="a">
+ <xsl:copy>
+ <xsl:message terminate="yes">FAIL</xsl:message>
+ </xsl:copy>
+ </xsl:template>
+</xsl:stylesheet>''')
+ self.assertRaises(etree.XSLTApplyError,
+ etree.XSLT(style), tree)
+
+ transform = etree.XSLT(style)
+ exc = None
+ try:
+ transform(tree)
+ except etree.XSLTApplyError as e:
+ exc = e
+ else:
+ self.assertFalse(True, "XSLT processing should have failed but didn't")
+
+ self.assertTrue(exc is not None)
+ self.assertTrue(len(exc.error_log))
+ self.assertEqual(len(transform.error_log), len(exc.error_log))
+ for error in exc.error_log:
+ self.assertTrue(':ERROR:XSLT:' in str(error))
+ for error in transform.error_log:
+ self.assertTrue(':ERROR:XSLT:' in str(error))
+
+ def test_xslt_parameters(self):
+ tree = self.parse('<a><b>B</b><c>C</c></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="/">
+ <foo><xsl:value-of select="$bar" /></foo>
+ </xsl:template>
+</xsl:stylesheet>''')
+
+ st = etree.XSLT(style)
+ res = st(tree, bar="'Bar'")
+ self.assertEqual('''\
+<?xml version="1.0"?>
+<foo>Bar</foo>
+''',
+ str(res))
+
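+ # Note: stylesheet parameters are evaluated as XPath expressions, which is
+ # why the literal above is passed as "'Bar'" (quotes included); arbitrary
+ # text should go through etree.XSLT.strparam(), as the next test shows,
+ # while an unquoted expression like "/a/b/text()" selects nodes (see
+ # test_xslt_parameter_xpath below).
+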
+ def test_xslt_string_parameters(self):
+ tree = self.parse('<a><b>B</b><c>C</c></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="/">
+ <foo><xsl:value-of select="$bar" /></foo>
+ </xsl:template>
+</xsl:stylesheet>''')
+
+ st = etree.XSLT(style)
+ res = st(tree, bar=etree.XSLT.strparam('''it's me, "Bar"'''))
+ self.assertEqual('''\
+<?xml version="1.0"?>
+<foo>it's me, "Bar"</foo>
+''',
+ str(res))
+
+ def test_xslt_parameter_invalid(self):
+ tree = self.parse('<a><b>B</b><c>C</c></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:param name="bar"/>
+ <xsl:template match="/">
+ <foo><xsl:value-of select="$bar" /></foo>
+ </xsl:template>
+</xsl:stylesheet>''')
+
+ st = etree.XSLT(style)
+ res = self.assertRaises(etree.XSLTApplyError,
+ st, tree, bar="<test/>")
+ res = self.assertRaises(etree.XSLTApplyError,
+ st, tree, bar="....")
+
+ def test_xslt_parameter_missing(self):
+ # apply() without the required parameter leads to an XSLTApplyError
+ tree = self.parse('<a><b>B</b><c>C</c></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="/">
+ <foo><xsl:value-of select="$bar" /></foo>
+ </xsl:template>
+</xsl:stylesheet>''')
+
+ st = etree.XSLT(style)
+ # at least libxslt 1.1.28 produces this error; earlier versions (e.g. 1.1.18) might not ...
+ self.assertRaises(etree.XSLTApplyError, st.apply, tree)
+
+ def test_xslt_multiple_parameters(self):
+ tree = self.parse('<a><b>B</b><c>C</c></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="*" />
+ <xsl:template match="/">
+ <foo><xsl:value-of select="$bar" /></foo>
+ <foo><xsl:value-of select="$baz" /></foo>
+ </xsl:template>
+</xsl:stylesheet>''')
+
+ st = etree.XSLT(style)
+ res = st(tree, bar="'Bar'", baz="'Baz'")
+ self.assertEqual('''\
+<?xml version="1.0"?>
+<foo>Bar</foo><foo>Baz</foo>
+''',
+ str(res))
+
+ def test_xslt_parameter_xpath(self):
+ tree = self.parse('<a><b>B</b><c>C</c></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="*" />
+ <xsl:template match="/">
+ <foo><xsl:value-of select="$bar" /></foo>
+ </xsl:template>
+</xsl:stylesheet>''')
+
+ st = etree.XSLT(style)
+ res = st(tree, bar="/a/b/text()")
+ self.assertEqual('''\
+<?xml version="1.0"?>
+<foo>B</foo>
+''',
+ str(res))
+
+ def test_xslt_parameter_xpath_object(self):
+ tree = self.parse('<a><b>B</b><c>C</c></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="*" />
+ <xsl:template match="/">
+ <foo><xsl:value-of select="$bar" /></foo>
+ </xsl:template>
+</xsl:stylesheet>''')
+
+ st = etree.XSLT(style)
+ res = st(tree, bar=etree.XPath("/a/b/text()"))
+ self.assertEqual('''\
+<?xml version="1.0"?>
+<foo>B</foo>
+''',
+ str(res))
+
+ def test_xslt_default_parameters(self):
+ tree = self.parse('<a><b>B</b><c>C</c></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:param name="bar" select="'Default'" />
+ <xsl:template match="*" />
+ <xsl:template match="/">
+ <foo><xsl:value-of select="$bar" /></foo>
+ </xsl:template>
+</xsl:stylesheet>''')
+
+ st = etree.XSLT(style)
+ res = st(tree, bar="'Bar'")
+ self.assertEqual('''\
+<?xml version="1.0"?>
+<foo>Bar</foo>
+''',
+ str(res))
+ res = st(tree)
+ self.assertEqual('''\
+<?xml version="1.0"?>
+<foo>Default</foo>
+''',
+ str(res))
+
+ def test_xslt_html_output(self):
+ tree = self.parse('<a><b>B</b><c>C</c></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:output method="html"/>
+ <xsl:strip-space elements="*"/>
+ <xsl:template match="/">
+ <html><body><xsl:value-of select="/a/b/text()" /></body></html>
+ </xsl:template>
+</xsl:stylesheet>''')
+
+ st = etree.XSLT(style)
+ res = st(tree)
+ self.assertEqual('<html><body>B</body></html>',
+ str(res).strip())
+
+ def test_xslt_include(self):
+ tree = etree.parse(fileInTestDir('test1.xslt'))
+ st = etree.XSLT(tree)
+
+ def test_xslt_include_from_filelike(self):
+ f = open(fileInTestDir('test1.xslt'), 'rb')
+ tree = etree.parse(f)
+ f.close()
+ st = etree.XSLT(tree)
+
+ def test_xslt_multiple_transforms(self):
+ xml = '<a/>'
+ xslt = '''\
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
+ <xsl:template match="/">
+ <response>Some text</response>
+ </xsl:template>
+</xsl:stylesheet>
+'''
+ source = self.parse(xml)
+ styledoc = self.parse(xslt)
+ style = etree.XSLT(styledoc)
+ result = style(source)
+
+ etree.tostring(result.getroot())
+
+ source = self.parse(xml)
+ styledoc = self.parse(xslt)
+ style = etree.XSLT(styledoc)
+ result = style(source)
+
+ etree.tostring(result.getroot())
+
+ def test_xslt_repeat_transform(self):
+ xml = '<a/>'
+ xslt = '''\
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
+ <xsl:template match="/">
+ <response>Some text</response>
+ </xsl:template>
+</xsl:stylesheet>
+'''
+ source = self.parse(xml)
+ styledoc = self.parse(xslt)
+ transform = etree.XSLT(styledoc)
+ result = transform(source)
+ result = transform(source)
+ etree.tostring(result.getroot())
+ result = transform(source)
+ etree.tostring(result.getroot())
+ str(result)
+
+ result1 = transform(source)
+ result2 = transform(source)
+ self.assertEqual(str(result1), str(result2))
+ result = transform(source)
+ str(result)
+
+ def test_xslt_empty(self):
+ # could segfault if result contains "empty document"
+ xml = '<blah/>'
+ xslt = '''
+ <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
+ <xsl:template match="/" />
+ </xsl:stylesheet>
+ '''
+
+ source = self.parse(xml)
+ styledoc = self.parse(xslt)
+ style = etree.XSLT(styledoc)
+ result = style(source)
+ self.assertEqual('', str(result))
+
+ def test_xslt_message(self):
+ xml = '<blah/>'
+ xslt = '''
+ <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
+ <xsl:template match="/">
+ <xsl:message>TEST TEST TEST</xsl:message>
+ </xsl:template>
+ </xsl:stylesheet>
+ '''
+
+ source = self.parse(xml)
+ styledoc = self.parse(xslt)
+ style = etree.XSLT(styledoc)
+ result = style(source)
+ self.assertEqual('', str(result))
+ self.assertTrue("TEST TEST TEST" in [entry.message
+ for entry in style.error_log])
+
+ def test_xslt_message_terminate(self):
+ xml = '<blah/>'
+ xslt = '''
+ <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
+ <xsl:template match="/">
+ <xsl:message terminate="yes">TEST TEST TEST</xsl:message>
+ </xsl:template>
+ </xsl:stylesheet>
+ '''
+
+ source = self.parse(xml)
+ styledoc = self.parse(xslt)
+ style = etree.XSLT(styledoc)
+
+ self.assertRaises(etree.XSLTApplyError, style, source)
+ self.assertTrue("TEST TEST TEST" in [entry.message
+ for entry in style.error_log])
+
+ def test_xslt_shortcut(self):
+ tree = self.parse('<a><b>B</b><c>C</c></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="*" />
+ <xsl:template match="/">
+ <doc>
+ <foo><xsl:value-of select="$bar" /></foo>
+ <foo><xsl:value-of select="$baz" /></foo>
+ </doc>
+ </xsl:template>
+</xsl:stylesheet>''')
+
+ result = tree.xslt(style, bar="'Bar'", baz="'Baz'")
+ self.assertEqual(
+ _bytes('<doc><foo>Bar</foo><foo>Baz</foo></doc>'),
+ etree.tostring(result.getroot()))
+
+ def test_multiple_elementrees(self):
+ tree = self.parse('<a><b>B</b><c>C</c></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="a"><A><xsl:apply-templates/></A></xsl:template>
+ <xsl:template match="b"><B><xsl:apply-templates/></B></xsl:template>
+ <xsl:template match="c"><C><xsl:apply-templates/></C></xsl:template>
+</xsl:stylesheet>''')
+
+ self.assertEqual(self._rootstring(tree),
+ _bytes('<a><b>B</b><c>C</c></a>'))
+ result = tree.xslt(style)
+ self.assertEqual(self._rootstring(tree),
+ _bytes('<a><b>B</b><c>C</c></a>'))
+ self.assertEqual(self._rootstring(result),
+ _bytes('<A><B>B</B><C>C</C></A>'))
+
+ b_tree = etree.ElementTree(tree.getroot()[0])
+ self.assertEqual(self._rootstring(b_tree),
+ _bytes('<b>B</b>'))
+ result = b_tree.xslt(style)
+ self.assertEqual(self._rootstring(tree),
+ _bytes('<a><b>B</b><c>C</c></a>'))
+ self.assertEqual(self._rootstring(result),
+ _bytes('<B>B</B>'))
+
+ c_tree = etree.ElementTree(tree.getroot()[1])
+ self.assertEqual(self._rootstring(c_tree),
+ _bytes('<c>C</c>'))
+ result = c_tree.xslt(style)
+ self.assertEqual(self._rootstring(tree),
+ _bytes('<a><b>B</b><c>C</c></a>'))
+ self.assertEqual(self._rootstring(result),
+ _bytes('<C>C</C>'))
+
+ def test_xslt_document_XML(self):
+ # make sure document('') works from parsed strings
+ xslt = etree.XSLT(etree.XML("""\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="/">
+ <test>TEXT<xsl:copy-of select="document('')//test"/></test>
+ </xsl:template>
+</xsl:stylesheet>
+"""))
+ result = xslt(etree.XML('<a/>'))
+ root = result.getroot()
+ self.assertEqual(root.tag,
+ 'test')
+ self.assertEqual(root[0].tag,
+ 'test')
+ self.assertEqual(root[0].text,
+ 'TEXT')
+ self.assertEqual(root[0][0].tag,
+ '{http://www.w3.org/1999/XSL/Transform}copy-of')
+
+ def test_xslt_document_parse(self):
+ # make sure document('') works from loaded files
+ xslt = etree.XSLT(etree.parse(fileInTestDir("test-document.xslt")))
+ result = xslt(etree.XML('<a/>'))
+ root = result.getroot()
+ self.assertEqual(root.tag,
+ 'test')
+ self.assertEqual(root[0].tag,
+ '{http://www.w3.org/1999/XSL/Transform}stylesheet')
+
+ def test_xslt_document_elementtree(self):
+ # make sure document('') works from loaded files
+ xslt = etree.XSLT(etree.ElementTree(file=fileInTestDir("test-document.xslt")))
+ result = xslt(etree.XML('<a/>'))
+ root = result.getroot()
+ self.assertEqual(root.tag,
+ 'test')
+ self.assertEqual(root[0].tag,
+ '{http://www.w3.org/1999/XSL/Transform}stylesheet')
+
+ def test_xslt_document_error(self):
+ xslt = etree.XSLT(etree.XML("""\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="/">
+ <test>TEXT<xsl:copy-of select="document('uri:__junkfood__is__evil__')//test"/></test>
+ </xsl:template>
+</xsl:stylesheet>
+"""))
+
+ errors = None
+ try:
+ xslt(etree.XML('<a/>'))
+ except etree.XSLTApplyError as exc:
+ errors = exc.error_log
+ else:
+ self.assertFalse(True, "XSLT processing should have failed but didn't")
+
+ self.assertTrue(len(errors))
+ for error in errors:
+ if ':ERROR:XSLT:' in str(error):
+ break
+ else:
+ self.assertFalse(True, 'No XSLT errors found in error log:\n%s' % errors)
+
+ def test_xslt_document_XML_resolver(self):
+ # make sure document('') works when custom resolvers are in use
+ assertEqual = self.assertEqual
+ called = {'count' : 0}
+ class TestResolver(etree.Resolver):
+ def resolve(self, url, id, context):
+ assertEqual(url, 'file://ANYTHING')
+ called['count'] += 1
+ return self.resolve_string('<CALLED/>', context)
+
+ parser = etree.XMLParser()
+ parser.resolvers.add(TestResolver())
+
+ xslt = etree.XSLT(etree.XML(_bytes("""\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:l="local">
+ <xsl:template match="/">
+ <test>
+ <xsl:for-each select="document('')//l:data/l:entry">
+ <xsl:copy-of select="document('file://ANYTHING')"/>
+ <xsl:copy>
+ <xsl:attribute name="value">
+ <xsl:value-of select="."/>
+ </xsl:attribute>
+ </xsl:copy>
+ </xsl:for-each>
+ </test>
+ </xsl:template>
+ <l:data>
+ <l:entry>A</l:entry>
+ <l:entry>B</l:entry>
+ </l:data>
+</xsl:stylesheet>
+"""), parser))
+
+ self.assertEqual(called['count'], 0)
+ result = xslt(etree.XML('<a/>'))
+ self.assertEqual(called['count'], 1)
+
+ root = result.getroot()
+ self.assertEqual(root.tag,
+ 'test')
+ self.assertEqual(len(root), 4)
+
+ self.assertEqual(root[0].tag,
+ 'CALLED')
+ self.assertEqual(root[1].tag,
+ '{local}entry')
+ self.assertEqual(root[1].text,
+ None)
+ self.assertEqual(root[1].get("value"),
+ 'A')
+ self.assertEqual(root[2].tag,
+ 'CALLED')
+ self.assertEqual(root[3].tag,
+ '{local}entry')
+ self.assertEqual(root[3].text,
+ None)
+ self.assertEqual(root[3].get("value"),
+ 'B')
+
+ def test_xslt_resolver_url_building(self):
+ assertEqual = self.assertEqual
+ called = {'count' : 0}
+ expected_url = None
+ class TestResolver(etree.Resolver):
+ def resolve(self, url, id, context):
+ assertEqual(url, expected_url)
+ called['count'] += 1
+ return self.resolve_string('<CALLED/>', context)
+
+ stylesheet_xml = _bytes("""\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:l="local">
+ <xsl:template match="/">
+ <xsl:copy-of select="document('test.xml')"/>
+ </xsl:template>
+</xsl:stylesheet>
+""")
+
+ parser = etree.XMLParser()
+ parser.resolvers.add(TestResolver())
+
+ # test without base_url => relative path only
+ expected_url = 'test.xml'
+ xslt = etree.XSLT(etree.XML(stylesheet_xml, parser))
+
+ self.assertEqual(called['count'], 0)
+ result = xslt(etree.XML('<a/>'))
+ self.assertEqual(called['count'], 1)
+
+ # now the same thing with a stylesheet base URL on the filesystem
+ called['count'] = 0
+ expected_url = 'MY/BASE/test.xml' # seems to be the same on Windows
+ xslt = etree.XSLT(etree.XML(
+ stylesheet_xml, parser,
+ base_url=os.path.join('MY', 'BASE', 'FILE')))
+
+ self.assertEqual(called['count'], 0)
+ result = xslt(etree.XML('<a/>'))
+ self.assertEqual(called['count'], 1)
+
+ # now the same thing with a stylesheet base URL
+ called['count'] = 0
+ expected_url = 'http://server.com/BASE/DIR/test.xml'
+ xslt = etree.XSLT(etree.XML(
+ stylesheet_xml, parser,
+ base_url='http://server.com/BASE/DIR/FILE'))
+
+ self.assertEqual(called['count'], 0)
+ result = xslt(etree.XML('<a/>'))
+ self.assertEqual(called['count'], 1)
+
+ # now the same thing with a stylesheet base file:// URL
+ called['count'] = 0
+ expected_url = 'file://BASE/DIR/test.xml'
+ xslt = etree.XSLT(etree.XML(
+ stylesheet_xml, parser,
+ base_url='file://BASE/DIR/FILE'))
+
+ self.assertEqual(called['count'], 0)
+ result = xslt(etree.XML('<a/>'))
+ self.assertEqual(called['count'], 1)
+
+ def test_xslt_document_parse_allow(self):
+ access_control = etree.XSLTAccessControl(read_file=True)
+ xslt = etree.XSLT(etree.parse(fileInTestDir("test-document.xslt")),
+ access_control=access_control)
+ result = xslt(etree.XML('<a/>'))
+ root = result.getroot()
+ self.assertEqual(root.tag,
+ 'test')
+ self.assertEqual(root[0].tag,
+ '{http://www.w3.org/1999/XSL/Transform}stylesheet')
+
+ def test_xslt_document_parse_deny(self):
+ access_control = etree.XSLTAccessControl(read_file=False)
+ xslt = etree.XSLT(etree.parse(fileInTestDir("test-document.xslt")),
+ access_control=access_control)
+ self.assertRaises(etree.XSLTApplyError, xslt, etree.XML('<a/>'))
+
+ def test_xslt_document_parse_deny_all(self):
+ access_control = etree.XSLTAccessControl.DENY_ALL
+ xslt = etree.XSLT(etree.parse(fileInTestDir("test-document.xslt")),
+ access_control=access_control)
+ self.assertRaises(etree.XSLTApplyError, xslt, etree.XML('<a/>'))
+
+ def test_xslt_access_control_repr(self):
+ access_control = etree.XSLTAccessControl.DENY_ALL
+ self.assertTrue(repr(access_control).startswith(type(access_control).__name__))
+ self.assertEqual(repr(access_control), repr(access_control))
+ self.assertNotEqual(repr(etree.XSLTAccessControl.DENY_ALL),
+ repr(etree.XSLTAccessControl.DENY_WRITE))
+ self.assertNotEqual(repr(etree.XSLTAccessControl.DENY_ALL),
+ repr(etree.XSLTAccessControl()))
+
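+ # Illustrative sketch (not part of the original tests): besides the
+ # predefined DENY_ALL and DENY_WRITE profiles, access control can be
+ # configured per permission, e.g.:
+ #   ac = etree.XSLTAccessControl(read_file=True, read_network=False)
+ #   transform = etree.XSLT(style, access_control=ac)
+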
+ def test_xslt_move_result(self):
+ root = etree.XML(_bytes('''\
+ <transform>
+ <widget displayType="fieldset"/>
+ </transform>'''))
+
+ xslt = etree.XSLT(etree.XML(_bytes('''\
+ <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:output method="html" indent="no"/>
+ <xsl:template match="/">
+ <html>
+ <xsl:apply-templates/>
+ </html>
+ </xsl:template>
+
+ <xsl:template match="widget">
+ <xsl:element name="{@displayType}"/>
+ </xsl:template>
+
+ </xsl:stylesheet>''')))
+
+ result = xslt(root[0])
+ root[:] = result.getroot()[:]
+ del root # segfaulted before
+
+ def test_xslt_pi(self):
+ tree = self.parse('''\
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="%s"?>
+<a>
+ <b>B</b>
+ <c>C</c>
+</a>''' % fileInTestDir("test1.xslt"))
+
+ style_root = tree.getroot().getprevious().parseXSL().getroot()
+ self.assertEqual("{http://www.w3.org/1999/XSL/Transform}stylesheet",
+ style_root.tag)
+
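+ # Note: the xml-stylesheet processing instruction is a sibling of the root
+ # element, so getprevious() reaches it and parseXSL() loads the referenced
+ # stylesheet as a new ElementTree, as the surrounding PI tests show.
+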
+ def test_xslt_pi_embedded_xmlid(self):
+ # test xml:id dictionary lookup mechanism
+ tree = self.parse('''\
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="#style"?>
+<a>
+ <b>B</b>
+ <c>C</c>
+ <xsl:stylesheet version="1.0" xml:id="style"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="*" />
+ <xsl:template match="/">
+ <foo><xsl:value-of select="/a/b/text()" /></foo>
+ </xsl:template>
+ </xsl:stylesheet>
+</a>''')
+
+ style_root = tree.getroot().getprevious().parseXSL().getroot()
+ self.assertEqual("{http://www.w3.org/1999/XSL/Transform}stylesheet",
+ style_root.tag)
+
+ st = etree.XSLT(style_root)
+ res = st(tree)
+ self.assertEqual('''\
+<?xml version="1.0"?>
+<foo>B</foo>
+''',
+ str(res))
+
+ def test_xslt_pi_embedded_id(self):
+ # test XPath lookup mechanism
+ tree = self.parse('''\
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="#style"?>
+<a>
+ <b>B</b>
+ <c>C</c>
+</a>''')
+
+ style = self.parse('''\
+<xsl:stylesheet version="1.0" xml:id="style"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="*" />
+ <xsl:template match="/">
+ <foo><xsl:value-of select="/a/b/text()" /></foo>
+ </xsl:template>
+</xsl:stylesheet>
+''')
+
+ tree.getroot().append(style.getroot())
+
+ style_root = tree.getroot().getprevious().parseXSL().getroot()
+ self.assertEqual("{http://www.w3.org/1999/XSL/Transform}stylesheet",
+ style_root.tag)
+
+ st = etree.XSLT(style_root)
+ res = st(tree)
+ self.assertEqual('''\
+<?xml version="1.0"?>
+<foo>B</foo>
+''',
+ str(res))
+
+ def test_xslt_pi_get(self):
+ tree = self.parse('''\
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="TEST"?>
+<a>
+ <b>B</b>
+ <c>C</c>
+</a>''')
+
+ pi = tree.getroot().getprevious()
+ self.assertEqual("TEST", pi.get("href"))
+
+ def test_xslt_pi_get_all(self):
+ tree = self.parse('''\
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="TEST"?>
+<a>
+ <b>B</b>
+ <c>C</c>
+</a>''')
+
+ pi = tree.getroot().getprevious()
+ self.assertEqual("TEST", pi.get("href"))
+ self.assertEqual("text/xsl", pi.get("type"))
+ self.assertEqual(None, pi.get("motz"))
+
+ def test_xslt_pi_get_all_reversed(self):
+ tree = self.parse('''\
+<?xml version="1.0"?>
+<?xml-stylesheet href="TEST" type="text/xsl"?>
+<a>
+ <b>B</b>
+ <c>C</c>
+</a>''')
+
+ pi = tree.getroot().getprevious()
+ self.assertEqual("TEST", pi.get("href"))
+ self.assertEqual("text/xsl", pi.get("type"))
+ self.assertEqual(None, pi.get("motz"))
+
+ def test_xslt_pi_get_unknown(self):
+ tree = self.parse('''\
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="TEST"?>
+<a>
+ <b>B</b>
+ <c>C</c>
+</a>''')
+
+ pi = tree.getroot().getprevious()
+ self.assertEqual(None, pi.get("unknownattribute"))
+
+ def test_xslt_pi_set_replace(self):
+ tree = self.parse('''\
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="TEST"?>
+<a>
+ <b>B</b>
+ <c>C</c>
+</a>''')
+
+ pi = tree.getroot().getprevious()
+ self.assertEqual("TEST", pi.get("href"))
+
+ pi.set("href", "TEST123")
+ self.assertEqual("TEST123", pi.get("href"))
+
+ def test_xslt_pi_set_new(self):
+ tree = self.parse('''\
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl"?>
+<a>
+ <b>B</b>
+ <c>C</c>
+</a>''')
+
+ pi = tree.getroot().getprevious()
+ self.assertEqual(None, pi.get("href"))
+
+ pi.set("href", "TEST")
+ self.assertEqual("TEST", pi.get("href"))
+
+class ETreeEXSLTTestCase(HelperTestCase):
+ """EXSLT tests"""
+
+ def test_exslt_str(self):
+ tree = self.parse('<a><b>B</b><c>C</c></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:str="http://exslt.org/strings"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ exclude-result-prefixes="str xsl">
+ <xsl:template match="text()">
+ <xsl:value-of select="str:align(string(.), '***', 'center')" />
+ </xsl:template>
+ <xsl:template match="*">
+ <xsl:copy>
+ <xsl:apply-templates/>
+ </xsl:copy>
+ </xsl:template>
+</xsl:stylesheet>''')
+
+ st = etree.XSLT(style)
+ res = st(tree)
+ self.assertEqual('''\
+<?xml version="1.0"?>
+<a><b>*B*</b><c>*C*</c></a>
+''',
+ str(res))
+
+ def test_exslt_str_attribute_replace(self):
+ tree = self.parse('<a><b>B</b><c>C</c></a>')
+ style = self.parse('''\
+ <xsl:stylesheet version = "1.0"
+ xmlns:xsl='http://www.w3.org/1999/XSL/Transform'
+ xmlns:str="http://exslt.org/strings"
+ extension-element-prefixes="str">
+
+ <xsl:template match="/">
+ <h1 class="{str:replace('abc', 'b', 'x')}">test</h1>
+ </xsl:template>
+
+ </xsl:stylesheet>''')
+
+ st = etree.XSLT(style)
+ res = st(tree)
+ self.assertEqual(str(res), '''\
+<?xml version="1.0"?>
+<h1 class="axc">test</h1>
+''')
+
+ def test_exslt_math(self):
+ tree = self.parse('<a><b>B</b><c>C</c></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:math="http://exslt.org/math"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ exclude-result-prefixes="math xsl">
+ <xsl:template match="*">
+ <xsl:copy>
+ <xsl:attribute name="pi">
+ <xsl:value-of select="math:constant('PI', count(*)+2)"/>
+ </xsl:attribute>
+ <xsl:apply-templates/>
+ </xsl:copy>
+ </xsl:template>
+</xsl:stylesheet>''')
+
+ st = etree.XSLT(style)
+ res = st(tree)
+ self.assertEqual('''\
+<?xml version="1.0"?>
+<a pi="3.14"><b pi="3">B</b><c pi="3">C</c></a>
+''',
+ str(res))
+
+ def test_exslt_regexp_test(self):
+ xslt = etree.XSLT(etree.XML(_bytes("""\
+<xsl:stylesheet version="1.0"
+ xmlns:regexp="http://exslt.org/regular-expressions"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="*">
+ <test><xsl:copy-of select="*[regexp:test(string(.), '8.')]"/></test>
+ </xsl:template>
+</xsl:stylesheet>
+""")))
+ result = xslt(etree.XML(_bytes('<a><b>123</b><b>098</b><b>987</b></a>')))
+ root = result.getroot()
+ self.assertEqual(root.tag,
+ 'test')
+ self.assertEqual(len(root), 1)
+ self.assertEqual(root[0].tag,
+ 'b')
+ self.assertEqual(root[0].text,
+ '987')
+
+ def test_exslt_regexp_replace(self):
+ xslt = etree.XSLT(etree.XML("""\
+<xsl:stylesheet version="1.0"
+ xmlns:regexp="http://exslt.org/regular-expressions"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="*">
+ <test>
+ <xsl:copy-of select="regexp:replace(string(.), 'd.', '', 'XX')"/>
+ <xsl:text>-</xsl:text>
+ <xsl:copy-of select="regexp:replace(string(.), 'd.', 'gi', 'XX')"/>
+ </test>
+ </xsl:template>
+</xsl:stylesheet>
+"""))
+ result = xslt(etree.XML(_bytes('<a>abdCdEeDed</a>')))
+ root = result.getroot()
+ self.assertEqual(root.tag,
+ 'test')
+ self.assertEqual(len(root), 0)
+ self.assertEqual(root.text, 'abXXdEeDed-abXXXXeXXd')
+
+ def test_exslt_regexp_match(self):
+ xslt = etree.XSLT(etree.XML("""\
+<xsl:stylesheet version="1.0"
+ xmlns:regexp="http://exslt.org/regular-expressions"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="*">
+ <test>
+ <test1><xsl:copy-of select="regexp:match(string(.), 'd.')"/></test1>
+ <test2><xsl:copy-of select="regexp:match(string(.), 'd.', 'g')"/></test2>
+ <test2i><xsl:copy-of select="regexp:match(string(.), 'd.', 'gi')"/></test2i>
+ </test>
+ </xsl:template>
+</xsl:stylesheet>
+"""))
+ result = xslt(etree.XML(_bytes('<a>abdCdEeDed</a>')))
+ root = result.getroot()
+ self.assertEqual(root.tag, 'test')
+ self.assertEqual(len(root), 3)
+
+ self.assertEqual(len(root[0]), 1)
+ self.assertEqual(root[0][0].tag, 'match')
+ self.assertEqual(root[0][0].text, 'dC')
+
+ self.assertEqual(len(root[1]), 2)
+ self.assertEqual(root[1][0].tag, 'match')
+ self.assertEqual(root[1][0].text, 'dC')
+ self.assertEqual(root[1][1].tag, 'match')
+ self.assertEqual(root[1][1].text, 'dE')
+
+ self.assertEqual(len(root[2]), 3)
+ self.assertEqual(root[2][0].tag, 'match')
+ self.assertEqual(root[2][0].text, 'dC')
+ self.assertEqual(root[2][1].tag, 'match')
+ self.assertEqual(root[2][1].text, 'dE')
+ self.assertEqual(root[2][2].tag, 'match')
+ self.assertEqual(root[2][2].text, 'De')
+
+ def test_exslt_regexp_match_groups(self):
+ xslt = etree.XSLT(etree.XML(_bytes("""\
+<xsl:stylesheet version="1.0"
+ xmlns:regexp="http://exslt.org/regular-expressions"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="/">
+ <test>
+ <xsl:for-each select="regexp:match(
+ '123abc567', '([0-9]+)([a-z]+)([0-9]+)' )">
+ <test1><xsl:value-of select="."/></test1>
+ </xsl:for-each>
+ </test>
+ </xsl:template>
+</xsl:stylesheet>
+""")))
+ result = xslt(etree.XML(_bytes('<a/>')))
+ root = result.getroot()
+ self.assertEqual(root.tag, 'test')
+ self.assertEqual(len(root), 4)
+
+ self.assertEqual(root[0].text, "123abc567")
+ self.assertEqual(root[1].text, "123")
+ self.assertEqual(root[2].text, "abc")
+ self.assertEqual(root[3].text, "567")
+
+ def test_exslt_regexp_match1(self):
+ # taken from http://www.exslt.org/regexp/functions/match/index.html
+ xslt = etree.XSLT(etree.XML(_bytes("""\
+<xsl:stylesheet version="1.0"
+ xmlns:regexp="http://exslt.org/regular-expressions"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="/">
+ <test>
+ <xsl:for-each select="regexp:match(
+ 'http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml',
+ '(\\w+):\\/\\/([^/:]+)(:\\d*)?([^# ]*)')">
+ <test1><xsl:value-of select="."/></test1>
+ </xsl:for-each>
+ </test>
+ </xsl:template>
+</xsl:stylesheet>
+""")))
+ result = xslt(etree.XML(_bytes('<a/>')))
+ root = result.getroot()
+ self.assertEqual(root.tag, 'test')
+ self.assertEqual(len(root), 5)
+
+ self.assertEqual(
+ root[0].text,
+ "http://www.bayes.co.uk/xml/index.xml?/xml/utils/rechecker.xml")
+ self.assertEqual(
+ root[1].text,
+ "http")
+ self.assertEqual(
+ root[2].text,
+ "www.bayes.co.uk")
+ self.assertFalse(root[3].text)
+ self.assertEqual(
+ root[4].text,
+ "/xml/index.xml?/xml/utils/rechecker.xml")
+
+ def test_exslt_regexp_match2(self):
+ # taken from http://www.exslt.org/regexp/functions/match/index.html
+ xslt = etree.XSLT(self.parse("""\
+<xsl:stylesheet version="1.0"
+ xmlns:regexp="http://exslt.org/regular-expressions"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="/">
+ <test>
+ <xsl:for-each select="regexp:match(
+ 'This is a test string', '(\\w+)', 'g')">
+ <test1><xsl:value-of select="."/></test1>
+ </xsl:for-each>
+ </test>
+ </xsl:template>
+</xsl:stylesheet>
+"""))
+ result = xslt(etree.XML(_bytes('<a/>')))
+ root = result.getroot()
+ self.assertEqual(root.tag, 'test')
+ self.assertEqual(len(root), 5)
+
+ self.assertEqual(root[0].text, "This")
+ self.assertEqual(root[1].text, "is")
+ self.assertEqual(root[2].text, "a")
+ self.assertEqual(root[3].text, "test")
+ self.assertEqual(root[4].text, "string")
+
+ def _test_exslt_regexp_match3(self):
+ # taken from http://www.exslt.org/regexp/functions/match/index.html
+ # THIS IS NOT SUPPORTED!
+ xslt = etree.XSLT(etree.XML(_bytes("""\
+<xsl:stylesheet version="1.0"
+ xmlns:regexp="http://exslt.org/regular-expressions"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="/">
+ <test>
+ <xsl:for-each select="regexp:match(
+ 'This is a test string', '([a-z])+ ', 'g')">
+ <test1><xsl:value-of select="."/></test1>
+ </xsl:for-each>
+ </test>
+ </xsl:template>
+</xsl:stylesheet>
+""")))
+ result = xslt(etree.XML(_bytes('<a/>')))
+ root = result.getroot()
+ self.assertEqual(root.tag, 'test')
+ self.assertEqual(len(root), 4)
+
+ self.assertEqual(root[0].text, "his")
+ self.assertEqual(root[1].text, "is")
+ self.assertEqual(root[2].text, "a")
+ self.assertEqual(root[3].text, "test")
+
+ def _test_exslt_regexp_match4(self):
+ # taken from http://www.exslt.org/regexp/functions/match/index.html
+ # THIS IS NOT SUPPORTED!
+ xslt = etree.XSLT(etree.XML(_bytes("""\
+<xsl:stylesheet version="1.0"
+ xmlns:regexp="http://exslt.org/regular-expressions"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="/">
+ <test>
+ <xsl:for-each select="regexp:match(
+ 'This is a test string', '([a-z])+ ', 'gi')">
+ <test1><xsl:value-of select="."/></test1>
+ </xsl:for-each>
+ </test>
+ </xsl:template>
+</xsl:stylesheet>
+""")))
+ result = xslt(etree.XML(_bytes('<a/>')))
+ root = result.getroot()
+ self.assertEqual(root.tag, 'test')
+ self.assertEqual(len(root), 4)
+
+ self.assertEqual(root[0].text, "This")
+ self.assertEqual(root[1].text, "is")
+ self.assertEqual(root[2].text, "a")
+ self.assertEqual(root[3].text, "test")
+
+
+class ETreeXSLTExtFuncTestCase(HelperTestCase):
+ """Tests for XPath extension functions in XSLT."""
+
+ def test_extensions1(self):
+ tree = self.parse('<a><b>B</b></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:myns="testns"
+ exclude-result-prefixes="myns">
+ <xsl:template match="a"><A><xsl:value-of select="myns:mytext(b)"/></A></xsl:template>
+</xsl:stylesheet>''')
+
+ def mytext(ctxt, values):
+ return 'X' * len(values)
+
+ result = tree.xslt(style, {('testns', 'mytext') : mytext})
+ self.assertEqual(self._rootstring(result),
+ _bytes('<A>X</A>'))
+
+ def test_extensions2(self):
+ tree = self.parse('<a><b>B</b></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:myns="testns"
+ exclude-result-prefixes="myns">
+ <xsl:template match="a"><A><xsl:value-of select="myns:mytext(b)"/></A></xsl:template>
+</xsl:stylesheet>''')
+
+ def mytext(ctxt, values):
+ return 'X' * len(values)
+
+ namespace = etree.FunctionNamespace('testns')
+ namespace['mytext'] = mytext
+
+ result = tree.xslt(style)
+ self.assertEqual(self._rootstring(result),
+ _bytes('<A>X</A>'))
+
+ def test_variable_result_tree_fragment(self):
+ tree = self.parse('<a><b>B</b><b/></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:myns="testns"
+ exclude-result-prefixes="myns">
+ <xsl:template match="a">
+ <xsl:variable name="content">
+ <xsl:apply-templates/>
+ </xsl:variable>
+ <A><xsl:value-of select="myns:mytext($content)"/></A>
+ </xsl:template>
+ <xsl:template match="b"><xsl:copy>BBB</xsl:copy></xsl:template>
+</xsl:stylesheet>''')
+
+ def mytext(ctxt, values):
+ for value in values:
+ self.assertTrue(hasattr(value, 'tag'),
+ "%s is not an Element" % type(value))
+ self.assertEqual(value.tag, 'b')
+ self.assertEqual(value.text, 'BBB')
+ return 'X'.join([el.tag for el in values])
+
+ namespace = etree.FunctionNamespace('testns')
+ namespace['mytext'] = mytext
+
+ result = tree.xslt(style)
+ self.assertEqual(self._rootstring(result),
+ _bytes('<A>bXb</A>'))
+
+ def test_xpath_on_context_node(self):
+ tree = self.parse('<a><b>B<c/>C</b><b/></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:myns="testns"
+ exclude-result-prefixes="myns">
+ <xsl:template match="b">
+ <A><xsl:value-of select="myns:myext()"/></A>
+ </xsl:template>
+</xsl:stylesheet>''')
+
+ def extfunc(ctxt):
+ text_content = ctxt.context_node.xpath('text()')
+ return 'x'.join(text_content)
+
+ namespace = etree.FunctionNamespace('testns')
+ namespace['myext'] = extfunc
+
+ result = tree.xslt(style)
+ self.assertEqual(self._rootstring(result),
+ _bytes('<A>BxC</A>'))
+
+ def test_xpath_on_foreign_context_node(self):
+ # LP ticket 1354652
+ class Resolver(etree.Resolver):
+ def resolve(self, system_url, public_id, context):
+ assert system_url == 'extdoc.xml'
+ return self.resolve_string(b'<a><b>B<c/>C</b><b/></a>', context)
+
+ parser = etree.XMLParser()
+ parser.resolvers.add(Resolver())
+
+ tree = self.parse(b'<a><b/><b/></a>')
+ transform = etree.XSLT(self.parse(b'''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:mypre="testns"
+ exclude-result-prefixes="mypre">
+ <xsl:template match="b">
+ <B><xsl:value-of select="mypre:myext()"/></B>
+ </xsl:template>
+ <xsl:template match="a">
+ <A><xsl:apply-templates select="document('extdoc.xml')//b" /></A>
+ </xsl:template>
+</xsl:stylesheet>''', parser=parser))
+
+ def extfunc(ctxt):
+ text_content = ctxt.context_node.xpath('text()')
+ return 'x'.join(text_content)
+
+ namespace = etree.FunctionNamespace('testns')
+ namespace['myext'] = extfunc
+
+ result = transform(tree)
+ self.assertEqual(self._rootstring(result),
+ _bytes('<A><B>BxC</B><B/></A>'))
+
+
+class ETreeXSLTExtElementTestCase(HelperTestCase):
+ """Tests for extension elements in XSLT."""
+
+ def test_extension_element(self):
+ tree = self.parse('<a><b>B</b></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:myns="testns"
+ extension-element-prefixes="myns"
+ exclude-result-prefixes="myns">
+ <xsl:template match="a">
+ <A><myns:myext>b</myns:myext></A>
+ </xsl:template>
+</xsl:stylesheet>''')
+
+ class MyExt(etree.XSLTExtension):
+ def execute(self, context, self_node, input_node, output_parent):
+ child = etree.Element(self_node.text)
+ child.text = 'X'
+ output_parent.append(child)
+
+ extensions = { ('testns', 'myext') : MyExt() }
+
+ result = tree.xslt(style, extensions=extensions)
+ self.assertEqual(self._rootstring(result),
+ _bytes('<A><b>X</b></A>'))
+
+ def test_extension_element_doc_context(self):
+ tree = self.parse('<a><b>B</b></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:myns="testns"
+ extension-element-prefixes="myns"
+ exclude-result-prefixes="myns">
+ <xsl:template match="/">
+ <A><myns:myext>b</myns:myext></A>
+ </xsl:template>
+</xsl:stylesheet>''')
+
+ tags = []
+
+ class MyExt(etree.XSLTExtension):
+ def execute(self, context, self_node, input_node, output_parent):
+ tags.append(input_node.tag)
+
+ extensions = { ('testns', 'myext') : MyExt() }
+
+ result = tree.xslt(style, extensions=extensions)
+ self.assertEqual(tags, ['a'])
+
+ def test_extension_element_comment_pi_context(self):
+ tree = self.parse('<?test toast?><a><!--a comment--><?another pi?></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:myns="testns"
+ extension-element-prefixes="myns"
+ exclude-result-prefixes="myns">
+ <xsl:template match="/">
+ <ROOT><xsl:apply-templates /></ROOT>
+ </xsl:template>
+ <xsl:template match="comment()">
+ <A><myns:myext>b</myns:myext></A>
+ </xsl:template>
+ <xsl:template match="processing-instruction()">
+ <A><myns:myext>b</myns:myext></A>
+ </xsl:template>
+</xsl:stylesheet>''')
+
+ text = []
+
+ class MyExt(etree.XSLTExtension):
+ def execute(self, context, self_node, input_node, output_parent):
+ text.append(input_node.text)
+
+ extensions = { ('testns', 'myext') : MyExt() }
+
+ result = tree.xslt(style, extensions=extensions)
+ self.assertEqual(text, ['toast', 'a comment', 'pi'])
+
+ def _test_extension_element_attribute_context(self):
+ # currently not supported
+ tree = self.parse('<a test="A"><b attr="B"/></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:myns="testns"
+ extension-element-prefixes="myns"
+ exclude-result-prefixes="myns">
+ <xsl:template match="@test">
+ <A><myns:myext>b</myns:myext></A>
+ </xsl:template>
+ <xsl:template match="@attr">
+ <A><myns:myext>b</myns:myext></A>
+ </xsl:template>
+</xsl:stylesheet>''')
+
+ text = []
+
+ class MyExt(etree.XSLTExtension):
+ def execute(self, context, self_node, attr_value, output_parent):
+ text.append(attr_value)
+
+ extensions = { ('testns', 'myext') : MyExt() }
+
+ result = tree.xslt(style, extensions=extensions)
+ self.assertEqual(text, ['A', 'B'])
+
+ def test_extension_element_content(self):
+ tree = self.parse('<a><b>B</b></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:myns="testns"
+ extension-element-prefixes="myns">
+ <xsl:template match="a">
+ <A><myns:myext><x>X</x><y>Y</y><z/></myns:myext></A>
+ </xsl:template>
+</xsl:stylesheet>''')
+
+ class MyExt(etree.XSLTExtension):
+ def execute(self, context, self_node, input_node, output_parent):
+ output_parent.extend(list(self_node)[1:])
+
+ extensions = { ('testns', 'myext') : MyExt() }
+
+ result = tree.xslt(style, extensions=extensions)
+ self.assertEqual(self._rootstring(result),
+ _bytes('<A><y>Y</y><z/></A>'))
+
+ def test_extension_element_apply_templates(self):
+ tree = self.parse('<a><b>B</b></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:myns="testns"
+ extension-element-prefixes="myns">
+ <xsl:template match="a">
+ <A><myns:myext><x>X</x><y>Y</y><z/></myns:myext></A>
+ </xsl:template>
+ <xsl:template match="x" />
+ <xsl:template match="z">XYZ</xsl:template>
+</xsl:stylesheet>''')
+
+ class MyExt(etree.XSLTExtension):
+ def execute(self, context, self_node, input_node, output_parent):
+ for child in self_node:
+ for result in self.apply_templates(context, child):
+ if isinstance(result, basestring):
+ el = etree.Element("T")
+ el.text = result
+ else:
+ el = result
+ output_parent.append(el)
+
+ extensions = { ('testns', 'myext') : MyExt() }
+
+ result = tree.xslt(style, extensions=extensions)
+ self.assertEqual(self._rootstring(result),
+ _bytes('<A><T>Y</T><T>XYZ</T></A>'))
+
+ def test_extension_element_apply_templates_elements_only(self):
+ tree = self.parse('<a><b>B</b></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:myns="testns"
+ extension-element-prefixes="myns">
+ <xsl:template match="a">
+ <A><myns:myext><x>X</x><y>Y</y><z/></myns:myext></A>
+ </xsl:template>
+ <xsl:template match="x"><X/></xsl:template>
+ <xsl:template match="z">XYZ</xsl:template>
+</xsl:stylesheet>''')
+
+ class MyExt(etree.XSLTExtension):
+ def execute(self, context, self_node, input_node, output_parent):
+ for child in self_node:
+ for result in self.apply_templates(context, child,
+ elements_only=True):
+ assert not isinstance(result, basestring)
+ output_parent.append(result)
+
+ extensions = { ('testns', 'myext') : MyExt() }
+
+ result = tree.xslt(style, extensions=extensions)
+ self.assertEqual(self._rootstring(result),
+ _bytes('<A><X/></A>'))
+
+ def test_extension_element_apply_templates_remove_blank_text(self):
+ tree = self.parse('<a><b>B</b></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:myns="testns"
+ extension-element-prefixes="myns">
+ <xsl:template match="a">
+ <A><myns:myext><x>X</x><y>Y</y><z/></myns:myext></A>
+ </xsl:template>
+ <xsl:template match="x"><X/></xsl:template>
+ <xsl:template match="y"><xsl:text> </xsl:text></xsl:template>
+ <xsl:template match="z">XYZ</xsl:template>
+</xsl:stylesheet>''')
+
+ class MyExt(etree.XSLTExtension):
+ def execute(self, context, self_node, input_node, output_parent):
+ for child in self_node:
+ for result in self.apply_templates(context, child,
+ remove_blank_text=True):
+ if isinstance(result, basestring):
+ assert result.strip()
+ el = etree.Element("T")
+ el.text = result
+ else:
+ el = result
+ output_parent.append(el)
+
+ extensions = { ('testns', 'myext') : MyExt() }
+
+ result = tree.xslt(style, extensions=extensions)
+ self.assertEqual(self._rootstring(result),
+ _bytes('<A><X/><T>XYZ</T></A>'))
+
+ def test_extension_element_apply_templates_target_node(self):
+ tree = self.parse('<a><b>B</b></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:myns="testns"
+ extension-element-prefixes="myns">
+ <xsl:template match="a">
+ <A><myns:myext><x>X</x><y>Y</y><z/></myns:myext></A>
+ </xsl:template>
+ <xsl:template match="x" />
+ <xsl:template match="z">XYZ</xsl:template>
+</xsl:stylesheet>''')
+
+ class MyExt(etree.XSLTExtension):
+ def execute(self, context, self_node, input_node, output_parent):
+ for child in self_node:
+ self.apply_templates(context, child, output_parent)
+
+ extensions = { ('testns', 'myext') : MyExt() }
+
+ result = tree.xslt(style, extensions=extensions)
+ self.assertEqual(self._rootstring(result),
+ _bytes('<A>YXYZ</A>'))
+
+ def test_extension_element_apply_templates_target_node_doc(self):
+ tree = self.parse('<a><b>B</b></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:myns="testns"
+ extension-element-prefixes="myns">
+ <xsl:template match="a">
+ <myns:myext><x>X</x><y>Y</y><z/></myns:myext>
+ </xsl:template>
+ <xsl:template match="x"><xsl:processing-instruction name="test">TEST</xsl:processing-instruction></xsl:template>
+ <xsl:template match="y"><Y>XYZ</Y></xsl:template>
+ <xsl:template match="z"><xsl:comment>TEST</xsl:comment></xsl:template>
+</xsl:stylesheet>''')
+
+ class MyExt(etree.XSLTExtension):
+ def execute(self, context, self_node, input_node, output_parent):
+ for child in self_node:
+ self.apply_templates(context, child, output_parent)
+
+ extensions = { ('testns', 'myext') : MyExt() }
+
+ result = tree.xslt(style, extensions=extensions)
+ self.assertEqual(etree.tostring(result),
+ _bytes('<?test TEST?><Y>XYZ</Y><!--TEST-->'))
+
+ def test_extension_element_process_children(self):
+ tree = self.parse('<a><b>E</b></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:myns="testns"
+ extension-element-prefixes="myns">
+ <xsl:template match="a">
+ <xsl:variable name="testvar">yo</xsl:variable>
+ <A>
+ <myns:myext>
+ <xsl:attribute name="attr">
+ <xsl:value-of select="$testvar" />
+ </xsl:attribute>
+ <B>
+ <xsl:choose>
+ <xsl:when test="1 = 2"><C/></xsl:when>
+ <xsl:otherwise><D><xsl:value-of select="b/text()" /></D></xsl:otherwise>
+ </xsl:choose>
+ </B>
+ </myns:myext>
+ </A>
+ </xsl:template>
+</xsl:stylesheet>''')
+
+ class MyExt(etree.XSLTExtension):
+ def execute(self, context, self_node, input_node, output_parent):
+ el = etree.Element('MY')
+ self.process_children(context, el)
+ output_parent.append(el)
+
+ extensions = { ('testns', 'myext') : MyExt() }
+
+ result = tree.xslt(style, extensions=extensions)
+ self.assertEqual(self._rootstring(result),
+ _bytes('<A><MYattr="yo"><B><D>E</D></B></MY></A>'))
+
+ def test_extension_element_process_children_to_append_only(self):
+ tree = self.parse('<a/>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:myns="testns"
+ extension-element-prefixes="myns">
+ <xsl:template match="a">
+ <myns:myext>
+ <A/>
+ </myns:myext>
+ </xsl:template>
+</xsl:stylesheet>''')
+
+ class MyExt(etree.XSLTExtension):
+ def execute(self, context, self_node, input_node, output_parent):
+ self.process_children(context, output_parent)
+
+ extensions = { ('testns', 'myext') : MyExt() }
+
+ result = tree.xslt(style, extensions=extensions)
+ self.assertEqual(self._rootstring(result),
+ _bytes('<A/>'))
+
+ def test_extension_element_process_children_to_read_only_raise(self):
+ tree = self.parse('<a/>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:myns="testns"
+ extension-element-prefixes="myns">
+ <xsl:template match="a">
+ <myns:myext>
+ <A/>
+ </myns:myext>
+ </xsl:template>
+</xsl:stylesheet>''')
+
+ class MyExt(etree.XSLTExtension):
+ def execute(self, context, self_node, input_node, output_parent):
+ self.process_children(context, self_node)
+
+ extensions = { ('testns', 'myext') : MyExt() }
+
+ self.assertRaises(TypeError, tree.xslt, style, extensions=extensions)
+
+ def test_extension_element_process_children_with_subextension_element(self):
+ tree = self.parse('<a/>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:myns="testns"
+ extension-element-prefixes="myns">
+ <xsl:template match="a">
+ <myns:myext>
+ <A><myns:myext><B/></myns:myext></A>
+ </myns:myext>
+ </xsl:template>
+</xsl:stylesheet>''')
+
+ class MyExt(etree.XSLTExtension):
+ callback_call_counter = 0
+ def execute(self, context, self_node, input_node, output_parent):
+ self.callback_call_counter += 1
+ el = etree.Element('MY', n=str(self.callback_call_counter))
+ self.process_children(context, el)
+ output_parent.append(el)
+
+ extensions = { ('testns', 'myext') : MyExt() }
+
+ result = tree.xslt(style, extensions=extensions)
+ self.assertEqual(self._rootstring(result),
+ _bytes('<MYn="1"><A><MYn="2"><B/></MY></A></MY>'))
+
+ def test_extension_element_raise(self):
+ tree = self.parse('<a><b>B</b></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+ xmlns:myns="testns"
+ extension-element-prefixes="myns"
+ exclude-result-prefixes="myns">
+ <xsl:template match="a">
+ <A><myns:myext>b</myns:myext></A>
+ </xsl:template>
+</xsl:stylesheet>''')
+
+ class MyError(Exception):
+ pass
+
+ class MyExt(etree.XSLTExtension):
+ def execute(self, context, self_node, input_node, output_parent):
+ raise MyError("expected!")
+
+ extensions = { ('testns', 'myext') : MyExt() }
+ self.assertRaises(MyError, tree.xslt, style, extensions=extensions)
+
+ # FIXME: DISABLED - implementation seems to be broken
+    # if someone cares enough about this feature, I'll accept pull requests that fix it.
+ def _test_multiple_extension_elements_with_output_parent(self):
+ tree = self.parse("""\
+<text>
+ <par>This is <format>arbitrary</format> text in a paragraph</par>
+</text>""")
+ style = self.parse("""\
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:my="my" extension-element-prefixes="my" version="1.0">
+ <xsl:template match="par">
+ <my:par><xsl:apply-templates /></my:par>
+ </xsl:template>
+ <xsl:template match="format">
+ <my:format><xsl:apply-templates /></my:format>
+ </xsl:template>
+</xsl:stylesheet>
+""")
+ test = self
+ calls = []
+
+ class ExtMyPar(etree.XSLTExtension):
+ def execute(self, context, self_node, input_node, output_parent):
+ calls.append('par')
+ p = etree.Element("p")
+ p.attrib["style"] = "color:red"
+ self.process_children(context, p)
+ output_parent.append(p)
+
+ class ExtMyFormat(etree.XSLTExtension):
+ def execute(self, context, self_node, input_node, output_parent):
+ calls.append('format')
+ content = self.process_children(context)
+ test.assertEqual(1, len(content))
+ test.assertEqual('arbitrary', content[0])
+ test.assertEqual('This is ', output_parent.text)
+ output_parent.text += '*-%s-*' % content[0]
+
+ extensions = {("my", "par"): ExtMyPar(), ("my", "format"): ExtMyFormat()}
+ transform = etree.XSLT(style, extensions=extensions)
+ result = transform(tree)
+ self.assertEqual(['par', 'format'], calls)
+ self.assertEqual(
+ b'<p style="color:red">This is *-arbitrary-* text in a paragraph</p>\n',
+ etree.tostring(result))
+
+ def test_extensions_nsmap(self):
+ tree = self.parse("""\
+<root>
+ <inner xmlns:sha256="http://www.w3.org/2001/04/xmlenc#sha256">
+ <data>test</data>
+ </inner>
+</root>
+""")
+ style = self.parse("""\
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:my="extns" extension-element-prefixes="my" version="1.0">
+ <xsl:template match="node()|@*">
+ <xsl:copy>
+ <xsl:apply-templates select="node()|@*"/>
+ </xsl:copy>
+ </xsl:template>
+
+ <xsl:template match="data">
+ <my:show-nsmap/>
+ </xsl:template>
+</xsl:stylesheet>
+""")
+ class MyExt(etree.XSLTExtension):
+ def execute(self, context, self_node, input_node, output_parent):
+ output_parent.text = str(input_node.nsmap)
+
+ extensions = {('extns', 'show-nsmap'): MyExt()}
+
+ result = tree.xslt(style, extensions=extensions)
+ self.assertEqual(etree.tostring(result, pretty_print=True), b"""\
+<root>
+ <inner xmlns:sha256="http://www.w3.org/2001/04/xmlenc#sha256">{'sha256': 'http://www.w3.org/2001/04/xmlenc#sha256'}
+ </inner>
+</root>
+""")
+
+
+
+class Py3XSLTTestCase(HelperTestCase):
+ """XSLT tests for etree under Python 3"""
+
+ pytestmark = skipif('sys.version_info < (3,0)')
+
+ def test_xslt_result_bytes(self):
+ tree = self.parse('<a><b>B</b><c>C</c></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="*" />
+ <xsl:template match="/">
+ <foo><xsl:value-of select="/a/b/text()" /></foo>
+ </xsl:template>
+</xsl:stylesheet>''')
+
+ st = etree.XSLT(style)
+ res = st(tree)
+ self.assertEqual(_bytes('''\
+<?xml version="1.0"?>
+<foo>B</foo>
+'''),
+ bytes(res))
+
+ def test_xslt_result_bytearray(self):
+ tree = self.parse('<a><b>B</b><c>C</c></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="*" />
+ <xsl:template match="/">
+ <foo><xsl:value-of select="/a/b/text()" /></foo>
+ </xsl:template>
+</xsl:stylesheet>''')
+
+ st = etree.XSLT(style)
+ res = st(tree)
+ self.assertEqual(_bytes('''\
+<?xml version="1.0"?>
+<foo>B</foo>
+'''),
+ bytearray(res))
+
+ def test_xslt_result_memoryview(self):
+ tree = self.parse('<a><b>B</b><c>C</c></a>')
+ style = self.parse('''\
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <xsl:template match="*" />
+ <xsl:template match="/">
+ <foo><xsl:value-of select="/a/b/text()" /></foo>
+ </xsl:template>
+</xsl:stylesheet>''')
+
+ st = etree.XSLT(style)
+ res = st(tree)
+ self.assertEqual(_bytes('''\
+<?xml version="1.0"?>
+<foo>B</foo>
+'''),
+ bytes(memoryview(res)))
+
+
+def test_suite():
+ suite = unittest.TestSuite()
+ suite.addTests([unittest.makeSuite(ETreeXSLTTestCase)])
+ suite.addTests([unittest.makeSuite(ETreeEXSLTTestCase)])
+ suite.addTests([unittest.makeSuite(ETreeXSLTExtFuncTestCase)])
+ suite.addTests([unittest.makeSuite(ETreeXSLTExtElementTestCase)])
+ if is_python3:
+ suite.addTests([unittest.makeSuite(Py3XSLTTestCase)])
+ suite.addTests(
+ [make_doctest('../../../doc/extensions.txt')])
+ suite.addTests(
+ [make_doctest('../../../doc/xpathxslt.txt')])
+ return suite
+
+if __name__ == '__main__':
+ print('to test use test.py %s' % __file__)
diff --git a/src/lxml/usedoctest.py b/src/lxml/usedoctest.py
new file mode 100644
index 0000000..f1da8ca
--- /dev/null
+++ b/src/lxml/usedoctest.py
@@ -0,0 +1,13 @@
+"""Doctest module for XML comparison.
+
+Usage::
+
+ >>> import lxml.usedoctest
+ >>> # now do your XML doctests ...
+
+See `lxml.doctestcompare`
+"""
+
+from lxml import doctestcompare
+
+doctestcompare.temp_install(del_module=__name__)
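A minimal usage sketch (illustrative only; it relies on `lxml.doctestcompare`
installing an XML-aware doctest output checker, so attribute order and
insignificant whitespace no longer cause doctest failures):

    >>> import lxml.usedoctest
    >>> from lxml import etree
    >>> print(etree.tostring(etree.XML('<root a="1" b="2"/>')).decode())
    <root b="2" a="1"/>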
diff --git a/src/lxml/xinclude.pxi b/src/lxml/xinclude.pxi
new file mode 100644
index 0000000..6bac829
--- /dev/null
+++ b/src/lxml/xinclude.pxi
@@ -0,0 +1,67 @@
+# XInclude processing
+
+from lxml.includes cimport xinclude
+
+
+cdef class XIncludeError(LxmlError):
+ u"""Error during XInclude processing.
+ """
+
+
+cdef class XInclude:
+ u"""XInclude(self)
+ XInclude processor.
+
+ Create an instance and call it on an Element to run XInclude
+ processing.
+ """
+ cdef _ErrorLog _error_log
+ def __init__(self):
+ self._error_log = _ErrorLog()
+
+ @property
+ def error_log(self):
+ assert self._error_log is not None, "XInclude instance not initialised"
+ return self._error_log.copy()
+
+ def __call__(self, _Element node not None):
+ u"__call__(self, node)"
+ # We cannot pass the XML_PARSE_NOXINCNODE option as this would free
+ # the XInclude nodes - there may still be Python references to them!
+ # Therefore, we allow XInclude nodes to be converted to
+ # XML_XINCLUDE_START nodes. XML_XINCLUDE_END nodes are added as
+ # siblings. Tree traversal will simply ignore them as they are not
+ # typed as elements. The included fragment is added between the two,
+ # i.e. as a sibling, which does not conflict with traversal.
+ cdef int result
+ _assertValidNode(node)
+ assert self._error_log is not None, "XInclude processor not initialised"
+ if node._doc._parser is not None:
+ parse_options = node._doc._parser._parse_options
+ context = node._doc._parser._getParserContext()
+ c_context = <void*>context
+ else:
+ parse_options = 0
+ context = None
+ c_context = NULL
+
+ self._error_log.connect()
+ if tree.LIBXML_VERSION < 20704 or not c_context:
+ __GLOBAL_PARSER_CONTEXT.pushImpliedContext(context)
+ with nogil:
+ orig_loader = _register_document_loader()
+ if c_context:
+ result = xinclude.xmlXIncludeProcessTreeFlagsData(
+ node._c_node, parse_options, c_context)
+ else:
+ result = xinclude.xmlXIncludeProcessTree(node._c_node)
+ _reset_document_loader(orig_loader)
+ if tree.LIBXML_VERSION < 20704 or not c_context:
+ __GLOBAL_PARSER_CONTEXT.popImpliedContext()
+ self._error_log.disconnect()
+
+ if result == -1:
+ raise XIncludeError(
+ self._error_log._buildExceptionMessage(
+ u"XInclude processing failed"),
+ self._error_log)
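A usage sketch for the processor defined above (the input file name is
hypothetical; problems encountered during processing are also collected in the
instance's error_log):

    from lxml import etree

    tree = etree.parse("document-with-includes.xml")  # hypothetical input
    xinclude = etree.XInclude()
    xinclude(tree.getroot())      # expands XInclude directives in place
    for entry in xinclude.error_log:
        print(entry)              # empty loop if processing succeeded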
diff --git a/src/lxml/xmlerror.pxi b/src/lxml/xmlerror.pxi
new file mode 100644
index 0000000..ccc9e64
--- /dev/null
+++ b/src/lxml/xmlerror.pxi
@@ -0,0 +1,1646 @@
+# DEBUG and error logging
+
+from lxml.includes cimport xmlerror
+from lxml cimport cvarargs
+
+DEF GLOBAL_ERROR_LOG = u"_GlobalErrorLog"
+DEF XSLT_ERROR_LOG = u"_XSLTErrorLog"
+
+# module level API functions
+
+def clear_error_log():
+ u"""clear_error_log()
+
+ Clear the global error log. Note that this log is already bound to a
+ fixed size.
+
+ Note: since lxml 2.2, the global error log is local to a thread
+ and this function will only clear the global error log of the
+ current thread.
+ """
+ _getThreadErrorLog(GLOBAL_ERROR_LOG).clear()
+
+
+# setup for global log:
+
+cdef void _initThreadLogging():
+ # Disable generic error lines from libxml2.
+ _connectGenericErrorLog(None)
+
+ # Divert XSLT error messages to the global XSLT error log instead of stderr.
+ xslt.xsltSetGenericErrorFunc(NULL, <xmlerror.xmlGenericErrorFunc>_receiveXSLTError)
+
+
+# Logging classes
+
+@cython.final
+@cython.freelist(16)
+cdef class _LogEntry:
+ """A log message entry from an error log.
+
+ Attributes:
+
+ - message: the message text
+ - domain: the domain ID (see lxml.etree.ErrorDomains)
+ - type: the message type ID (see lxml.etree.ErrorTypes)
+ - level: the log level ID (see lxml.etree.ErrorLevels)
+ - line: the line at which the message originated (if applicable)
+ - column: the character column at which the message originated (if applicable)
+ - filename: the name of the file in which the message originated (if applicable)
+ - path: the location in which the error was found (if available)
+ """
+ cdef readonly int domain
+ cdef readonly int type
+ cdef readonly int level
+ cdef readonly int line
+ cdef readonly int column
+ cdef basestring _message
+ cdef basestring _filename
+ cdef char* _c_message
+ cdef xmlChar* _c_filename
+ cdef xmlChar* _c_path
+
+ def __dealloc__(self):
+ tree.xmlFree(self._c_message)
+ tree.xmlFree(self._c_filename)
+ tree.xmlFree(self._c_path)
+
+ @cython.final
+ cdef _setError(self, xmlerror.xmlError* error):
+ self.domain = error.domain
+ self.type = error.code
+ self.level = <int>error.level
+ self.line = error.line
+ self.column = error.int2
+ self._c_message = NULL
+ self._c_filename = NULL
+ self._c_path = NULL
+ if (error.message is NULL or
+ error.message[0] == b'\0' or
+ error.message[0] == b'\n' and error.message[1] == b'\0'):
+ self._message = u"unknown error"
+ else:
+ self._message = None
+ self._c_message = <char*> tree.xmlStrdup(
+ <const_xmlChar*> error.message)
+ if not self._c_message:
+ raise MemoryError()
+ if error.file is NULL:
+ self._filename = u'<string>'
+ else:
+ self._filename = None
+ self._c_filename = tree.xmlStrdup(<const_xmlChar*> error.file)
+ if not self._c_filename:
+ raise MemoryError()
+ if error.node is not NULL:
+ self._c_path = tree.xmlGetNodePath(<xmlNode*> error.node)
+
+ @cython.final
+ cdef _setGeneric(self, int domain, int type, int level, int line,
+ message, filename):
+ self.domain = domain
+ self.type = type
+ self.level = level
+ self.line = line
+ self.column = 0
+ self._message = message
+ self._filename = filename
+ self._c_path = NULL
+
+ def __repr__(self):
+ return u"%s:%d:%d:%s:%s:%s: %s" % (
+ self.filename, self.line, self.column, self.level_name,
+ self.domain_name, self.type_name, self.message)
+
+ @property
+ def domain_name(self):
+ """The name of the error domain. See lxml.etree.ErrorDomains
+ """
+ return ErrorDomains._getName(self.domain, u"unknown")
+
+ @property
+ def type_name(self):
+ """The name of the error type. See lxml.etree.ErrorTypes
+ """
+ if self.domain == ErrorDomains.RELAXNGV:
+ getName = RelaxNGErrorTypes._getName
+ else:
+ getName = ErrorTypes._getName
+ return getName(self.type, u"unknown")
+
+ @property
+ def level_name(self):
+ """The name of the error level. See lxml.etree.ErrorLevels
+ """
+ return ErrorLevels._getName(self.level, u"unknown")
+
+ @property
+ def message(self):
+ """The log message string.
+ """
+ cdef size_t size
+ if self._message is not None:
+ return self._message
+ if self._c_message is NULL:
+ return None
+ size = cstring_h.strlen(self._c_message)
+ if size > 0 and self._c_message[size-1] == '\n':
+ size -= 1 # strip EOL
+ # cannot use funicode() here because the message may contain
+ # byte encoded file paths etc.
+ try:
+ self._message = self._c_message[:size].decode('utf8')
+ except UnicodeDecodeError:
+ try:
+ self._message = self._c_message[:size].decode(
+ 'ascii', 'backslashreplace')
+ except UnicodeDecodeError:
+ self._message = u'<undecodable error message>'
+ if self._c_message:
+ # clean up early
+ tree.xmlFree(self._c_message)
+ self._c_message = NULL
+ return self._message
+
+ @property
+ def filename(self):
+ """The file path where the report originated, if any.
+ """
+ if self._filename is None:
+ if self._c_filename is not NULL:
+ self._filename = _decodeFilename(self._c_filename)
+ # clean up early
+ tree.xmlFree(self._c_filename)
+ self._c_filename = NULL
+ return self._filename
+
+ @property
+ def path(self):
+ """The XPath for the node where the error was detected.
+ """
+ return funicode(self._c_path) if self._c_path is not NULL else None
+
+
+cdef class _BaseErrorLog:
+ cdef _LogEntry _first_error
+ cdef readonly object last_error
+ def __init__(self, first_error, last_error):
+ self._first_error = first_error
+ self.last_error = last_error
+
+ cpdef copy(self):
+ return _BaseErrorLog(self._first_error, self.last_error)
+
+ def __repr__(self):
+ return u''
+
+ cpdef receive(self, _LogEntry entry):
+ pass
+
+ @cython.final
+ cdef void _receive(self, xmlerror.xmlError* error):
+ cdef bint is_error
+ cdef _LogEntry entry
+ cdef _BaseErrorLog global_log
+ entry = _LogEntry.__new__(_LogEntry)
+ entry._setError(error)
+ is_error = error.level == xmlerror.XML_ERR_ERROR or \
+ error.level == xmlerror.XML_ERR_FATAL
+ global_log = _getThreadErrorLog(GLOBAL_ERROR_LOG)
+ if global_log is not self:
+ global_log.receive(entry)
+ if is_error:
+ global_log.last_error = entry
+ self.receive(entry)
+ if is_error:
+ self.last_error = entry
+
+ @cython.final
+ cdef void _receiveGeneric(self, int domain, int type, int level, int line,
+ message, filename):
+ cdef bint is_error
+ cdef _LogEntry entry
+ cdef _BaseErrorLog global_log
+ entry = _LogEntry.__new__(_LogEntry)
+ entry._setGeneric(domain, type, level, line, message, filename)
+ is_error = level == xmlerror.XML_ERR_ERROR or \
+ level == xmlerror.XML_ERR_FATAL
+ global_log = _getThreadErrorLog(GLOBAL_ERROR_LOG)
+ if global_log is not self:
+ global_log.receive(entry)
+ if is_error:
+ global_log.last_error = entry
+ self.receive(entry)
+ if is_error:
+ self.last_error = entry
+
+ @cython.final
+ cdef _buildParseException(self, exctype, default_message):
+ code = xmlerror.XML_ERR_INTERNAL_ERROR
+ if self._first_error is None:
+ return exctype(default_message, code, 0, 0)
+ message = self._first_error.message
+ if message:
+ code = self._first_error.type
+ else:
+ message = default_message
+ line = self._first_error.line
+ column = self._first_error.column
+ filename = self._first_error.filename
+ if line > 0:
+ if column > 0:
+ message = f"{message}, line {line}, column {column}"
+ else:
+ message = f"{message}, line {line}"
+ return exctype(message, code, line, column, filename)
+
+ @cython.final
+ cdef _buildExceptionMessage(self, default_message):
+ if self._first_error is None:
+ return default_message
+ if self._first_error.message:
+ message = self._first_error.message
+ elif default_message is None:
+ return None
+ else:
+ message = default_message
+ if self._first_error.line > 0:
+ if self._first_error.column > 0:
+ message = f"{message}, line {self._first_error.line}, column {self._first_error.column}"
+ else:
+ message = f"{message}, line {self._first_error.line}"
+ return message
+
+cdef class _ListErrorLog(_BaseErrorLog):
+    u"Immutable base version of a list-based error log."
+ cdef list _entries
+ cdef int _offset
+ def __init__(self, entries, first_error, last_error):
+ if entries:
+ if first_error is None:
+ first_error = entries[0]
+ if last_error is None:
+ last_error = entries[-1]
+ _BaseErrorLog.__init__(self, first_error, last_error)
+ self._entries = entries
+
+ cpdef copy(self):
+ u"""Creates a shallow copy of this error log. Reuses the list of
+ entries.
+ """
+ cdef _ListErrorLog log = _ListErrorLog(
+ self._entries, self._first_error, self.last_error)
+ log._offset = self._offset
+ return log
+
+ def __iter__(self):
+ entries = self._entries
+ if self._offset:
+ entries = islice(entries, self._offset)
+ return iter(entries)
+
+ def __repr__(self):
+ return u'\n'.join([repr(entry) for entry in self])
+
+ def __getitem__(self, index):
+ if self._offset:
+ index += self._offset
+ return self._entries[index]
+
+ def __len__(self):
+ return len(self._entries) - self._offset
+
+ def __contains__(self, error_type):
+ cdef Py_ssize_t i
+ for i, entry in enumerate(self._entries):
+ if i < self._offset:
+ continue
+ if entry.type == error_type:
+ return True
+ return False
+
+ def __nonzero__(self):
+ return len(self._entries) > self._offset
+
+ def filter_domains(self, domains):
+ u"""Filter the errors by the given domains and return a new error log
+ containing the matches.
+ """
+ cdef _LogEntry entry
+ if isinstance(domains, (int, long)):
+ domains = (domains,)
+ filtered = [entry for entry in self if entry.domain in domains]
+ return _ListErrorLog(filtered, None, None)
+
+ def filter_types(self, types):
+ u"""filter_types(self, types)
+
+ Filter the errors by the given types and return a new error
+ log containing the matches.
+ """
+ cdef _LogEntry entry
+ if isinstance(types, (int, long)):
+ types = (types,)
+ filtered = [entry for entry in self if entry.type in types]
+ return _ListErrorLog(filtered, None, None)
+
+ def filter_levels(self, levels):
+ u"""filter_levels(self, levels)
+
+ Filter the errors by the given error levels and return a new
+ error log containing the matches.
+ """
+ cdef _LogEntry entry
+ if isinstance(levels, (int, long)):
+ levels = (levels,)
+ filtered = [entry for entry in self if entry.level in levels]
+ return _ListErrorLog(filtered, None, None)
+
+ def filter_from_level(self, level):
+ u"""filter_from_level(self, level)
+
+        Return a log with all messages of the requested level or worse.
+ """
+ cdef _LogEntry entry
+ filtered = [entry for entry in self if entry.level >= level]
+ return _ListErrorLog(filtered, None, None)
+
+ def filter_from_fatals(self):
+ u"""filter_from_fatals(self)
+
+ Convenience method to get all fatal error messages.
+ """
+ return self.filter_from_level(ErrorLevels.FATAL)
+
+ def filter_from_errors(self):
+ u"""filter_from_errors(self)
+
+ Convenience method to get all error messages or worse.
+ """
+ return self.filter_from_level(ErrorLevels.ERROR)
+
+ def filter_from_warnings(self):
+ u"""filter_from_warnings(self)
+
+ Convenience method to get all warnings or worse.
+ """
+ return self.filter_from_level(ErrorLevels.WARNING)
+
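A brief sketch of how the filter methods above are typically used, here on the
error log collected by a recovering parser ("broken.xml" stands in for a
hypothetical, deliberately malformed input file):

    from lxml import etree

    parser = etree.XMLParser(recover=True)
    tree = etree.parse("broken.xml", parser)
    for entry in parser.error_log.filter_from_errors():
        print(entry.line, entry.level_name, entry.message)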
+
+@cython.final
+@cython.internal
+cdef class _ErrorLogContext:
+ """
+ Error log context for the 'with' statement.
+    Stores references to the current callbacks to allow for
+    recursively stacked log contexts.
+ """
+ cdef xmlerror.xmlStructuredErrorFunc old_error_func
+ cdef void* old_error_context
+ cdef xmlerror.xmlGenericErrorFunc old_xslt_error_func
+ cdef void* old_xslt_error_context
+ cdef _BaseErrorLog old_xslt_error_log
+
+ cdef int push_error_log(self, _BaseErrorLog log) except -1:
+ self.old_error_func = xmlerror.xmlStructuredError
+ self.old_error_context = xmlerror.xmlStructuredErrorContext
+ xmlerror.xmlSetStructuredErrorFunc(
+ <void*>log, <xmlerror.xmlStructuredErrorFunc>_receiveError)
+
+ # xslt.xsltSetGenericErrorFunc() is not thread-local => keep error log in TLS
+ self.old_xslt_error_func = xslt.xsltGenericError
+ self.old_xslt_error_context = xslt.xsltGenericErrorContext
+ self.old_xslt_error_log = _getThreadErrorLog(XSLT_ERROR_LOG)
+ _setThreadErrorLog(XSLT_ERROR_LOG, log)
+ xslt.xsltSetGenericErrorFunc(
+ NULL, <xmlerror.xmlGenericErrorFunc>_receiveXSLTError)
+ return 0
+
+ cdef int pop_error_log(self) except -1:
+ xmlerror.xmlSetStructuredErrorFunc(
+ self.old_error_context, self.old_error_func)
+ xslt.xsltSetGenericErrorFunc(
+ self.old_xslt_error_context, self.old_xslt_error_func)
+ _setThreadErrorLog(XSLT_ERROR_LOG, self.old_xslt_error_log)
+ self.old_xslt_error_log= None
+ return 0
+
+
+cdef class _ErrorLog(_ListErrorLog):
+ cdef list _logContexts
+ def __cinit__(self):
+ self._logContexts = []
+
+ def __init__(self):
+ _ListErrorLog.__init__(self, [], None, None)
+
+ @cython.final
+ cdef int __enter__(self) except -1:
+ return self.connect()
+
+ def __exit__(self, *args):
+ # TODO: make this a cdef function when Cython supports it
+ self.disconnect()
+
+ @cython.final
+ cdef int connect(self) except -1:
+ self._first_error = None
+ del self._entries[:]
+
+ cdef _ErrorLogContext context = _ErrorLogContext.__new__(_ErrorLogContext)
+ context.push_error_log(self)
+ self._logContexts.append(context)
+ return 0
+
+ @cython.final
+ cdef int disconnect(self) except -1:
+ cdef _ErrorLogContext context = self._logContexts.pop()
+ context.pop_error_log()
+ return 0
+
+ cpdef clear(self):
+ self._first_error = None
+ self.last_error = None
+ self._offset = 0
+ del self._entries[:]
+
+ cpdef copy(self):
+ u"""Creates a shallow copy of this error log and the list of entries.
+ """
+ return _ListErrorLog(
+ self._entries[self._offset:],
+ self._first_error, self.last_error)
+
+ def __iter__(self):
+ return iter(self._entries[self._offset:])
+
+ cpdef receive(self, _LogEntry entry):
+ if self._first_error is None and entry.level >= xmlerror.XML_ERR_ERROR:
+ self._first_error = entry
+ self._entries.append(entry)
+
+cdef class _DomainErrorLog(_ErrorLog):
+ def __init__(self, domains):
+ _ErrorLog.__init__(self)
+ self._accepted_domains = tuple(domains)
+
+ cpdef receive(self, _LogEntry entry):
+ if entry.domain in self._accepted_domains:
+ _ErrorLog.receive(self, entry)
+
+cdef class _RotatingErrorLog(_ErrorLog):
+ cdef int _max_len
+ def __init__(self, max_len):
+ _ErrorLog.__init__(self)
+ self._max_len = max_len
+
+ cpdef receive(self, _LogEntry entry):
+ if self._first_error is None and entry.level >= xmlerror.XML_ERR_ERROR:
+ self._first_error = entry
+ self._entries.append(entry)
+
+ if len(self._entries) > self._max_len:
+ self._offset += 1
+ if self._offset > self._max_len // 3:
+ offset = self._offset
+ self._offset = 0
+ del self._entries[:offset]
+
+cdef class PyErrorLog(_BaseErrorLog):
+ u"""PyErrorLog(self, logger_name=None, logger=None)
+ A global error log that connects to the Python stdlib logging package.
+
+ The constructor accepts an optional logger name or a readily
+ instantiated logger instance.
+
+ If you want to change the mapping between libxml2's ErrorLevels and Python
+ logging levels, you can modify the level_map dictionary from a subclass.
+
+ The default mapping is::
+
+ ErrorLevels.WARNING = logging.WARNING
+ ErrorLevels.ERROR = logging.ERROR
+ ErrorLevels.FATAL = logging.CRITICAL
+
+ You can also override the method ``receive()`` that takes a LogEntry
+ object and calls ``self.log(log_entry, format_string, arg1, arg2, ...)``
+ with appropriate data.
+ """
+ cdef readonly dict level_map
+ cdef object _map_level
+ cdef object _log
+ def __init__(self, logger_name=None, logger=None):
+ _BaseErrorLog.__init__(self, None, None)
+ import logging
+ self.level_map = {
+ ErrorLevels.WARNING : logging.WARNING,
+ ErrorLevels.ERROR : logging.ERROR,
+ ErrorLevels.FATAL : logging.CRITICAL
+ }
+ self._map_level = self.level_map.get
+ if logger is None:
+ if logger_name:
+ logger = logging.getLogger(logger_name)
+ else:
+ logger = logging.getLogger()
+ self._log = logger.log
+
+ cpdef copy(self):
+ u"""Dummy method that returns an empty error log.
+ """
+ return _ListErrorLog([], None, None)
+
+ def log(self, log_entry, message, *args):
+ u"""log(self, log_entry, message, *args)
+
+ Called by the .receive() method to log a _LogEntry instance to
+ the Python logging system. This handles the error level
+ mapping.
+
+ In the default implementation, the ``message`` argument
+ receives a complete log line, and there are no further
+ ``args``. To change the message format, it is best to
+ override the .receive() method instead of this one.
+ """
+ self._log(
+ self._map_level(log_entry.level, 0),
+ message, *args
+ )
+
+ cpdef receive(self, _LogEntry log_entry):
+ u"""receive(self, log_entry)
+
+ Receive a _LogEntry instance from the logging system. Calls
+ the .log() method with appropriate parameters::
+
+ self.log(log_entry, repr(log_entry))
+
+ You can override this method to provide your own log output
+ format.
+ """
+ self.log(log_entry, repr(log_entry))
+
+# thread-local, global list log to collect error output messages from
+# libxml2/libxslt
+
+cdef _BaseErrorLog __GLOBAL_ERROR_LOG = _RotatingErrorLog(__MAX_LOG_SIZE)
+
+
+cdef _BaseErrorLog _getThreadErrorLog(name):
+    u"""Retrieve the error log with name 'name' for the current thread."""
+ cdef python.PyObject* thread_dict
+ thread_dict = python.PyThreadState_GetDict()
+ if thread_dict is NULL:
+ return __GLOBAL_ERROR_LOG
+ try:
+ return (<object>thread_dict)[name]
+ except KeyError:
+ log = (<object>thread_dict)[name] = \
+ _RotatingErrorLog(__MAX_LOG_SIZE)
+ return log
+
+
+cdef _setThreadErrorLog(name, _BaseErrorLog log):
+ u"""Set the global error log of this thread."""
+ cdef python.PyObject* thread_dict
+ thread_dict = python.PyThreadState_GetDict()
+ if thread_dict is NULL:
+ if name == GLOBAL_ERROR_LOG:
+ global __GLOBAL_ERROR_LOG
+ __GLOBAL_ERROR_LOG = log
+ else:
+ (<object>thread_dict)[name] = log
+
+
+cdef __copyGlobalErrorLog():
+ u"Helper function for properties in exceptions."
+ return _getThreadErrorLog(GLOBAL_ERROR_LOG).copy()
+
+
+def use_global_python_log(PyErrorLog log not None):
+ u"""use_global_python_log(log)
+
+ Replace the global error log by an etree.PyErrorLog that uses the
+ standard Python logging package.
+
+ Note that this disables access to the global error log from exceptions.
+ Parsers, XSLT etc. will continue to provide their normal local error log.
+
+ Note: prior to lxml 2.2, this changed the error log globally.
+ Since lxml 2.2, the global error log is local to a thread and this
+ function will only set the global error log of the current thread.
+ """
+ _setThreadErrorLog(GLOBAL_ERROR_LOG, log)
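A sketch of routing libxml2/libxslt messages into the stdlib logging module via
the PyErrorLog defined above (the logger name "lxml" is an arbitrary choice):

    import logging
    from lxml import etree

    logging.basicConfig(level=logging.WARNING)
    etree.use_global_python_log(etree.PyErrorLog(logger_name="lxml"))

    try:
        etree.XML("<broken>")      # deliberately malformed
    except etree.XMLSyntaxError:
        pass                       # the parse error was also sent to the "lxml" logger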
+
+
+# local log functions: forward error to logger object
+cdef void _forwardError(void* c_log_handler, xmlerror.xmlError* error) with gil:
+ cdef _BaseErrorLog log_handler
+ if c_log_handler is not NULL:
+ log_handler = <_BaseErrorLog>c_log_handler
+ elif error.domain == xmlerror.XML_FROM_XSLT:
+ log_handler = _getThreadErrorLog(XSLT_ERROR_LOG)
+ else:
+ log_handler = _getThreadErrorLog(GLOBAL_ERROR_LOG)
+ log_handler._receive(error)
+
+
+cdef void _receiveError(void* c_log_handler, xmlerror.xmlError* error) nogil:
+ # no Python objects here, may be called without thread context !
+ if __DEBUG:
+ _forwardError(c_log_handler, error)
+
+
+cdef void _receiveXSLTError(void* c_log_handler, char* msg, ...) nogil:
+ # no Python objects here, may be called without thread context !
+ cdef cvarargs.va_list args
+ cvarargs.va_start(args, msg)
+ _receiveGenericError(c_log_handler, xmlerror.XML_FROM_XSLT, msg, args)
+ cvarargs.va_end(args)
+
+cdef void _receiveRelaxNGParseError(void* c_log_handler, char* msg, ...) nogil:
+ # no Python objects here, may be called without thread context !
+ cdef cvarargs.va_list args
+ cvarargs.va_start(args, msg)
+ _receiveGenericError(c_log_handler, xmlerror.XML_FROM_RELAXNGP, msg, args)
+ cvarargs.va_end(args)
+
+cdef void _receiveRelaxNGValidationError(void* c_log_handler, char* msg, ...) nogil:
+ # no Python objects here, may be called without thread context !
+ cdef cvarargs.va_list args
+ cvarargs.va_start(args, msg)
+ _receiveGenericError(c_log_handler, xmlerror.XML_FROM_RELAXNGV, msg, args)
+ cvarargs.va_end(args)
+
+# dummy function: no log output at all
+cdef void _nullGenericErrorFunc(void* ctxt, char* msg, ...) nogil:
+ pass
+
+
+cdef void _connectGenericErrorLog(log, int c_domain=-1):
+ cdef xmlerror.xmlGenericErrorFunc error_func = NULL
+ c_log = <void*>log
+ if c_domain == xmlerror.XML_FROM_XSLT:
+ error_func = <xmlerror.xmlGenericErrorFunc>_receiveXSLTError
+ elif c_domain == xmlerror.XML_FROM_RELAXNGP:
+ error_func = <xmlerror.xmlGenericErrorFunc>_receiveRelaxNGParseError
+ elif c_domain == xmlerror.XML_FROM_RELAXNGV:
+ error_func = <xmlerror.xmlGenericErrorFunc>_receiveRelaxNGValidationError
+
+ if log is None or error_func is NULL:
+ c_log = NULL
+ error_func = <xmlerror.xmlGenericErrorFunc>_nullGenericErrorFunc
+ xmlerror.xmlSetGenericErrorFunc(c_log, error_func)
+
+
+cdef void _receiveGenericError(void* c_log_handler, int c_domain,
+ char* msg, cvarargs.va_list args) nogil:
+ # no Python objects here, may be called without thread context !
+ cdef xmlerror.xmlError c_error
+ cdef char* c_text
+ cdef char* c_message
+ cdef char* c_element
+ cdef char* c_pos
+ cdef char* c_name_pos
+ cdef char* c_str
+ cdef int text_size, element_size, format_count, c_int
+ if not __DEBUG or msg is NULL:
+ return
+ if msg[0] in b'\n\0':
+ return
+
+ c_text = c_element = c_error.file = c_error.node = NULL
+ c_error.line = 0
+
+ # parse "NAME %s" chunks from the format string
+ c_name_pos = c_pos = msg
+ format_count = 0
+ while c_pos[0]:
+ if c_pos[0] == '%':
+ c_pos += 1
+ if c_pos[0] == 's': # "%s"
+ format_count += 1
+ c_str = cvarargs.va_charptr(args)
+ if c_pos == msg + 1:
+ c_text = c_str # msg == "%s..."
+ elif c_name_pos[0] == 'e':
+ if cstring_h.strncmp(c_name_pos, 'element %s', 10) == 0:
+ c_element = c_str
+ elif c_name_pos[0] == 'f':
+ if cstring_h.strncmp(c_name_pos, 'file %s', 7) == 0:
+ if cstring_h.strncmp('string://__STRING__XSLT',
+ c_str, 23) == 0:
+ c_str = '<xslt>'
+ c_error.file = c_str
+ elif c_pos[0] == 'd': # "%d"
+ format_count += 1
+ c_int = cvarargs.va_int(args)
+ if cstring_h.strncmp(c_name_pos, 'line %d', 7) == 0:
+ c_error.line = c_int
+ elif c_pos[0] != '%': # "%%" == "%"
+ format_count += 1
+ break # unexpected format or end of string => abort
+ elif c_pos[0] == ' ':
+ if c_pos[1] != '%':
+ c_name_pos = c_pos + 1
+ c_pos += 1
+
+ c_message = NULL
+ if c_text is NULL:
+ if c_element is not NULL and format_count == 1:
+ # special case: a single occurrence of 'element %s'
+ text_size = cstring_h.strlen(msg)
+ element_size = cstring_h.strlen(c_element)
+ c_message = <char*>stdlib.malloc(
+ (text_size + element_size + 1) * sizeof(char))
+ stdio.sprintf(c_message, msg, c_element)
+ c_error.message = c_message
+ else:
+ c_error.message = ''
+ elif c_element is NULL:
+ c_error.message = c_text
+ else:
+ text_size = cstring_h.strlen(c_text)
+ element_size = cstring_h.strlen(c_element)
+ c_message = <char*>stdlib.malloc(
+ (text_size + 12 + element_size + 1) * sizeof(char))
+ stdio.sprintf(c_message, "%s, element '%s'", c_text, c_element)
+ c_error.message = c_message
+
+ c_error.domain = c_domain
+ c_error.code = xmlerror.XML_ERR_OK # what else?
+ c_error.level = xmlerror.XML_ERR_ERROR # what else?
+ c_error.int2 = 0
+
+ _forwardError(c_log_handler, &c_error)
+
+ if c_message is not NULL:
+ stdlib.free(c_message)
+
+################################################################################
+## CONSTANTS FROM "xmlerror.h" (or rather libxml-xmlerror.html)
+################################################################################
+
+cdef __initErrorConstants():
+ "Called at setup time to parse the constants and build the classes below."
+ global __ERROR_LEVELS, __ERROR_DOMAINS, __PARSER_ERROR_TYPES, __RELAXNG_ERROR_TYPES
+ const_defs = ((ErrorLevels, __ERROR_LEVELS),
+ (ErrorDomains, __ERROR_DOMAINS),
+ (ErrorTypes, __PARSER_ERROR_TYPES),
+ (RelaxNGErrorTypes, __RELAXNG_ERROR_TYPES))
+
+ for cls, constants in const_defs:
+ reverse_dict = {}
+ cls._names = reverse_dict
+ cls._getName = reverse_dict.get
+ for line in constants.splitlines():
+ if not line:
+ continue
+ name, value = line.split('=')
+ value = int(value)
+ setattr(cls, name, value)
+ reverse_dict[value] = name
+
+ # discard the global tuple references after use
+ __ERROR_LEVELS = __ERROR_DOMAINS = __PARSER_ERROR_TYPES = __RELAXNG_ERROR_TYPES = None
+
+
+class ErrorLevels(object):
+ u"Libxml2 error levels"
+
+class ErrorDomains(object):
+ u"Libxml2 error domains"
+
+class ErrorTypes(object):
+ u"Libxml2 error types"
+
+class RelaxNGErrorTypes(object):
+ u"Libxml2 RelaxNG error types"
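The classes above are filled in by __initErrorConstants() from the generated
blocks that follow, so log entries can be compared against symbolic names. A
small sketch (the exact error type reported for this input may vary between
libxml2 versions):

    from lxml import etree

    try:
        etree.XML("<a><b></a>")
    except etree.XMLSyntaxError as exc:
        entry = exc.error_log.last_error
        if entry.type == etree.ErrorTypes.ERR_TAG_NAME_MISMATCH:
            print("mismatched tag on line", entry.line)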
+
+# --- BEGIN: GENERATED CONSTANTS ---
+
+# This section is generated by the script 'update-error-constants.py'.
+
+cdef object __ERROR_LEVELS = """\
+NONE=0
+WARNING=1
+ERROR=2
+FATAL=3
+"""
+
+cdef object __ERROR_DOMAINS = """\
+NONE=0
+PARSER=1
+TREE=2
+NAMESPACE=3
+DTD=4
+HTML=5
+MEMORY=6
+OUTPUT=7
+IO=8
+FTP=9
+HTTP=10
+XINCLUDE=11
+XPATH=12
+XPOINTER=13
+REGEXP=14
+DATATYPE=15
+SCHEMASP=16
+SCHEMASV=17
+RELAXNGP=18
+RELAXNGV=19
+CATALOG=20
+C14N=21
+XSLT=22
+VALID=23
+CHECK=24
+WRITER=25
+MODULE=26
+I18N=27
+SCHEMATRONV=28
+BUFFER=29
+URI=30
+"""
+
+cdef object __PARSER_ERROR_TYPES = """\
+ERR_OK=0
+ERR_INTERNAL_ERROR=1
+ERR_NO_MEMORY=2
+ERR_DOCUMENT_START=3
+ERR_DOCUMENT_EMPTY=4
+ERR_DOCUMENT_END=5
+ERR_INVALID_HEX_CHARREF=6
+ERR_INVALID_DEC_CHARREF=7
+ERR_INVALID_CHARREF=8
+ERR_INVALID_CHAR=9
+ERR_CHARREF_AT_EOF=10
+ERR_CHARREF_IN_PROLOG=11
+ERR_CHARREF_IN_EPILOG=12
+ERR_CHARREF_IN_DTD=13
+ERR_ENTITYREF_AT_EOF=14
+ERR_ENTITYREF_IN_PROLOG=15
+ERR_ENTITYREF_IN_EPILOG=16
+ERR_ENTITYREF_IN_DTD=17
+ERR_PEREF_AT_EOF=18
+ERR_PEREF_IN_PROLOG=19
+ERR_PEREF_IN_EPILOG=20
+ERR_PEREF_IN_INT_SUBSET=21
+ERR_ENTITYREF_NO_NAME=22
+ERR_ENTITYREF_SEMICOL_MISSING=23
+ERR_PEREF_NO_NAME=24
+ERR_PEREF_SEMICOL_MISSING=25
+ERR_UNDECLARED_ENTITY=26
+WAR_UNDECLARED_ENTITY=27
+ERR_UNPARSED_ENTITY=28
+ERR_ENTITY_IS_EXTERNAL=29
+ERR_ENTITY_IS_PARAMETER=30
+ERR_UNKNOWN_ENCODING=31
+ERR_UNSUPPORTED_ENCODING=32
+ERR_STRING_NOT_STARTED=33
+ERR_STRING_NOT_CLOSED=34
+ERR_NS_DECL_ERROR=35
+ERR_ENTITY_NOT_STARTED=36
+ERR_ENTITY_NOT_FINISHED=37
+ERR_LT_IN_ATTRIBUTE=38
+ERR_ATTRIBUTE_NOT_STARTED=39
+ERR_ATTRIBUTE_NOT_FINISHED=40
+ERR_ATTRIBUTE_WITHOUT_VALUE=41
+ERR_ATTRIBUTE_REDEFINED=42
+ERR_LITERAL_NOT_STARTED=43
+ERR_LITERAL_NOT_FINISHED=44
+ERR_COMMENT_NOT_FINISHED=45
+ERR_PI_NOT_STARTED=46
+ERR_PI_NOT_FINISHED=47
+ERR_NOTATION_NOT_STARTED=48
+ERR_NOTATION_NOT_FINISHED=49
+ERR_ATTLIST_NOT_STARTED=50
+ERR_ATTLIST_NOT_FINISHED=51
+ERR_MIXED_NOT_STARTED=52
+ERR_MIXED_NOT_FINISHED=53
+ERR_ELEMCONTENT_NOT_STARTED=54
+ERR_ELEMCONTENT_NOT_FINISHED=55
+ERR_XMLDECL_NOT_STARTED=56
+ERR_XMLDECL_NOT_FINISHED=57
+ERR_CONDSEC_NOT_STARTED=58
+ERR_CONDSEC_NOT_FINISHED=59
+ERR_EXT_SUBSET_NOT_FINISHED=60
+ERR_DOCTYPE_NOT_FINISHED=61
+ERR_MISPLACED_CDATA_END=62
+ERR_CDATA_NOT_FINISHED=63
+ERR_RESERVED_XML_NAME=64
+ERR_SPACE_REQUIRED=65
+ERR_SEPARATOR_REQUIRED=66
+ERR_NMTOKEN_REQUIRED=67
+ERR_NAME_REQUIRED=68
+ERR_PCDATA_REQUIRED=69
+ERR_URI_REQUIRED=70
+ERR_PUBID_REQUIRED=71
+ERR_LT_REQUIRED=72
+ERR_GT_REQUIRED=73
+ERR_LTSLASH_REQUIRED=74
+ERR_EQUAL_REQUIRED=75
+ERR_TAG_NAME_MISMATCH=76
+ERR_TAG_NOT_FINISHED=77
+ERR_STANDALONE_VALUE=78
+ERR_ENCODING_NAME=79
+ERR_HYPHEN_IN_COMMENT=80
+ERR_INVALID_ENCODING=81
+ERR_EXT_ENTITY_STANDALONE=82
+ERR_CONDSEC_INVALID=83
+ERR_VALUE_REQUIRED=84
+ERR_NOT_WELL_BALANCED=85
+ERR_EXTRA_CONTENT=86
+ERR_ENTITY_CHAR_ERROR=87
+ERR_ENTITY_PE_INTERNAL=88
+ERR_ENTITY_LOOP=89
+ERR_ENTITY_BOUNDARY=90
+ERR_INVALID_URI=91
+ERR_URI_FRAGMENT=92
+WAR_CATALOG_PI=93
+ERR_NO_DTD=94
+ERR_CONDSEC_INVALID_KEYWORD=95
+ERR_VERSION_MISSING=96
+WAR_UNKNOWN_VERSION=97
+WAR_LANG_VALUE=98
+WAR_NS_URI=99
+WAR_NS_URI_RELATIVE=100
+ERR_MISSING_ENCODING=101
+WAR_SPACE_VALUE=102
+ERR_NOT_STANDALONE=103
+ERR_ENTITY_PROCESSING=104
+ERR_NOTATION_PROCESSING=105
+WAR_NS_COLUMN=106
+WAR_ENTITY_REDEFINED=107
+ERR_UNKNOWN_VERSION=108
+ERR_VERSION_MISMATCH=109
+ERR_NAME_TOO_LONG=110
+ERR_USER_STOP=111
+NS_ERR_XML_NAMESPACE=200
+NS_ERR_UNDEFINED_NAMESPACE=201
+NS_ERR_QNAME=202
+NS_ERR_ATTRIBUTE_REDEFINED=203
+NS_ERR_EMPTY=204
+NS_ERR_COLON=205
+DTD_ATTRIBUTE_DEFAULT=500
+DTD_ATTRIBUTE_REDEFINED=501
+DTD_ATTRIBUTE_VALUE=502
+DTD_CONTENT_ERROR=503
+DTD_CONTENT_MODEL=504
+DTD_CONTENT_NOT_DETERMINIST=505
+DTD_DIFFERENT_PREFIX=506
+DTD_ELEM_DEFAULT_NAMESPACE=507
+DTD_ELEM_NAMESPACE=508
+DTD_ELEM_REDEFINED=509
+DTD_EMPTY_NOTATION=510
+DTD_ENTITY_TYPE=511
+DTD_ID_FIXED=512
+DTD_ID_REDEFINED=513
+DTD_ID_SUBSET=514
+DTD_INVALID_CHILD=515
+DTD_INVALID_DEFAULT=516
+DTD_LOAD_ERROR=517
+DTD_MISSING_ATTRIBUTE=518
+DTD_MIXED_CORRUPT=519
+DTD_MULTIPLE_ID=520
+DTD_NO_DOC=521
+DTD_NO_DTD=522
+DTD_NO_ELEM_NAME=523
+DTD_NO_PREFIX=524
+DTD_NO_ROOT=525
+DTD_NOTATION_REDEFINED=526
+DTD_NOTATION_VALUE=527
+DTD_NOT_EMPTY=528
+DTD_NOT_PCDATA=529
+DTD_NOT_STANDALONE=530
+DTD_ROOT_NAME=531
+DTD_STANDALONE_WHITE_SPACE=532
+DTD_UNKNOWN_ATTRIBUTE=533
+DTD_UNKNOWN_ELEM=534
+DTD_UNKNOWN_ENTITY=535
+DTD_UNKNOWN_ID=536
+DTD_UNKNOWN_NOTATION=537
+DTD_STANDALONE_DEFAULTED=538
+DTD_XMLID_VALUE=539
+DTD_XMLID_TYPE=540
+DTD_DUP_TOKEN=541
+HTML_STRUCURE_ERROR=800
+HTML_UNKNOWN_TAG=801
+RNGP_ANYNAME_ATTR_ANCESTOR=1000
+RNGP_ATTR_CONFLICT=1001
+RNGP_ATTRIBUTE_CHILDREN=1002
+RNGP_ATTRIBUTE_CONTENT=1003
+RNGP_ATTRIBUTE_EMPTY=1004
+RNGP_ATTRIBUTE_NOOP=1005
+RNGP_CHOICE_CONTENT=1006
+RNGP_CHOICE_EMPTY=1007
+RNGP_CREATE_FAILURE=1008
+RNGP_DATA_CONTENT=1009
+RNGP_DEF_CHOICE_AND_INTERLEAVE=1010
+RNGP_DEFINE_CREATE_FAILED=1011
+RNGP_DEFINE_EMPTY=1012
+RNGP_DEFINE_MISSING=1013
+RNGP_DEFINE_NAME_MISSING=1014
+RNGP_ELEM_CONTENT_EMPTY=1015
+RNGP_ELEM_CONTENT_ERROR=1016
+RNGP_ELEMENT_EMPTY=1017
+RNGP_ELEMENT_CONTENT=1018
+RNGP_ELEMENT_NAME=1019
+RNGP_ELEMENT_NO_CONTENT=1020
+RNGP_ELEM_TEXT_CONFLICT=1021
+RNGP_EMPTY=1022
+RNGP_EMPTY_CONSTRUCT=1023
+RNGP_EMPTY_CONTENT=1024
+RNGP_EMPTY_NOT_EMPTY=1025
+RNGP_ERROR_TYPE_LIB=1026
+RNGP_EXCEPT_EMPTY=1027
+RNGP_EXCEPT_MISSING=1028
+RNGP_EXCEPT_MULTIPLE=1029
+RNGP_EXCEPT_NO_CONTENT=1030
+RNGP_EXTERNALREF_EMTPY=1031
+RNGP_EXTERNAL_REF_FAILURE=1032
+RNGP_EXTERNALREF_RECURSE=1033
+RNGP_FORBIDDEN_ATTRIBUTE=1034
+RNGP_FOREIGN_ELEMENT=1035
+RNGP_GRAMMAR_CONTENT=1036
+RNGP_GRAMMAR_EMPTY=1037
+RNGP_GRAMMAR_MISSING=1038
+RNGP_GRAMMAR_NO_START=1039
+RNGP_GROUP_ATTR_CONFLICT=1040
+RNGP_HREF_ERROR=1041
+RNGP_INCLUDE_EMPTY=1042
+RNGP_INCLUDE_FAILURE=1043
+RNGP_INCLUDE_RECURSE=1044
+RNGP_INTERLEAVE_ADD=1045
+RNGP_INTERLEAVE_CREATE_FAILED=1046
+RNGP_INTERLEAVE_EMPTY=1047
+RNGP_INTERLEAVE_NO_CONTENT=1048
+RNGP_INVALID_DEFINE_NAME=1049
+RNGP_INVALID_URI=1050
+RNGP_INVALID_VALUE=1051
+RNGP_MISSING_HREF=1052
+RNGP_NAME_MISSING=1053
+RNGP_NEED_COMBINE=1054
+RNGP_NOTALLOWED_NOT_EMPTY=1055
+RNGP_NSNAME_ATTR_ANCESTOR=1056
+RNGP_NSNAME_NO_NS=1057
+RNGP_PARAM_FORBIDDEN=1058
+RNGP_PARAM_NAME_MISSING=1059
+RNGP_PARENTREF_CREATE_FAILED=1060
+RNGP_PARENTREF_NAME_INVALID=1061
+RNGP_PARENTREF_NO_NAME=1062
+RNGP_PARENTREF_NO_PARENT=1063
+RNGP_PARENTREF_NOT_EMPTY=1064
+RNGP_PARSE_ERROR=1065
+RNGP_PAT_ANYNAME_EXCEPT_ANYNAME=1066
+RNGP_PAT_ATTR_ATTR=1067
+RNGP_PAT_ATTR_ELEM=1068
+RNGP_PAT_DATA_EXCEPT_ATTR=1069
+RNGP_PAT_DATA_EXCEPT_ELEM=1070
+RNGP_PAT_DATA_EXCEPT_EMPTY=1071
+RNGP_PAT_DATA_EXCEPT_GROUP=1072
+RNGP_PAT_DATA_EXCEPT_INTERLEAVE=1073
+RNGP_PAT_DATA_EXCEPT_LIST=1074
+RNGP_PAT_DATA_EXCEPT_ONEMORE=1075
+RNGP_PAT_DATA_EXCEPT_REF=1076
+RNGP_PAT_DATA_EXCEPT_TEXT=1077
+RNGP_PAT_LIST_ATTR=1078
+RNGP_PAT_LIST_ELEM=1079
+RNGP_PAT_LIST_INTERLEAVE=1080
+RNGP_PAT_LIST_LIST=1081
+RNGP_PAT_LIST_REF=1082
+RNGP_PAT_LIST_TEXT=1083
+RNGP_PAT_NSNAME_EXCEPT_ANYNAME=1084
+RNGP_PAT_NSNAME_EXCEPT_NSNAME=1085
+RNGP_PAT_ONEMORE_GROUP_ATTR=1086
+RNGP_PAT_ONEMORE_INTERLEAVE_ATTR=1087
+RNGP_PAT_START_ATTR=1088
+RNGP_PAT_START_DATA=1089
+RNGP_PAT_START_EMPTY=1090
+RNGP_PAT_START_GROUP=1091
+RNGP_PAT_START_INTERLEAVE=1092
+RNGP_PAT_START_LIST=1093
+RNGP_PAT_START_ONEMORE=1094
+RNGP_PAT_START_TEXT=1095
+RNGP_PAT_START_VALUE=1096
+RNGP_PREFIX_UNDEFINED=1097
+RNGP_REF_CREATE_FAILED=1098
+RNGP_REF_CYCLE=1099
+RNGP_REF_NAME_INVALID=1100
+RNGP_REF_NO_DEF=1101
+RNGP_REF_NO_NAME=1102
+RNGP_REF_NOT_EMPTY=1103
+RNGP_START_CHOICE_AND_INTERLEAVE=1104
+RNGP_START_CONTENT=1105
+RNGP_START_EMPTY=1106
+RNGP_START_MISSING=1107
+RNGP_TEXT_EXPECTED=1108
+RNGP_TEXT_HAS_CHILD=1109
+RNGP_TYPE_MISSING=1110
+RNGP_TYPE_NOT_FOUND=1111
+RNGP_TYPE_VALUE=1112
+RNGP_UNKNOWN_ATTRIBUTE=1113
+RNGP_UNKNOWN_COMBINE=1114
+RNGP_UNKNOWN_CONSTRUCT=1115
+RNGP_UNKNOWN_TYPE_LIB=1116
+RNGP_URI_FRAGMENT=1117
+RNGP_URI_NOT_ABSOLUTE=1118
+RNGP_VALUE_EMPTY=1119
+RNGP_VALUE_NO_CONTENT=1120
+RNGP_XMLNS_NAME=1121
+RNGP_XML_NS=1122
+XPATH_EXPRESSION_OK=1200
+XPATH_NUMBER_ERROR=1201
+XPATH_UNFINISHED_LITERAL_ERROR=1202
+XPATH_START_LITERAL_ERROR=1203
+XPATH_VARIABLE_REF_ERROR=1204
+XPATH_UNDEF_VARIABLE_ERROR=1205
+XPATH_INVALID_PREDICATE_ERROR=1206
+XPATH_EXPR_ERROR=1207
+XPATH_UNCLOSED_ERROR=1208
+XPATH_UNKNOWN_FUNC_ERROR=1209
+XPATH_INVALID_OPERAND=1210
+XPATH_INVALID_TYPE=1211
+XPATH_INVALID_ARITY=1212
+XPATH_INVALID_CTXT_SIZE=1213
+XPATH_INVALID_CTXT_POSITION=1214
+XPATH_MEMORY_ERROR=1215
+XPTR_SYNTAX_ERROR=1216
+XPTR_RESOURCE_ERROR=1217
+XPTR_SUB_RESOURCE_ERROR=1218
+XPATH_UNDEF_PREFIX_ERROR=1219
+XPATH_ENCODING_ERROR=1220
+XPATH_INVALID_CHAR_ERROR=1221
+TREE_INVALID_HEX=1300
+TREE_INVALID_DEC=1301
+TREE_UNTERMINATED_ENTITY=1302
+TREE_NOT_UTF8=1303
+SAVE_NOT_UTF8=1400
+SAVE_CHAR_INVALID=1401
+SAVE_NO_DOCTYPE=1402
+SAVE_UNKNOWN_ENCODING=1403
+REGEXP_COMPILE_ERROR=1450
+IO_UNKNOWN=1500
+IO_EACCES=1501
+IO_EAGAIN=1502
+IO_EBADF=1503
+IO_EBADMSG=1504
+IO_EBUSY=1505
+IO_ECANCELED=1506
+IO_ECHILD=1507
+IO_EDEADLK=1508
+IO_EDOM=1509
+IO_EEXIST=1510
+IO_EFAULT=1511
+IO_EFBIG=1512
+IO_EINPROGRESS=1513
+IO_EINTR=1514
+IO_EINVAL=1515
+IO_EIO=1516
+IO_EISDIR=1517
+IO_EMFILE=1518
+IO_EMLINK=1519
+IO_EMSGSIZE=1520
+IO_ENAMETOOLONG=1521
+IO_ENFILE=1522
+IO_ENODEV=1523
+IO_ENOENT=1524
+IO_ENOEXEC=1525
+IO_ENOLCK=1526
+IO_ENOMEM=1527
+IO_ENOSPC=1528
+IO_ENOSYS=1529
+IO_ENOTDIR=1530
+IO_ENOTEMPTY=1531
+IO_ENOTSUP=1532
+IO_ENOTTY=1533
+IO_ENXIO=1534
+IO_EPERM=1535
+IO_EPIPE=1536
+IO_ERANGE=1537
+IO_EROFS=1538
+IO_ESPIPE=1539
+IO_ESRCH=1540
+IO_ETIMEDOUT=1541
+IO_EXDEV=1542
+IO_NETWORK_ATTEMPT=1543
+IO_ENCODER=1544
+IO_FLUSH=1545
+IO_WRITE=1546
+IO_NO_INPUT=1547
+IO_BUFFER_FULL=1548
+IO_LOAD_ERROR=1549
+IO_ENOTSOCK=1550
+IO_EISCONN=1551
+IO_ECONNREFUSED=1552
+IO_ENETUNREACH=1553
+IO_EADDRINUSE=1554
+IO_EALREADY=1555
+IO_EAFNOSUPPORT=1556
+XINCLUDE_RECURSION=1600
+XINCLUDE_PARSE_VALUE=1601
+XINCLUDE_ENTITY_DEF_MISMATCH=1602
+XINCLUDE_NO_HREF=1603
+XINCLUDE_NO_FALLBACK=1604
+XINCLUDE_HREF_URI=1605
+XINCLUDE_TEXT_FRAGMENT=1606
+XINCLUDE_TEXT_DOCUMENT=1607
+XINCLUDE_INVALID_CHAR=1608
+XINCLUDE_BUILD_FAILED=1609
+XINCLUDE_UNKNOWN_ENCODING=1610
+XINCLUDE_MULTIPLE_ROOT=1611
+XINCLUDE_XPTR_FAILED=1612
+XINCLUDE_XPTR_RESULT=1613
+XINCLUDE_INCLUDE_IN_INCLUDE=1614
+XINCLUDE_FALLBACKS_IN_INCLUDE=1615
+XINCLUDE_FALLBACK_NOT_IN_INCLUDE=1616
+XINCLUDE_DEPRECATED_NS=1617
+XINCLUDE_FRAGMENT_ID=1618
+CATALOG_MISSING_ATTR=1650
+CATALOG_ENTRY_BROKEN=1651
+CATALOG_PREFER_VALUE=1652
+CATALOG_NOT_CATALOG=1653
+CATALOG_RECURSION=1654
+SCHEMAP_PREFIX_UNDEFINED=1700
+SCHEMAP_ATTRFORMDEFAULT_VALUE=1701
+SCHEMAP_ATTRGRP_NONAME_NOREF=1702
+SCHEMAP_ATTR_NONAME_NOREF=1703
+SCHEMAP_COMPLEXTYPE_NONAME_NOREF=1704
+SCHEMAP_ELEMFORMDEFAULT_VALUE=1705
+SCHEMAP_ELEM_NONAME_NOREF=1706
+SCHEMAP_EXTENSION_NO_BASE=1707
+SCHEMAP_FACET_NO_VALUE=1708
+SCHEMAP_FAILED_BUILD_IMPORT=1709
+SCHEMAP_GROUP_NONAME_NOREF=1710
+SCHEMAP_IMPORT_NAMESPACE_NOT_URI=1711
+SCHEMAP_IMPORT_REDEFINE_NSNAME=1712
+SCHEMAP_IMPORT_SCHEMA_NOT_URI=1713
+SCHEMAP_INVALID_BOOLEAN=1714
+SCHEMAP_INVALID_ENUM=1715
+SCHEMAP_INVALID_FACET=1716
+SCHEMAP_INVALID_FACET_VALUE=1717
+SCHEMAP_INVALID_MAXOCCURS=1718
+SCHEMAP_INVALID_MINOCCURS=1719
+SCHEMAP_INVALID_REF_AND_SUBTYPE=1720
+SCHEMAP_INVALID_WHITE_SPACE=1721
+SCHEMAP_NOATTR_NOREF=1722
+SCHEMAP_NOTATION_NO_NAME=1723
+SCHEMAP_NOTYPE_NOREF=1724
+SCHEMAP_REF_AND_SUBTYPE=1725
+SCHEMAP_RESTRICTION_NONAME_NOREF=1726
+SCHEMAP_SIMPLETYPE_NONAME=1727
+SCHEMAP_TYPE_AND_SUBTYPE=1728
+SCHEMAP_UNKNOWN_ALL_CHILD=1729
+SCHEMAP_UNKNOWN_ANYATTRIBUTE_CHILD=1730
+SCHEMAP_UNKNOWN_ATTR_CHILD=1731
+SCHEMAP_UNKNOWN_ATTRGRP_CHILD=1732
+SCHEMAP_UNKNOWN_ATTRIBUTE_GROUP=1733
+SCHEMAP_UNKNOWN_BASE_TYPE=1734
+SCHEMAP_UNKNOWN_CHOICE_CHILD=1735
+SCHEMAP_UNKNOWN_COMPLEXCONTENT_CHILD=1736
+SCHEMAP_UNKNOWN_COMPLEXTYPE_CHILD=1737
+SCHEMAP_UNKNOWN_ELEM_CHILD=1738
+SCHEMAP_UNKNOWN_EXTENSION_CHILD=1739
+SCHEMAP_UNKNOWN_FACET_CHILD=1740
+SCHEMAP_UNKNOWN_FACET_TYPE=1741
+SCHEMAP_UNKNOWN_GROUP_CHILD=1742
+SCHEMAP_UNKNOWN_IMPORT_CHILD=1743
+SCHEMAP_UNKNOWN_LIST_CHILD=1744
+SCHEMAP_UNKNOWN_NOTATION_CHILD=1745
+SCHEMAP_UNKNOWN_PROCESSCONTENT_CHILD=1746
+SCHEMAP_UNKNOWN_REF=1747
+SCHEMAP_UNKNOWN_RESTRICTION_CHILD=1748
+SCHEMAP_UNKNOWN_SCHEMAS_CHILD=1749
+SCHEMAP_UNKNOWN_SEQUENCE_CHILD=1750
+SCHEMAP_UNKNOWN_SIMPLECONTENT_CHILD=1751
+SCHEMAP_UNKNOWN_SIMPLETYPE_CHILD=1752
+SCHEMAP_UNKNOWN_TYPE=1753
+SCHEMAP_UNKNOWN_UNION_CHILD=1754
+SCHEMAP_ELEM_DEFAULT_FIXED=1755
+SCHEMAP_REGEXP_INVALID=1756
+SCHEMAP_FAILED_LOAD=1757
+SCHEMAP_NOTHING_TO_PARSE=1758
+SCHEMAP_NOROOT=1759
+SCHEMAP_REDEFINED_GROUP=1760
+SCHEMAP_REDEFINED_TYPE=1761
+SCHEMAP_REDEFINED_ELEMENT=1762
+SCHEMAP_REDEFINED_ATTRGROUP=1763
+SCHEMAP_REDEFINED_ATTR=1764
+SCHEMAP_REDEFINED_NOTATION=1765
+SCHEMAP_FAILED_PARSE=1766
+SCHEMAP_UNKNOWN_PREFIX=1767
+SCHEMAP_DEF_AND_PREFIX=1768
+SCHEMAP_UNKNOWN_INCLUDE_CHILD=1769
+SCHEMAP_INCLUDE_SCHEMA_NOT_URI=1770
+SCHEMAP_INCLUDE_SCHEMA_NO_URI=1771
+SCHEMAP_NOT_SCHEMA=1772
+SCHEMAP_UNKNOWN_MEMBER_TYPE=1773
+SCHEMAP_INVALID_ATTR_USE=1774
+SCHEMAP_RECURSIVE=1775
+SCHEMAP_SUPERNUMEROUS_LIST_ITEM_TYPE=1776
+SCHEMAP_INVALID_ATTR_COMBINATION=1777
+SCHEMAP_INVALID_ATTR_INLINE_COMBINATION=1778
+SCHEMAP_MISSING_SIMPLETYPE_CHILD=1779
+SCHEMAP_INVALID_ATTR_NAME=1780
+SCHEMAP_REF_AND_CONTENT=1781
+SCHEMAP_CT_PROPS_CORRECT_1=1782
+SCHEMAP_CT_PROPS_CORRECT_2=1783
+SCHEMAP_CT_PROPS_CORRECT_3=1784
+SCHEMAP_CT_PROPS_CORRECT_4=1785
+SCHEMAP_CT_PROPS_CORRECT_5=1786
+SCHEMAP_DERIVATION_OK_RESTRICTION_1=1787
+SCHEMAP_DERIVATION_OK_RESTRICTION_2_1_1=1788
+SCHEMAP_DERIVATION_OK_RESTRICTION_2_1_2=1789
+SCHEMAP_DERIVATION_OK_RESTRICTION_2_2=1790
+SCHEMAP_DERIVATION_OK_RESTRICTION_3=1791
+SCHEMAP_WILDCARD_INVALID_NS_MEMBER=1792
+SCHEMAP_INTERSECTION_NOT_EXPRESSIBLE=1793
+SCHEMAP_UNION_NOT_EXPRESSIBLE=1794
+SCHEMAP_SRC_IMPORT_3_1=1795
+SCHEMAP_SRC_IMPORT_3_2=1796
+SCHEMAP_DERIVATION_OK_RESTRICTION_4_1=1797
+SCHEMAP_DERIVATION_OK_RESTRICTION_4_2=1798
+SCHEMAP_DERIVATION_OK_RESTRICTION_4_3=1799
+SCHEMAP_COS_CT_EXTENDS_1_3=1800
+SCHEMAV_NOROOT=1801
+SCHEMAV_UNDECLAREDELEM=1802
+SCHEMAV_NOTTOPLEVEL=1803
+SCHEMAV_MISSING=1804
+SCHEMAV_WRONGELEM=1805
+SCHEMAV_NOTYPE=1806
+SCHEMAV_NOROLLBACK=1807
+SCHEMAV_ISABSTRACT=1808
+SCHEMAV_NOTEMPTY=1809
+SCHEMAV_ELEMCONT=1810
+SCHEMAV_HAVEDEFAULT=1811
+SCHEMAV_NOTNILLABLE=1812
+SCHEMAV_EXTRACONTENT=1813
+SCHEMAV_INVALIDATTR=1814
+SCHEMAV_INVALIDELEM=1815
+SCHEMAV_NOTDETERMINIST=1816
+SCHEMAV_CONSTRUCT=1817
+SCHEMAV_INTERNAL=1818
+SCHEMAV_NOTSIMPLE=1819
+SCHEMAV_ATTRUNKNOWN=1820
+SCHEMAV_ATTRINVALID=1821
+SCHEMAV_VALUE=1822
+SCHEMAV_FACET=1823
+SCHEMAV_CVC_DATATYPE_VALID_1_2_1=1824
+SCHEMAV_CVC_DATATYPE_VALID_1_2_2=1825
+SCHEMAV_CVC_DATATYPE_VALID_1_2_3=1826
+SCHEMAV_CVC_TYPE_3_1_1=1827
+SCHEMAV_CVC_TYPE_3_1_2=1828
+SCHEMAV_CVC_FACET_VALID=1829
+SCHEMAV_CVC_LENGTH_VALID=1830
+SCHEMAV_CVC_MINLENGTH_VALID=1831
+SCHEMAV_CVC_MAXLENGTH_VALID=1832
+SCHEMAV_CVC_MININCLUSIVE_VALID=1833
+SCHEMAV_CVC_MAXINCLUSIVE_VALID=1834
+SCHEMAV_CVC_MINEXCLUSIVE_VALID=1835
+SCHEMAV_CVC_MAXEXCLUSIVE_VALID=1836
+SCHEMAV_CVC_TOTALDIGITS_VALID=1837
+SCHEMAV_CVC_FRACTIONDIGITS_VALID=1838
+SCHEMAV_CVC_PATTERN_VALID=1839
+SCHEMAV_CVC_ENUMERATION_VALID=1840
+SCHEMAV_CVC_COMPLEX_TYPE_2_1=1841
+SCHEMAV_CVC_COMPLEX_TYPE_2_2=1842
+SCHEMAV_CVC_COMPLEX_TYPE_2_3=1843
+SCHEMAV_CVC_COMPLEX_TYPE_2_4=1844
+SCHEMAV_CVC_ELT_1=1845
+SCHEMAV_CVC_ELT_2=1846
+SCHEMAV_CVC_ELT_3_1=1847
+SCHEMAV_CVC_ELT_3_2_1=1848
+SCHEMAV_CVC_ELT_3_2_2=1849
+SCHEMAV_CVC_ELT_4_1=1850
+SCHEMAV_CVC_ELT_4_2=1851
+SCHEMAV_CVC_ELT_4_3=1852
+SCHEMAV_CVC_ELT_5_1_1=1853
+SCHEMAV_CVC_ELT_5_1_2=1854
+SCHEMAV_CVC_ELT_5_2_1=1855
+SCHEMAV_CVC_ELT_5_2_2_1=1856
+SCHEMAV_CVC_ELT_5_2_2_2_1=1857
+SCHEMAV_CVC_ELT_5_2_2_2_2=1858
+SCHEMAV_CVC_ELT_6=1859
+SCHEMAV_CVC_ELT_7=1860
+SCHEMAV_CVC_ATTRIBUTE_1=1861
+SCHEMAV_CVC_ATTRIBUTE_2=1862
+SCHEMAV_CVC_ATTRIBUTE_3=1863
+SCHEMAV_CVC_ATTRIBUTE_4=1864
+SCHEMAV_CVC_COMPLEX_TYPE_3_1=1865
+SCHEMAV_CVC_COMPLEX_TYPE_3_2_1=1866
+SCHEMAV_CVC_COMPLEX_TYPE_3_2_2=1867
+SCHEMAV_CVC_COMPLEX_TYPE_4=1868
+SCHEMAV_CVC_COMPLEX_TYPE_5_1=1869
+SCHEMAV_CVC_COMPLEX_TYPE_5_2=1870
+SCHEMAV_ELEMENT_CONTENT=1871
+SCHEMAV_DOCUMENT_ELEMENT_MISSING=1872
+SCHEMAV_CVC_COMPLEX_TYPE_1=1873
+SCHEMAV_CVC_AU=1874
+SCHEMAV_CVC_TYPE_1=1875
+SCHEMAV_CVC_TYPE_2=1876
+SCHEMAV_CVC_IDC=1877
+SCHEMAV_CVC_WILDCARD=1878
+SCHEMAV_MISC=1879
+XPTR_UNKNOWN_SCHEME=1900
+XPTR_CHILDSEQ_START=1901
+XPTR_EVAL_FAILED=1902
+XPTR_EXTRA_OBJECTS=1903
+C14N_CREATE_CTXT=1950
+C14N_REQUIRES_UTF8=1951
+C14N_CREATE_STACK=1952
+C14N_INVALID_NODE=1953
+C14N_UNKNOW_NODE=1954
+C14N_RELATIVE_NAMESPACE=1955
+FTP_PASV_ANSWER=2000
+FTP_EPSV_ANSWER=2001
+FTP_ACCNT=2002
+FTP_URL_SYNTAX=2003
+HTTP_URL_SYNTAX=2020
+HTTP_USE_IP=2021
+HTTP_UNKNOWN_HOST=2022
+SCHEMAP_SRC_SIMPLE_TYPE_1=3000
+SCHEMAP_SRC_SIMPLE_TYPE_2=3001
+SCHEMAP_SRC_SIMPLE_TYPE_3=3002
+SCHEMAP_SRC_SIMPLE_TYPE_4=3003
+SCHEMAP_SRC_RESOLVE=3004
+SCHEMAP_SRC_RESTRICTION_BASE_OR_SIMPLETYPE=3005
+SCHEMAP_SRC_LIST_ITEMTYPE_OR_SIMPLETYPE=3006
+SCHEMAP_SRC_UNION_MEMBERTYPES_OR_SIMPLETYPES=3007
+SCHEMAP_ST_PROPS_CORRECT_1=3008
+SCHEMAP_ST_PROPS_CORRECT_2=3009
+SCHEMAP_ST_PROPS_CORRECT_3=3010
+SCHEMAP_COS_ST_RESTRICTS_1_1=3011
+SCHEMAP_COS_ST_RESTRICTS_1_2=3012
+SCHEMAP_COS_ST_RESTRICTS_1_3_1=3013
+SCHEMAP_COS_ST_RESTRICTS_1_3_2=3014
+SCHEMAP_COS_ST_RESTRICTS_2_1=3015
+SCHEMAP_COS_ST_RESTRICTS_2_3_1_1=3016
+SCHEMAP_COS_ST_RESTRICTS_2_3_1_2=3017
+SCHEMAP_COS_ST_RESTRICTS_2_3_2_1=3018
+SCHEMAP_COS_ST_RESTRICTS_2_3_2_2=3019
+SCHEMAP_COS_ST_RESTRICTS_2_3_2_3=3020
+SCHEMAP_COS_ST_RESTRICTS_2_3_2_4=3021
+SCHEMAP_COS_ST_RESTRICTS_2_3_2_5=3022
+SCHEMAP_COS_ST_RESTRICTS_3_1=3023
+SCHEMAP_COS_ST_RESTRICTS_3_3_1=3024
+SCHEMAP_COS_ST_RESTRICTS_3_3_1_2=3025
+SCHEMAP_COS_ST_RESTRICTS_3_3_2_2=3026
+SCHEMAP_COS_ST_RESTRICTS_3_3_2_1=3027
+SCHEMAP_COS_ST_RESTRICTS_3_3_2_3=3028
+SCHEMAP_COS_ST_RESTRICTS_3_3_2_4=3029
+SCHEMAP_COS_ST_RESTRICTS_3_3_2_5=3030
+SCHEMAP_COS_ST_DERIVED_OK_2_1=3031
+SCHEMAP_COS_ST_DERIVED_OK_2_2=3032
+SCHEMAP_S4S_ELEM_NOT_ALLOWED=3033
+SCHEMAP_S4S_ELEM_MISSING=3034
+SCHEMAP_S4S_ATTR_NOT_ALLOWED=3035
+SCHEMAP_S4S_ATTR_MISSING=3036
+SCHEMAP_S4S_ATTR_INVALID_VALUE=3037
+SCHEMAP_SRC_ELEMENT_1=3038
+SCHEMAP_SRC_ELEMENT_2_1=3039
+SCHEMAP_SRC_ELEMENT_2_2=3040
+SCHEMAP_SRC_ELEMENT_3=3041
+SCHEMAP_P_PROPS_CORRECT_1=3042
+SCHEMAP_P_PROPS_CORRECT_2_1=3043
+SCHEMAP_P_PROPS_CORRECT_2_2=3044
+SCHEMAP_E_PROPS_CORRECT_2=3045
+SCHEMAP_E_PROPS_CORRECT_3=3046
+SCHEMAP_E_PROPS_CORRECT_4=3047
+SCHEMAP_E_PROPS_CORRECT_5=3048
+SCHEMAP_E_PROPS_CORRECT_6=3049
+SCHEMAP_SRC_INCLUDE=3050
+SCHEMAP_SRC_ATTRIBUTE_1=3051
+SCHEMAP_SRC_ATTRIBUTE_2=3052
+SCHEMAP_SRC_ATTRIBUTE_3_1=3053
+SCHEMAP_SRC_ATTRIBUTE_3_2=3054
+SCHEMAP_SRC_ATTRIBUTE_4=3055
+SCHEMAP_NO_XMLNS=3056
+SCHEMAP_NO_XSI=3057
+SCHEMAP_COS_VALID_DEFAULT_1=3058
+SCHEMAP_COS_VALID_DEFAULT_2_1=3059
+SCHEMAP_COS_VALID_DEFAULT_2_2_1=3060
+SCHEMAP_COS_VALID_DEFAULT_2_2_2=3061
+SCHEMAP_CVC_SIMPLE_TYPE=3062
+SCHEMAP_COS_CT_EXTENDS_1_1=3063
+SCHEMAP_SRC_IMPORT_1_1=3064
+SCHEMAP_SRC_IMPORT_1_2=3065
+SCHEMAP_SRC_IMPORT_2=3066
+SCHEMAP_SRC_IMPORT_2_1=3067
+SCHEMAP_SRC_IMPORT_2_2=3068
+SCHEMAP_INTERNAL=3069
+SCHEMAP_NOT_DETERMINISTIC=3070
+SCHEMAP_SRC_ATTRIBUTE_GROUP_1=3071
+SCHEMAP_SRC_ATTRIBUTE_GROUP_2=3072
+SCHEMAP_SRC_ATTRIBUTE_GROUP_3=3073
+SCHEMAP_MG_PROPS_CORRECT_1=3074
+SCHEMAP_MG_PROPS_CORRECT_2=3075
+SCHEMAP_SRC_CT_1=3076
+SCHEMAP_DERIVATION_OK_RESTRICTION_2_1_3=3077
+SCHEMAP_AU_PROPS_CORRECT_2=3078
+SCHEMAP_A_PROPS_CORRECT_2=3079
+SCHEMAP_C_PROPS_CORRECT=3080
+SCHEMAP_SRC_REDEFINE=3081
+SCHEMAP_SRC_IMPORT=3082
+SCHEMAP_WARN_SKIP_SCHEMA=3083
+SCHEMAP_WARN_UNLOCATED_SCHEMA=3084
+SCHEMAP_WARN_ATTR_REDECL_PROH=3085
+SCHEMAP_WARN_ATTR_POINTLESS_PROH=3086
+SCHEMAP_AG_PROPS_CORRECT=3087
+SCHEMAP_COS_CT_EXTENDS_1_2=3088
+SCHEMAP_AU_PROPS_CORRECT=3089
+SCHEMAP_A_PROPS_CORRECT_3=3090
+SCHEMAP_COS_ALL_LIMITED=3091
+SCHEMATRONV_ASSERT=4000
+SCHEMATRONV_REPORT=4001
+MODULE_OPEN=4900
+MODULE_CLOSE=4901
+CHECK_FOUND_ELEMENT=5000
+CHECK_FOUND_ATTRIBUTE=5001
+CHECK_FOUND_TEXT=5002
+CHECK_FOUND_CDATA=5003
+CHECK_FOUND_ENTITYREF=5004
+CHECK_FOUND_ENTITY=5005
+CHECK_FOUND_PI=5006
+CHECK_FOUND_COMMENT=5007
+CHECK_FOUND_DOCTYPE=5008
+CHECK_FOUND_FRAGMENT=5009
+CHECK_FOUND_NOTATION=5010
+CHECK_UNKNOWN_NODE=5011
+CHECK_ENTITY_TYPE=5012
+CHECK_NO_PARENT=5013
+CHECK_NO_DOC=5014
+CHECK_NO_NAME=5015
+CHECK_NO_ELEM=5016
+CHECK_WRONG_DOC=5017
+CHECK_NO_PREV=5018
+CHECK_WRONG_PREV=5019
+CHECK_NO_NEXT=5020
+CHECK_WRONG_NEXT=5021
+CHECK_NOT_DTD=5022
+CHECK_NOT_ATTR=5023
+CHECK_NOT_ATTR_DECL=5024
+CHECK_NOT_ELEM_DECL=5025
+CHECK_NOT_ENTITY_DECL=5026
+CHECK_NOT_NS_DECL=5027
+CHECK_NO_HREF=5028
+CHECK_WRONG_PARENT=5029
+CHECK_NS_SCOPE=5030
+CHECK_NS_ANCESTOR=5031
+CHECK_NOT_UTF8=5032
+CHECK_NO_DICT=5033
+CHECK_NOT_NCNAME=5034
+CHECK_OUTSIDE_DICT=5035
+CHECK_WRONG_NAME=5036
+CHECK_NAME_NOT_NULL=5037
+I18N_NO_NAME=6000
+I18N_NO_HANDLER=6001
+I18N_EXCESS_HANDLER=6002
+I18N_CONV_FAILED=6003
+I18N_NO_OUTPUT=6004
+BUF_OVERFLOW=7000
+"""
+
+cdef object __RELAXNG_ERROR_TYPES = """\
+RELAXNG_OK=0
+RELAXNG_ERR_MEMORY=1
+RELAXNG_ERR_TYPE=2
+RELAXNG_ERR_TYPEVAL=3
+RELAXNG_ERR_DUPID=4
+RELAXNG_ERR_TYPECMP=5
+RELAXNG_ERR_NOSTATE=6
+RELAXNG_ERR_NODEFINE=7
+RELAXNG_ERR_LISTEXTRA=8
+RELAXNG_ERR_LISTEMPTY=9
+RELAXNG_ERR_INTERNODATA=10
+RELAXNG_ERR_INTERSEQ=11
+RELAXNG_ERR_INTEREXTRA=12
+RELAXNG_ERR_ELEMNAME=13
+RELAXNG_ERR_ATTRNAME=14
+RELAXNG_ERR_ELEMNONS=15
+RELAXNG_ERR_ATTRNONS=16
+RELAXNG_ERR_ELEMWRONGNS=17
+RELAXNG_ERR_ATTRWRONGNS=18
+RELAXNG_ERR_ELEMEXTRANS=19
+RELAXNG_ERR_ATTREXTRANS=20
+RELAXNG_ERR_ELEMNOTEMPTY=21
+RELAXNG_ERR_NOELEM=22
+RELAXNG_ERR_NOTELEM=23
+RELAXNG_ERR_ATTRVALID=24
+RELAXNG_ERR_CONTENTVALID=25
+RELAXNG_ERR_EXTRACONTENT=26
+RELAXNG_ERR_INVALIDATTR=27
+RELAXNG_ERR_DATAELEM=28
+RELAXNG_ERR_VALELEM=29
+RELAXNG_ERR_LISTELEM=30
+RELAXNG_ERR_DATATYPE=31
+RELAXNG_ERR_VALUE=32
+RELAXNG_ERR_LIST=33
+RELAXNG_ERR_NOGRAMMAR=34
+RELAXNG_ERR_EXTRADATA=35
+RELAXNG_ERR_LACKDATA=36
+RELAXNG_ERR_INTERNAL=37
+RELAXNG_ERR_ELEMWRONG=38
+RELAXNG_ERR_TEXTWRONG=39
+"""
+# --- END: GENERATED CONSTANTS ---
+
+__initErrorConstants()
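+
+# Usage sketch (assuming the generated names above are exposed unchanged on the
+# public ``lxml.etree`` module as ``ErrorTypes`` / ``RelaxNGErrorTypes``):
+#
+#   >>> from lxml import etree
+#   >>> etree.ErrorTypes.ERR_USER_STOP              # an integer error code
+#   >>> etree.RelaxNGErrorTypes.RELAXNG_ERR_ELEMNAME
+#   >>> # entries of a parser's error_log can be compared against these constants,
+#   >>> # e.g.: entry.type == etree.ErrorTypes.ERR_USER_STOP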
diff --git a/src/lxml/xmlid.pxi b/src/lxml/xmlid.pxi
new file mode 100644
index 0000000..c1f2bbf
--- /dev/null
+++ b/src/lxml/xmlid.pxi
@@ -0,0 +1,179 @@
+cdef object _find_id_attributes
+
+def XMLID(text, parser=None, *, base_url=None):
+ u"""XMLID(text, parser=None, base_url=None)
+
+ Parse the text and return a tuple (root node, ID dictionary). The root
+ node is the same as returned by the XML() function. The dictionary
+ contains string-element pairs. The dictionary keys are the values of 'id'
+ attributes. The elements referenced by the ID are stored as dictionary
+ values.
+ """
+ cdef dict dic
+ global _find_id_attributes
+ if _find_id_attributes is None:
+ _find_id_attributes = XPath(u'//*[string(@id)]')
+
+ # ElementTree compatible implementation: parse and look for 'id' attributes
+ root = XML(text, parser, base_url=base_url)
+ dic = {}
+ for elem in _find_id_attributes(root):
+ dic[elem.get(u'id')] = elem
+ return root, dic
+
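+# Usage sketch (a minimal example, assuming the usual ``lxml.etree`` import):
+#
+#   >>> from lxml import etree
+#   >>> root, ids = etree.XMLID(b'<doc><p id="intro">text</p></doc>')
+#   >>> ids['intro'].tag
+#   'p'
+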
+def XMLDTDID(text, parser=None, *, base_url=None):
+ u"""XMLDTDID(text, parser=None, base_url=None)
+
+ Parse the text and return a tuple (root node, ID dictionary). The root
+ node is the same as returned by the XML() function. The dictionary
+ contains string-element pairs. The dictionary keys are the values of ID
+ attributes as defined by the DTD. The elements referenced by the ID are
+ stored as dictionary values.
+
+ Note that you must not modify the XML tree if you use the ID dictionary.
+ The results are undefined.
+ """
+ cdef _Element root
+ root = XML(text, parser, base_url=base_url)
+ # xml:id spec compatible implementation: use DTD ID attributes from libxml2
+ if root._doc._c_doc.ids is NULL:
+ return root, {}
+ else:
+ return root, _IDDict(root)
+
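+# Usage sketch (assuming the default parser reads the internal DTD subset, so
+# that DTD-declared ID attributes end up in libxml2's ID table):
+#
+#   >>> from lxml import etree
+#   >>> xml = (b'<!DOCTYPE doc [<!ATTLIST chapter cid ID #IMPLIED>]>'
+#   ...        b'<doc><chapter cid="c1"/></doc>')
+#   >>> root, ids = etree.XMLDTDID(xml)
+#   >>> ids['c1'].tag
+#   'chapter'
+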
+def parseid(source, parser=None, *, base_url=None):
+ u"""parseid(source, parser=None)
+
+ Parses the source into a tuple containing an ElementTree object and an
+ ID dictionary. If no parser is provided as second argument, the default
+ parser is used.
+
+ Note that you must not modify the XML tree if you use the ID dictionary.
+ The results are undefined.
+ """
+ cdef _Document doc
+ doc = _parseDocument(source, parser, base_url)
+ return _elementTreeFactory(doc, None), _IDDict(doc)
+
+cdef class _IDDict:
+ u"""IDDict(self, etree)
+ A dictionary-like proxy class that maps ID attributes to elements.
+
+ The dictionary must be instantiated with the root element of a parsed XML
+ document, otherwise the behaviour is undefined. Elements and XML trees
+ that were created or modified 'by hand' are not supported.
+ """
+ cdef _Document _doc
+ cdef object _keys
+ cdef object _items
+ def __cinit__(self, etree):
+ cdef _Document doc
+ doc = _documentOrRaise(etree)
+ if doc._c_doc.ids is NULL:
+ raise ValueError, u"No ID dictionary available."
+ self._doc = doc
+ self._keys = None
+ self._items = None
+
+ def copy(self):
+ return _IDDict(self._doc)
+
+ def __getitem__(self, id_name):
+ cdef tree.xmlHashTable* c_ids
+ cdef tree.xmlID* c_id
+ cdef xmlAttr* c_attr
+ c_ids = self._doc._c_doc.ids
+ id_utf = _utf8(id_name)
+ c_id = <tree.xmlID*>tree.xmlHashLookup(c_ids, _xcstr(id_utf))
+ if c_id is NULL:
+ raise KeyError, u"key not found."
+ c_attr = c_id.attr
+ if c_attr is NULL or c_attr.parent is NULL:
+ raise KeyError, u"ID attribute not found."
+ return _elementFactory(self._doc, c_attr.parent)
+
+ def get(self, id_name):
+ return self[id_name]
+
+ def __contains__(self, id_name):
+ cdef tree.xmlID* c_id
+ id_utf = _utf8(id_name)
+ c_id = <tree.xmlID*>tree.xmlHashLookup(
+ self._doc._c_doc.ids, _xcstr(id_utf))
+ return c_id is not NULL
+
+ def has_key(self, id_name):
+ return id_name in self
+
+ def __repr__(self):
+ return repr(dict(self))
+
+ def keys(self):
+ if self._keys is None:
+ self._keys = self._build_keys()
+ return self._keys[:]
+
+ def __iter__(self):
+ if self._keys is None:
+ self._keys = self._build_keys()
+ return iter(self._keys)
+
+ def iterkeys(self):
+ return self
+
+ def __len__(self):
+ if self._keys is None:
+ self._keys = self._build_keys()
+ return len(self._keys)
+
+ def items(self):
+ if self._items is None:
+ self._items = self._build_items()
+ return self._items[:]
+
+ def iteritems(self):
+ if self._items is None:
+ self._items = self._build_items()
+ return iter(self._items)
+
+ def values(self):
+ cdef list values = []
+ if self._items is None:
+ self._items = self._build_items()
+ for item in self._items:
+ value = python.PyTuple_GET_ITEM(item, 1)
+ python.Py_INCREF(value)
+ values.append(value)
+ return values
+
+ def itervalues(self):
+ return iter(self.values())
+
+ cdef object _build_keys(self):
+ keys = []
+ tree.xmlHashScan(<tree.xmlHashTable*>self._doc._c_doc.ids,
+ <tree.xmlHashScanner>_collectIdHashKeys, <python.PyObject*>keys)
+ return keys
+
+ cdef object _build_items(self):
+ items = []
+ context = (items, self._doc)
+ tree.xmlHashScan(<tree.xmlHashTable*>self._doc._c_doc.ids,
+ <tree.xmlHashScanner>_collectIdHashItemList, <python.PyObject*>context)
+ return items
+
+cdef void _collectIdHashItemList(void* payload, void* context, xmlChar* name):
+ # collect elements from ID attribute hash table
+ cdef list lst
+ c_id = <tree.xmlID*>payload
+ if c_id is NULL or c_id.attr is NULL or c_id.attr.parent is NULL:
+ return
+ lst, doc = <tuple>context
+ element = _elementFactory(doc, c_id.attr.parent)
+ lst.append( (funicode(name), element) )
+
+cdef void _collectIdHashKeys(void* payload, void* collect_list, xmlChar* name):
+ c_id = <tree.xmlID*>payload
+ if c_id is NULL or c_id.attr is NULL or c_id.attr.parent is NULL:
+ return
+ (<list>collect_list).append(funicode(name))
diff --git a/src/lxml/xmlschema.pxi b/src/lxml/xmlschema.pxi
new file mode 100644
index 0000000..ab26d93
--- /dev/null
+++ b/src/lxml/xmlschema.pxi
@@ -0,0 +1,211 @@
+# support for XMLSchema validation
+from lxml.includes cimport xmlschema
+
+
+cdef class XMLSchemaError(LxmlError):
+ """Base class of all XML Schema errors
+ """
+
+cdef class XMLSchemaParseError(XMLSchemaError):
+ """Error while parsing an XML document as XML Schema.
+ """
+
+cdef class XMLSchemaValidateError(XMLSchemaError):
+ """Error while validating an XML document with an XML Schema.
+ """
+
+
+################################################################################
+# XMLSchema
+
+cdef XPath _check_for_default_attributes = XPath(
+ u"boolean(//xs:attribute[@default or @fixed][1])",
+ namespaces={u'xs': u'http://www.w3.org/2001/XMLSchema'})
+
+
+cdef class XMLSchema(_Validator):
+ u"""XMLSchema(self, etree=None, file=None)
+ Turn a document into an XML Schema validator.
+
+ Either pass a schema as Element or ElementTree, or pass a file or
+ filename through the ``file`` keyword argument.
+
+ Passing the ``attribute_defaults`` boolean option will make the
+ schema insert default/fixed attributes into validated documents.
+ """
+ cdef xmlschema.xmlSchema* _c_schema
+ cdef _Document _doc
+ cdef bint _has_default_attributes
+ cdef bint _add_attribute_defaults
+
+ def __cinit__(self):
+ self._has_default_attributes = True # play it safe
+ self._add_attribute_defaults = False
+
+ def __init__(self, etree=None, *, file=None, bint attribute_defaults=False):
+ cdef xmlschema.xmlSchemaParserCtxt* parser_ctxt
+ cdef xmlDoc* c_doc
+
+ self._add_attribute_defaults = attribute_defaults
+ _Validator.__init__(self)
+ c_doc = NULL
+ if etree is not None:
+ doc = _documentOrRaise(etree)
+ root_node = _rootNodeOrRaise(etree)
+ c_doc = _copyDocRoot(doc._c_doc, root_node._c_node)
+ self._doc = _documentFactory(c_doc, doc._parser)
+ parser_ctxt = xmlschema.xmlSchemaNewDocParserCtxt(c_doc)
+ elif file is not None:
+ if _isString(file):
+ filename = _encodeFilename(file)
+ parser_ctxt = xmlschema.xmlSchemaNewParserCtxt(_cstr(filename))
+ else:
+ self._doc = _parseDocument(file, None, None)
+ parser_ctxt = xmlschema.xmlSchemaNewDocParserCtxt(self._doc._c_doc)
+ else:
+ raise XMLSchemaParseError, u"No tree or file given"
+
+ if parser_ctxt is NULL:
+ raise MemoryError()
+
+ xmlschema.xmlSchemaSetParserStructuredErrors(
+ parser_ctxt, _receiveError, <void*>self._error_log)
+ if self._doc is not None:
+ # calling xmlSchemaParse on a schema with imports or
+ # includes will cause libxml2 to create an internal
+ # context for parsing, so push an implied context to route
+ # resolve requests to the document's parser
+ __GLOBAL_PARSER_CONTEXT.pushImpliedContextFromParser(self._doc._parser)
+ with nogil:
+ orig_loader = _register_document_loader()
+ self._c_schema = xmlschema.xmlSchemaParse(parser_ctxt)
+ _reset_document_loader(orig_loader)
+ if self._doc is not None:
+ __GLOBAL_PARSER_CONTEXT.popImpliedContext()
+ xmlschema.xmlSchemaFreeParserCtxt(parser_ctxt)
+
+ if self._c_schema is NULL:
+ raise XMLSchemaParseError(
+ self._error_log._buildExceptionMessage(
+ u"Document is not valid XML Schema"),
+ self._error_log)
+
+ if self._doc is not None:
+ self._has_default_attributes = _check_for_default_attributes(self._doc)
+ self._add_attribute_defaults = attribute_defaults and self._has_default_attributes
+
+ def __dealloc__(self):
+ xmlschema.xmlSchemaFree(self._c_schema)
+
+ def __call__(self, etree):
+ u"""__call__(self, etree)
+
+ Validate a document or element tree against this XML Schema.
+
+ Returns True if the document is valid, False if not.
+ """
+ cdef xmlschema.xmlSchemaValidCtxt* valid_ctxt
+ cdef _Document doc
+ cdef _Element root_node
+ cdef xmlDoc* c_doc
+ cdef int ret
+
+ assert self._c_schema is not NULL, "Schema instance not initialised"
+ doc = _documentOrRaise(etree)
+ root_node = _rootNodeOrRaise(etree)
+
+ valid_ctxt = xmlschema.xmlSchemaNewValidCtxt(self._c_schema)
+ if valid_ctxt is NULL:
+ raise MemoryError()
+
+ try:
+ if self._add_attribute_defaults:
+ xmlschema.xmlSchemaSetValidOptions(
+ valid_ctxt, xmlschema.XML_SCHEMA_VAL_VC_I_CREATE)
+
+ self._error_log.clear()
+ xmlschema.xmlSchemaSetValidStructuredErrors(
+ valid_ctxt, _receiveError, <void*>self._error_log)
+
+ c_doc = _fakeRootDoc(doc._c_doc, root_node._c_node)
+ with nogil:
+ ret = xmlschema.xmlSchemaValidateDoc(valid_ctxt, c_doc)
+ _destroyFakeDoc(doc._c_doc, c_doc)
+ finally:
+ xmlschema.xmlSchemaFreeValidCtxt(valid_ctxt)
+
+ if ret == -1:
+ raise XMLSchemaValidateError(
+ u"Internal error in XML Schema validation.",
+ self._error_log)
+ if ret == 0:
+ return True
+ else:
+ return False
+
+ cdef _ParserSchemaValidationContext _newSaxValidator(
+ self, bint add_default_attributes):
+ cdef _ParserSchemaValidationContext context
+ context = _ParserSchemaValidationContext.__new__(_ParserSchemaValidationContext)
+ context._schema = self
+ context._add_default_attributes = (self._has_default_attributes and (
+ add_default_attributes or self._add_attribute_defaults))
+ return context
+
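+# Usage sketch (a minimal validation round trip with an inline schema):
+#
+#   >>> from lxml import etree
+#   >>> schema_doc = etree.XML(b'''
+#   ... <xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
+#   ...   <xs:element name="a" type="xs:string"/>
+#   ... </xs:schema>''')
+#   >>> schema = etree.XMLSchema(schema_doc)
+#   >>> schema(etree.XML(b'<a>text</a>'))
+#   True
+#   >>> schema(etree.XML(b'<b/>'))      # details end up in schema.error_log
+#   False
+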
+@cython.final
+@cython.internal
+cdef class _ParserSchemaValidationContext:
+ cdef XMLSchema _schema
+ cdef xmlschema.xmlSchemaValidCtxt* _valid_ctxt
+ cdef xmlschema.xmlSchemaSAXPlugStruct* _sax_plug
+ cdef bint _add_default_attributes
+ def __cinit__(self):
+ self._valid_ctxt = NULL
+ self._sax_plug = NULL
+ self._add_default_attributes = False
+
+ def __dealloc__(self):
+ self.disconnect()
+ if self._valid_ctxt:
+ xmlschema.xmlSchemaFreeValidCtxt(self._valid_ctxt)
+
+ cdef _ParserSchemaValidationContext copy(self):
+ assert self._schema is not None, "_ParserSchemaValidationContext not initialised"
+ return self._schema._newSaxValidator(
+ self._add_default_attributes)
+
+ cdef void inject_default_attributes(self, xmlDoc* c_doc):
+ # we currently need to insert default attributes manually
+ # after parsing, as libxml2 does not support this at parse
+ # time
+ if self._add_default_attributes:
+ with nogil:
+ xmlschema.xmlSchemaValidateDoc(self._valid_ctxt, c_doc)
+
+ cdef int connect(self, xmlparser.xmlParserCtxt* c_ctxt, _BaseErrorLog error_log) except -1:
+ if self._valid_ctxt is NULL:
+ self._valid_ctxt = xmlschema.xmlSchemaNewValidCtxt(
+ self._schema._c_schema)
+ if self._valid_ctxt is NULL:
+ raise MemoryError()
+ if self._add_default_attributes:
+ xmlschema.xmlSchemaSetValidOptions(
+ self._valid_ctxt, xmlschema.XML_SCHEMA_VAL_VC_I_CREATE)
+ if error_log is not None:
+ xmlschema.xmlSchemaSetValidStructuredErrors(
+ self._valid_ctxt, _receiveError, <void*>error_log)
+ self._sax_plug = xmlschema.xmlSchemaSAXPlug(
+ self._valid_ctxt, &c_ctxt.sax, &c_ctxt.userData)
+
+ cdef void disconnect(self):
+ if self._sax_plug is not NULL:
+ xmlschema.xmlSchemaSAXUnplug(self._sax_plug)
+ self._sax_plug = NULL
+ if self._valid_ctxt is not NULL:
+ xmlschema.xmlSchemaSetValidStructuredErrors(
+ self._valid_ctxt, NULL, NULL)
+
+ cdef bint isvalid(self):
+ if self._valid_ctxt is NULL:
+ return 1 # valid
+ return xmlschema.xmlSchemaIsValid(self._valid_ctxt)
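+
+# Usage sketch (parse-time validation: the SAX validation context above is what
+# the ``schema`` keyword of ``etree.XMLParser`` hooks into):
+#
+#   >>> from lxml import etree
+#   >>> schema = etree.XMLSchema(etree.XML(b'''
+#   ... <xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">
+#   ...   <xs:element name="a" type="xs:string"/>
+#   ... </xs:schema>'''))
+#   >>> parser = etree.XMLParser(schema=schema)
+#   >>> etree.XML(b'<a>ok</a>', parser).tag
+#   'a'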
diff --git a/src/lxml/xpath.pxi b/src/lxml/xpath.pxi
new file mode 100644
index 0000000..a7cae4b
--- /dev/null
+++ b/src/lxml/xpath.pxi
@@ -0,0 +1,502 @@
+# XPath evaluation
+
+class XPathSyntaxError(LxmlSyntaxError, XPathError):
+ pass
+
+################################################################################
+# XPath
+
+cdef object _XPATH_SYNTAX_ERRORS = (
+ xmlerror.XML_XPATH_NUMBER_ERROR,
+ xmlerror.XML_XPATH_UNFINISHED_LITERAL_ERROR,
+ xmlerror.XML_XPATH_VARIABLE_REF_ERROR,
+ xmlerror.XML_XPATH_INVALID_PREDICATE_ERROR,
+ xmlerror.XML_XPATH_UNCLOSED_ERROR,
+ xmlerror.XML_XPATH_INVALID_CHAR_ERROR
+)
+
+cdef object _XPATH_EVAL_ERRORS = (
+ xmlerror.XML_XPATH_UNDEF_VARIABLE_ERROR,
+ xmlerror.XML_XPATH_UNDEF_PREFIX_ERROR,
+ xmlerror.XML_XPATH_UNKNOWN_FUNC_ERROR,
+ xmlerror.XML_XPATH_INVALID_OPERAND,
+ xmlerror.XML_XPATH_INVALID_TYPE,
+ xmlerror.XML_XPATH_INVALID_ARITY,
+ xmlerror.XML_XPATH_INVALID_CTXT_SIZE,
+ xmlerror.XML_XPATH_INVALID_CTXT_POSITION
+)
+
+cdef int _register_xpath_function(void* ctxt, name_utf, ns_utf):
+ if ns_utf is None:
+ return xpath.xmlXPathRegisterFunc(
+ <xpath.xmlXPathContext*>ctxt, _xcstr(name_utf),
+ _xpath_function_call)
+ else:
+ return xpath.xmlXPathRegisterFuncNS(
+ <xpath.xmlXPathContext*>ctxt, _xcstr(name_utf), _xcstr(ns_utf),
+ _xpath_function_call)
+
+cdef int _unregister_xpath_function(void* ctxt, name_utf, ns_utf):
+ if ns_utf is None:
+ return xpath.xmlXPathRegisterFunc(
+ <xpath.xmlXPathContext*>ctxt, _xcstr(name_utf), NULL)
+ else:
+ return xpath.xmlXPathRegisterFuncNS(
+ <xpath.xmlXPathContext*>ctxt, _xcstr(name_utf), _xcstr(ns_utf), NULL)
+
+
+@cython.final
+@cython.internal
+cdef class _XPathContext(_BaseContext):
+ cdef object _variables
+ def __init__(self, namespaces, extensions, error_log, enable_regexp, variables,
+ build_smart_strings):
+ self._variables = variables
+ _BaseContext.__init__(self, namespaces, extensions, error_log, enable_regexp,
+ build_smart_strings)
+
+ cdef set_context(self, xpath.xmlXPathContext* xpathCtxt):
+ self._set_xpath_context(xpathCtxt)
+ # This would be a good place to set up the XPath parser dict, but
+ # we cannot use the current thread dict as we do not know which
+ # thread will execute the XPath evaluator - so, no dict for now.
+ self.registerLocalNamespaces()
+ self.registerLocalFunctions(xpathCtxt, _register_xpath_function)
+
+ cdef register_context(self, _Document doc):
+ self._register_context(doc)
+ self.registerGlobalNamespaces()
+ self.registerGlobalFunctions(self._xpathCtxt, _register_xpath_function)
+ self.registerExsltFunctions()
+ if self._variables is not None:
+ self.registerVariables(self._variables)
+
+ cdef unregister_context(self):
+ self.unregisterGlobalFunctions(
+ self._xpathCtxt, _unregister_xpath_function)
+ self.unregisterGlobalNamespaces()
+ xpath.xmlXPathRegisteredVariablesCleanup(self._xpathCtxt)
+ self._cleanup_context()
+
+ cdef void registerExsltFunctions(self):
+ if xslt.LIBXSLT_VERSION < 10125:
+ # we'd only execute dummy functions anyway
+ return
+ tree.xmlHashScan(
+ self._xpathCtxt.nsHash, _registerExsltFunctionsForNamespaces,
+ self._xpathCtxt)
+
+ cdef registerVariables(self, variable_dict):
+ for name, value in variable_dict.items():
+ name_utf = self._to_utf(name)
+ xpath.xmlXPathRegisterVariable(
+ self._xpathCtxt, _xcstr(name_utf), _wrapXPathObject(value, None, None))
+
+ cdef registerVariable(self, name, value):
+ name_utf = self._to_utf(name)
+ xpath.xmlXPathRegisterVariable(
+ self._xpathCtxt, _xcstr(name_utf), _wrapXPathObject(value, None, None))
+
+
+cdef void _registerExsltFunctionsForNamespaces(
+ void* _c_href, void* _ctxt, const_xmlChar* c_prefix):
+ c_href = <const_xmlChar*> _c_href
+ ctxt = <xpath.xmlXPathContext*> _ctxt
+
+ if tree.xmlStrcmp(c_href, xslt.EXSLT_DATE_NAMESPACE) == 0:
+ xslt.exsltDateXpathCtxtRegister(ctxt, c_prefix)
+ elif tree.xmlStrcmp(c_href, xslt.EXSLT_SETS_NAMESPACE) == 0:
+ xslt.exsltSetsXpathCtxtRegister(ctxt, c_prefix)
+ elif tree.xmlStrcmp(c_href, xslt.EXSLT_MATH_NAMESPACE) == 0:
+ xslt.exsltMathXpathCtxtRegister(ctxt, c_prefix)
+ elif tree.xmlStrcmp(c_href, xslt.EXSLT_STRINGS_NAMESPACE) == 0:
+ xslt.exsltStrXpathCtxtRegister(ctxt, c_prefix)
+
+
+cdef class _XPathEvaluatorBase:
+ cdef xpath.xmlXPathContext* _xpathCtxt
+ cdef _XPathContext _context
+ cdef python.PyThread_type_lock _eval_lock
+ cdef _ErrorLog _error_log
+ def __cinit__(self):
+ self._xpathCtxt = NULL
+ if config.ENABLE_THREADING:
+ self._eval_lock = python.PyThread_allocate_lock()
+ if self._eval_lock is NULL:
+ raise MemoryError()
+ self._error_log = _ErrorLog()
+
+ def __init__(self, namespaces, extensions, enable_regexp,
+ smart_strings):
+ self._context = _XPathContext(namespaces, extensions, self._error_log,
+ enable_regexp, None, smart_strings)
+
+ @property
+ def error_log(self):
+ assert self._error_log is not None, "XPath evaluator not initialised"
+ return self._error_log.copy()
+
+ def __dealloc__(self):
+ if self._xpathCtxt is not NULL:
+ xpath.xmlXPathFreeContext(self._xpathCtxt)
+ if config.ENABLE_THREADING:
+ if self._eval_lock is not NULL:
+ python.PyThread_free_lock(self._eval_lock)
+
+ cdef set_context(self, xpath.xmlXPathContext* xpathCtxt):
+ self._xpathCtxt = xpathCtxt
+ self._context.set_context(xpathCtxt)
+
+ def evaluate(self, _eval_arg, **_variables):
+ u"""evaluate(self, _eval_arg, **_variables)
+
+ Evaluate an XPath expression.
+
+ Instead of calling this method, you can also call the evaluator object
+ itself.
+
+ Variables may be provided as keyword arguments. Note that namespaces
+ are currently not supported for variables.
+
+ :deprecated: call the object, not its method.
+ """
+ return self(_eval_arg, **_variables)
+
+ cdef bint _checkAbsolutePath(self, char* path):
+ cdef char c
+ if path is NULL:
+ return 0
+ c = path[0]
+ while c == c' ' or c == c'\t':
+ path = path + 1
+ c = path[0]
+ return c == c'/'
+
+ @cython.final
+ cdef int _lock(self) except -1:
+ cdef int result
+ if config.ENABLE_THREADING and self._eval_lock != NULL:
+ with nogil:
+ result = python.PyThread_acquire_lock(
+ self._eval_lock, python.WAIT_LOCK)
+ if result == 0:
+ raise XPathError, u"XPath evaluator locking failed"
+ return 0
+
+ @cython.final
+ cdef void _unlock(self):
+ if config.ENABLE_THREADING and self._eval_lock != NULL:
+ python.PyThread_release_lock(self._eval_lock)
+
+ cdef _build_parse_error(self):
+ cdef _BaseErrorLog entries
+ entries = self._error_log.filter_types(_XPATH_SYNTAX_ERRORS)
+ if entries:
+ message = entries._buildExceptionMessage(None)
+ if message is not None:
+ return XPathSyntaxError(message, self._error_log)
+ return XPathSyntaxError(
+ self._error_log._buildExceptionMessage(u"Error in xpath expression"),
+ self._error_log)
+
+ cdef _build_eval_error(self):
+ cdef _BaseErrorLog entries
+ entries = self._error_log.filter_types(_XPATH_EVAL_ERRORS)
+ if not entries:
+ entries = self._error_log.filter_types(_XPATH_SYNTAX_ERRORS)
+ if entries:
+ message = entries._buildExceptionMessage(None)
+ if message is not None:
+ return XPathEvalError(message, self._error_log)
+ return XPathEvalError(
+ self._error_log._buildExceptionMessage(u"Error in xpath expression"),
+ self._error_log)
+
+ cdef object _handle_result(self, xpath.xmlXPathObject* xpathObj, _Document doc):
+ if self._context._exc._has_raised():
+ if xpathObj is not NULL:
+ _freeXPathObject(xpathObj)
+ xpathObj = NULL
+ self._context._release_temp_refs()
+ self._context._exc._raise_if_stored()
+
+ if xpathObj is NULL:
+ self._context._release_temp_refs()
+ raise self._build_eval_error()
+
+ try:
+ result = _unwrapXPathObject(xpathObj, doc, self._context)
+ finally:
+ _freeXPathObject(xpathObj)
+ self._context._release_temp_refs()
+
+ return result
+
+
+cdef class XPathElementEvaluator(_XPathEvaluatorBase):
+ u"""XPathElementEvaluator(self, element, namespaces=None, extensions=None, regexp=True, smart_strings=True)
+ Create an XPath evaluator for an element.
+
+ Absolute XPath expressions (starting with '/') will be evaluated against
+ the ElementTree as returned by getroottree().
+
+ Additional namespace declarations can be passed with the
+ 'namespaces' keyword argument. EXSLT regular expression support
+ can be disabled with the 'regexp' boolean keyword (defaults to
+ True). Smart strings will be returned for string results unless
+ you pass ``smart_strings=False``.
+ """
+ cdef _Element _element
+ def __init__(self, _Element element not None, *, namespaces=None,
+ extensions=None, regexp=True, smart_strings=True):
+ cdef xpath.xmlXPathContext* xpathCtxt
+ cdef int ns_register_status
+ cdef _Document doc
+ _assertValidNode(element)
+ _assertValidDoc(element._doc)
+ self._element = element
+ doc = element._doc
+ _XPathEvaluatorBase.__init__(self, namespaces, extensions,
+ regexp, smart_strings)
+ xpathCtxt = xpath.xmlXPathNewContext(doc._c_doc)
+ if xpathCtxt is NULL:
+ raise MemoryError()
+ self.set_context(xpathCtxt)
+
+ def register_namespace(self, prefix, uri):
+ u"""Register a namespace with the XPath context.
+ """
+ assert self._xpathCtxt is not NULL, "XPath context not initialised"
+ self._context.addNamespace(prefix, uri)
+
+ def register_namespaces(self, namespaces):
+ u"""Register a prefix -> uri dict.
+ """
+ assert self._xpathCtxt is not NULL, "XPath context not initialised"
+ for prefix, uri in namespaces.items():
+ self._context.addNamespace(prefix, uri)
+
+ def __call__(self, _path, **_variables):
+ u"""__call__(self, _path, **_variables)
+
+ Evaluate an XPath expression in the context of the element.
+
+ Variables may be provided as keyword arguments. Note that namespaces
+ are currently not supported for variables.
+
+ Absolute XPath expressions (starting with '/') will be evaluated
+ against the ElementTree as returned by getroottree().
+ """
+ cdef xpath.xmlXPathObject* xpathObj
+ cdef _Document doc
+ assert self._xpathCtxt is not NULL, "XPath context not initialised"
+ path = _utf8(_path)
+ doc = self._element._doc
+
+ self._lock()
+ self._xpathCtxt.node = self._element._c_node
+ try:
+ self._context.register_context(doc)
+ self._context.registerVariables(_variables)
+ c_path = _xcstr(path)
+ with nogil:
+ xpathObj = xpath.xmlXPathEvalExpression(
+ c_path, self._xpathCtxt)
+ result = self._handle_result(xpathObj, doc)
+ finally:
+ self._context.unregister_context()
+ self._unlock()
+
+ return result
+
+
+cdef class XPathDocumentEvaluator(XPathElementEvaluator):
+ u"""XPathDocumentEvaluator(self, etree, namespaces=None, extensions=None, regexp=True, smart_strings=True)
+ Create an XPath evaluator for an ElementTree.
+
+ Additional namespace declarations can be passed with the
+ 'namespaces' keyword argument. EXSLT regular expression support
+ can be disabled with the 'regexp' boolean keyword (defaults to
+ True). Smart strings will be returned for string results unless
+ you pass ``smart_strings=False``.
+ """
+ def __init__(self, _ElementTree etree not None, *, namespaces=None,
+ extensions=None, regexp=True, smart_strings=True):
+ XPathElementEvaluator.__init__(
+ self, etree._context_node, namespaces=namespaces,
+ extensions=extensions, regexp=regexp,
+ smart_strings=smart_strings)
+
+ def __call__(self, _path, **_variables):
+ u"""__call__(self, _path, **_variables)
+
+ Evaluate an XPath expression on the document.
+
+ Variables may be provided as keyword arguments. Note that namespaces
+ are currently not supported for variables.
+ """
+ cdef xpath.xmlXPathObject* xpathObj
+ cdef xmlDoc* c_doc
+ cdef _Document doc
+ assert self._xpathCtxt is not NULL, "XPath context not initialised"
+ path = _utf8(_path)
+ doc = self._element._doc
+
+ self._lock()
+ try:
+ self._context.register_context(doc)
+ c_doc = _fakeRootDoc(doc._c_doc, self._element._c_node)
+ try:
+ self._context.registerVariables(_variables)
+ c_path = _xcstr(path)
+ with nogil:
+ self._xpathCtxt.doc = c_doc
+ self._xpathCtxt.node = tree.xmlDocGetRootElement(c_doc)
+ xpathObj = xpath.xmlXPathEvalExpression(
+ c_path, self._xpathCtxt)
+ result = self._handle_result(xpathObj, doc)
+ finally:
+ _destroyFakeDoc(doc._c_doc, c_doc)
+ self._context.unregister_context()
+ finally:
+ self._unlock()
+
+ return result
+
+
+def XPathEvaluator(etree_or_element, *, namespaces=None, extensions=None,
+ regexp=True, smart_strings=True):
+ u"""XPathEvaluator(etree_or_element, namespaces=None, extensions=None, regexp=True, smart_strings=True)
+
+ Creates an XPath evaluator for an ElementTree or an Element.
+
+ The resulting object can be called with an XPath expression as argument
+ and XPath variables provided as keyword arguments.
+
+ Additional namespace declarations can be passed with the
+ 'namespaces' keyword argument. EXSLT regular expression support
+ can be disabled with the 'regexp' boolean keyword (defaults to
+ True). Smart strings will be returned for string results unless
+ you pass ``smart_strings=False``.
+ """
+ if isinstance(etree_or_element, _ElementTree):
+ return XPathDocumentEvaluator(
+ etree_or_element, namespaces=namespaces,
+ extensions=extensions, regexp=regexp, smart_strings=smart_strings)
+ else:
+ return XPathElementEvaluator(
+ etree_or_element, namespaces=namespaces,
+ extensions=extensions, regexp=regexp, smart_strings=smart_strings)
+
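+# Usage sketch (an evaluator bound to an element, with XPath variables passed
+# as keyword arguments as described above):
+#
+#   >>> from lxml import etree
+#   >>> root = etree.XML(b'<root><child/><child/></root>')
+#   >>> ev = etree.XPathEvaluator(root)
+#   >>> ev('count(child)')
+#   2.0
+#   >>> ev('count(child) > $limit', limit=1)
+#   True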
+
+cdef class XPath(_XPathEvaluatorBase):
+ u"""XPath(self, path, namespaces=None, extensions=None, regexp=True, smart_strings=True)
+ A compiled XPath expression that can be called on Elements and ElementTrees.
+
+ Besides the XPath expression, you can pass prefix-namespace
+ mappings and extension functions to the constructor through the
+ keyword arguments ``namespaces`` and ``extensions``. EXSLT
+ regular expression support can be disabled with the 'regexp'
+ boolean keyword (defaults to True). Smart strings will be
+ returned for string results unless you pass
+ ``smart_strings=False``.
+ """
+ cdef xpath.xmlXPathCompExpr* _xpath
+ cdef bytes _path
+ def __cinit__(self):
+ self._xpath = NULL
+
+ def __init__(self, path, *, namespaces=None, extensions=None,
+ regexp=True, smart_strings=True):
+ cdef xpath.xmlXPathContext* xpathCtxt
+ _XPathEvaluatorBase.__init__(self, namespaces, extensions,
+ regexp, smart_strings)
+ self._path = _utf8(path)
+ xpathCtxt = xpath.xmlXPathNewContext(NULL)
+ if xpathCtxt is NULL:
+ raise MemoryError()
+ self.set_context(xpathCtxt)
+ self._xpath = xpath.xmlXPathCtxtCompile(xpathCtxt, _xcstr(self._path))
+ if self._xpath is NULL:
+ raise self._build_parse_error()
+
+ def __call__(self, _etree_or_element, **_variables):
+ u"__call__(self, _etree_or_element, **_variables)"
+ cdef xpath.xmlXPathObject* xpathObj
+ cdef _Document document
+ cdef _Element element
+
+ assert self._xpathCtxt is not NULL, "XPath context not initialised"
+ document = _documentOrRaise(_etree_or_element)
+ element = _rootNodeOrRaise(_etree_or_element)
+
+ self._lock()
+ self._xpathCtxt.doc = document._c_doc
+ self._xpathCtxt.node = element._c_node
+
+ try:
+ self._context.register_context(document)
+ self._context.registerVariables(_variables)
+ with nogil:
+ xpathObj = xpath.xmlXPathCompiledEval(
+ self._xpath, self._xpathCtxt)
+ result = self._handle_result(xpathObj, document)
+ finally:
+ self._context.unregister_context()
+ self._unlock()
+ return result
+
+ @property
+ def path(self):
+ """The literal XPath expression.
+ """
+ return self._path.decode(u'UTF-8')
+
+ def __dealloc__(self):
+ if self._xpath is not NULL:
+ xpath.xmlXPathFreeCompExpr(self._xpath)
+
+ def __repr__(self):
+ return self.path
+
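+# Usage sketch (a compiled expression reused across calls, with a string
+# variable passed as a keyword argument):
+#
+#   >>> from lxml import etree
+#   >>> find = etree.XPath('//child[text()=$t]')
+#   >>> root = etree.XML(b'<root><child>a</child><child>b</child></root>')
+#   >>> [e.text for e in find(root, t='b')]
+#   ['b']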
+
+cdef object _replace_strings = re.compile(b'("[^"]*")|(\'[^\']*\')').sub
+cdef object _find_namespaces = re.compile(b'({[^}]+})').findall
+
+cdef class ETXPath(XPath):
+ u"""ETXPath(self, path, extensions=None, regexp=True, smart_strings=True)
+ Special XPath class that supports the ElementTree {uri} notation for namespaces.
+
+ Note that this class does not accept the ``namespaces`` keyword
+ argument. All namespaces must be passed as part of the path
+ string. Smart strings will be returned for string results unless
+ you pass ``smart_strings=False``.
+ """
+ def __init__(self, path, *, extensions=None, regexp=True,
+ smart_strings=True):
+ path, namespaces = self._nsextract_path(path)
+ XPath.__init__(self, path, namespaces=namespaces,
+ extensions=extensions, regexp=regexp,
+ smart_strings=smart_strings)
+
+ cdef _nsextract_path(self, path):
+ # replace {namespaces} by new prefixes
+ cdef dict namespaces = {}
+ cdef list namespace_defs = []
+ cdef int i
+ path_utf = _utf8(path)
+ stripped_path = _replace_strings(b'', path_utf) # remove string literals
+ i = 1
+ for namespace_def in _find_namespaces(stripped_path):
+ if namespace_def not in namespace_defs:
+ prefix = python.PyBytes_FromFormat("__xpp%02d", i)
+ i += 1
+ namespace_defs.append(namespace_def)
+ namespace = namespace_def[1:-1] # remove '{}'
+ namespace = (<bytes>namespace).decode('utf8')
+ namespaces[prefix.decode('utf8')] = namespace
+ prefix_str = prefix + b':'
+ # FIXME: this also replaces {namespaces} within strings!
+ path_utf = path_utf.replace(namespace_def, prefix_str)
+ path = path_utf.decode('utf8')
+ return path, namespaces
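+
+# Usage sketch (ElementTree-style ``{uri}`` notation, rewritten internally to
+# generated prefixes as shown in _nsextract_path above; the namespace URI is a
+# made-up example):
+#
+#   >>> from lxml import etree
+#   >>> root = etree.XML(b'<root xmlns="http://example.org/ns"><item/></root>')
+#   >>> find = etree.ETXPath('//{http://example.org/ns}item')
+#   >>> len(find(root))
+#   1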
diff --git a/src/lxml/xslt.pxi b/src/lxml/xslt.pxi
new file mode 100644
index 0000000..d483cfa
--- /dev/null
+++ b/src/lxml/xslt.pxi
@@ -0,0 +1,971 @@
+
+# XSLT
+from lxml.includes cimport xslt
+
+
+cdef class XSLTError(LxmlError):
+ """Base class of all XSLT errors.
+ """
+
+cdef class XSLTParseError(XSLTError):
+ """Error parsing a stylesheet document.
+ """
+
+cdef class XSLTApplyError(XSLTError):
+ """Error running an XSL transformation.
+ """
+
+class XSLTSaveError(XSLTError, SerialisationError):
+ """Error serialising an XSLT result.
+ """
+
+cdef class XSLTExtensionError(XSLTError):
+ """Error registering an XSLT extension.
+ """
+
+
+# version information
+LIBXSLT_COMPILED_VERSION = __unpackIntVersion(xslt.LIBXSLT_VERSION)
+LIBXSLT_VERSION = __unpackIntVersion(xslt.xsltLibxsltVersion)
+
+
+################################################################################
+# Where do we store what?
+#
+# xsltStylesheet->doc->_private
+# == _XSLTResolverContext for XSL stylesheet
+#
+# xsltTransformContext->_private
+# == _XSLTResolverContext for transformed document
+#
+################################################################################
+
+
+################################################################################
+# XSLT document loaders
+
+@cython.final
+@cython.internal
+cdef class _XSLTResolverContext(_ResolverContext):
+ cdef xmlDoc* _c_style_doc
+ cdef _BaseParser _parser
+
+ cdef _XSLTResolverContext _copy(self):
+ cdef _XSLTResolverContext context
+ context = _XSLTResolverContext()
+ _initXSLTResolverContext(context, self._parser)
+ context._c_style_doc = self._c_style_doc
+ return context
+
+cdef _initXSLTResolverContext(_XSLTResolverContext context,
+ _BaseParser parser):
+ _initResolverContext(context, parser.resolvers)
+ context._parser = parser
+ context._c_style_doc = NULL
+
+cdef xmlDoc* _xslt_resolve_from_python(const_xmlChar* c_uri, void* c_context,
+ int parse_options, int* error) with gil:
+ # call the Python document loaders
+ cdef _XSLTResolverContext context
+ cdef _ResolverRegistry resolvers
+ cdef _InputDocument doc_ref
+ cdef xmlDoc* c_doc
+ cdef xmlDoc* c_return_doc = NULL
+
+ error[0] = 0
+ context = <_XSLTResolverContext>c_context
+
+ # shortcut if we resolve the stylesheet itself
+ c_doc = context._c_style_doc
+ try:
+ if c_doc is not NULL and c_doc.URL is not NULL:
+ if tree.xmlStrcmp(c_uri, c_doc.URL) == 0:
+ c_return_doc = _copyDoc(c_doc, 1)
+ return c_return_doc # 'goto', see 'finally' below
+
+ # delegate to the Python resolvers
+ resolvers = context._resolvers
+ if tree.xmlStrncmp(<unsigned char*>'string://__STRING__XSLT__/', c_uri, 26) == 0:
+ c_uri += 26
+ uri = _decodeFilename(c_uri)
+ doc_ref = resolvers.resolve(uri, None, context)
+
+ if doc_ref is not None:
+ if doc_ref._type == PARSER_DATA_STRING:
+ c_return_doc = _parseDoc(
+ doc_ref._data_bytes, doc_ref._filename, context._parser)
+ elif doc_ref._type == PARSER_DATA_FILENAME:
+ c_return_doc = _parseDocFromFile(
+ doc_ref._filename, context._parser)
+ elif doc_ref._type == PARSER_DATA_FILE:
+ c_return_doc = _parseDocFromFilelike(
+ doc_ref._file, doc_ref._filename, context._parser)
+ elif doc_ref._type == PARSER_DATA_EMPTY:
+ c_return_doc = _newXMLDoc()
+ if c_return_doc is not NULL and c_return_doc.URL is NULL:
+ c_return_doc.URL = tree.xmlStrdup(c_uri)
+ except:
+ error[0] = 1
+ context._store_raised()
+ finally:
+ return c_return_doc # and swallow any further exceptions
+
+
+cdef void _xslt_store_resolver_exception(const_xmlChar* c_uri, void* context,
+ xslt.xsltLoadType c_type) with gil:
+ try:
+ message = f"Cannot resolve URI {_decodeFilename(c_uri)}"
+ if c_type == xslt.XSLT_LOAD_DOCUMENT:
+ exception = XSLTApplyError(message)
+ else:
+ exception = XSLTParseError(message)
+ (<_XSLTResolverContext>context)._store_exception(exception)
+ except BaseException as e:
+ (<_XSLTResolverContext>context)._store_exception(e)
+ finally:
+ return # and swallow any further exceptions
+
+
+cdef xmlDoc* _xslt_doc_loader(const_xmlChar* c_uri, tree.xmlDict* c_dict,
+ int parse_options, void* c_ctxt,
+ xslt.xsltLoadType c_type) nogil:
+ # nogil => no Python objects here, may be called without thread context !
+ cdef xmlDoc* c_doc
+ cdef xmlDoc* result
+ cdef void* c_pcontext
+ cdef int error = 0
+ # find resolver contexts of stylesheet and transformed doc
+ if c_type == xslt.XSLT_LOAD_DOCUMENT:
+ # transformation time
+ c_pcontext = (<xslt.xsltTransformContext*>c_ctxt)._private
+ elif c_type == xslt.XSLT_LOAD_STYLESHEET:
+ # include/import resolution while parsing
+ c_pcontext = (<xslt.xsltStylesheet*>c_ctxt).doc._private
+ else:
+ c_pcontext = NULL
+
+ if c_pcontext is NULL:
+ # can't call Python without context, fall back to default loader
+ return XSLT_DOC_DEFAULT_LOADER(
+ c_uri, c_dict, parse_options, c_ctxt, c_type)
+
+ c_doc = _xslt_resolve_from_python(c_uri, c_pcontext, parse_options, &error)
+ if c_doc is NULL and not error:
+ c_doc = XSLT_DOC_DEFAULT_LOADER(
+ c_uri, c_dict, parse_options, c_ctxt, c_type)
+ if c_doc is NULL:
+ _xslt_store_resolver_exception(c_uri, c_pcontext, c_type)
+
+ if c_doc is not NULL and c_type == xslt.XSLT_LOAD_STYLESHEET:
+ c_doc._private = c_pcontext
+ return c_doc
+
+cdef xslt.xsltDocLoaderFunc XSLT_DOC_DEFAULT_LOADER = xslt.xsltDocDefaultLoader
+xslt.xsltSetLoaderFunc(<xslt.xsltDocLoaderFunc>_xslt_doc_loader)
+
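+# Usage sketch (the loader above falls back to registered ``etree.Resolver``
+# instances; a sketch of registering one -- the class name, URL and stylesheet
+# file name are made-up examples):
+#
+#   >>> from lxml import etree
+#   >>> class StringResolver(etree.Resolver):
+#   ...     def resolve(self, url, pubid, context):
+#   ...         if url == 'http://example.org/lookup.xml':
+#   ...             return self.resolve_string(b'<data/>', context)
+#   >>> parser = etree.XMLParser()
+#   >>> parser.resolvers.add(StringResolver())
+#   >>> style = etree.parse('style.xsl', parser)   # hypothetical stylesheet file
+#   >>> transform = etree.XSLT(style)
+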
+################################################################################
+# XSLT file/network access control
+
+cdef class XSLTAccessControl:
+ u"""XSLTAccessControl(self, read_file=True, write_file=True, create_dir=True, read_network=True, write_network=True)
+
+ Access control for XSLT: reading/writing files, directories and
+ network I/O. Access to a type of resource is granted or denied by
+ passing any of the following boolean keyword arguments. All of
+ them default to True to allow access.
+
+ - read_file
+ - write_file
+ - create_dir
+ - read_network
+ - write_network
+
+ For convenience, there is also a class member `DENY_ALL` that
+ provides an XSLTAccessControl instance that is readily configured
+ to deny everything, and a `DENY_WRITE` member that denies all
+ write access but allows read access.
+
+ See `XSLT`.
+ """
+ cdef xslt.xsltSecurityPrefs* _prefs
+ def __cinit__(self):
+ self._prefs = xslt.xsltNewSecurityPrefs()
+ if self._prefs is NULL:
+ raise MemoryError()
+
+ def __init__(self, *, bint read_file=True, bint write_file=True, bint create_dir=True,
+ bint read_network=True, bint write_network=True):
+ self._setAccess(xslt.XSLT_SECPREF_READ_FILE, read_file)
+ self._setAccess(xslt.XSLT_SECPREF_WRITE_FILE, write_file)
+ self._setAccess(xslt.XSLT_SECPREF_CREATE_DIRECTORY, create_dir)
+ self._setAccess(xslt.XSLT_SECPREF_READ_NETWORK, read_network)
+ self._setAccess(xslt.XSLT_SECPREF_WRITE_NETWORK, write_network)
+
+ DENY_ALL = XSLTAccessControl(
+ read_file=False, write_file=False, create_dir=False,
+ read_network=False, write_network=False)
+
+ DENY_WRITE = XSLTAccessControl(
+ read_file=True, write_file=False, create_dir=False,
+ read_network=True, write_network=False)
+
+ def __dealloc__(self):
+ if self._prefs is not NULL:
+ xslt.xsltFreeSecurityPrefs(self._prefs)
+
+ @cython.final
+ cdef _setAccess(self, xslt.xsltSecurityOption option, bint allow):
+ cdef xslt.xsltSecurityCheck function
+ if allow:
+ function = xslt.xsltSecurityAllow
+ else:
+ function = xslt.xsltSecurityForbid
+ xslt.xsltSetSecurityPrefs(self._prefs, option, function)
+
+ @cython.final
+ cdef void _register_in_context(self, xslt.xsltTransformContext* ctxt):
+ xslt.xsltSetCtxtSecurityPrefs(self._prefs, ctxt)
+
+ @property
+ def options(self):
+ """The access control configuration as a map of options."""
+ return {
+ u'read_file': self._optval(xslt.XSLT_SECPREF_READ_FILE),
+ u'write_file': self._optval(xslt.XSLT_SECPREF_WRITE_FILE),
+ u'create_dir': self._optval(xslt.XSLT_SECPREF_CREATE_DIRECTORY),
+ u'read_network': self._optval(xslt.XSLT_SECPREF_READ_NETWORK),
+ u'write_network': self._optval(xslt.XSLT_SECPREF_WRITE_NETWORK),
+ }
+
+ @cython.final
+ cdef _optval(self, xslt.xsltSecurityOption option):
+ cdef xslt.xsltSecurityCheck function
+ function = xslt.xsltGetSecurityPrefs(self._prefs, option)
+ if function is <xslt.xsltSecurityCheck>xslt.xsltSecurityAllow:
+ return True
+ elif function is <xslt.xsltSecurityCheck>xslt.xsltSecurityForbid:
+ return False
+ else:
+ return None
+
+ def __repr__(self):
+ items = sorted(self.options.items())
+ return u"%s(%s)" % (
+ python._fqtypename(self).decode('UTF-8').split(u'.')[-1],
+ u', '.join([u"%s=%r" % item for item in items]))
+
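+# Usage sketch (deny network and write access for a transformation; the
+# ``access_control`` keyword of ``XSLT`` is documented further below, and
+# ``stylesheet_tree`` stands for any parsed XSLT document):
+#
+#   >>> from lxml import etree
+#   >>> ac = etree.XSLTAccessControl(read_network=False, write_network=False,
+#   ...                              write_file=False, create_dir=False)
+#   >>> transform = etree.XSLT(stylesheet_tree, access_control=ac)
+#   >>> # or simply use a predefined instance:
+#   >>> # etree.XSLT(stylesheet_tree, access_control=etree.XSLTAccessControl.DENY_WRITE)
+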
+################################################################################
+# XSLT
+
+cdef int _register_xslt_function(void* ctxt, name_utf, ns_utf):
+ if ns_utf is None:
+ return 0
+ # libxml2 internalises the strings if ctxt has a dict
+ return xslt.xsltRegisterExtFunction(
+ <xslt.xsltTransformContext*>ctxt, _xcstr(name_utf), _xcstr(ns_utf),
+ <xslt.xmlXPathFunction>_xpath_function_call)
+
+cdef dict EMPTY_DICT = {}
+
+@cython.final
+@cython.internal
+cdef class _XSLTContext(_BaseContext):
+ cdef xslt.xsltTransformContext* _xsltCtxt
+ cdef _ReadOnlyElementProxy _extension_element_proxy
+ cdef dict _extension_elements
+ def __cinit__(self):
+ self._xsltCtxt = NULL
+ self._extension_elements = EMPTY_DICT
+
+ def __init__(self, namespaces, extensions, error_log, enable_regexp,
+ build_smart_strings):
+ if extensions is not None and extensions:
+ for ns_name_tuple, extension in extensions.items():
+ if ns_name_tuple[0] is None:
+ raise XSLTExtensionError, \
+ u"extensions must not have empty namespaces"
+ if isinstance(extension, XSLTExtension):
+ if self._extension_elements is EMPTY_DICT:
+ self._extension_elements = {}
+ extensions = extensions.copy()
+ ns_utf = _utf8(ns_name_tuple[0])
+ name_utf = _utf8(ns_name_tuple[1])
+ self._extension_elements[(ns_utf, name_utf)] = extension
+ del extensions[ns_name_tuple]
+ _BaseContext.__init__(self, namespaces, extensions, error_log, enable_regexp,
+ build_smart_strings)
+
+ cdef _BaseContext _copy(self):
+ cdef _XSLTContext context
+ context = <_XSLTContext>_BaseContext._copy(self)
+ context._extension_elements = self._extension_elements
+ return context
+
+ cdef register_context(self, xslt.xsltTransformContext* xsltCtxt,
+ _Document doc):
+ self._xsltCtxt = xsltCtxt
+ self._set_xpath_context(xsltCtxt.xpathCtxt)
+ self._register_context(doc)
+ self.registerLocalFunctions(xsltCtxt, _register_xslt_function)
+ self.registerGlobalFunctions(xsltCtxt, _register_xslt_function)
+ _registerXSLTExtensions(xsltCtxt, self._extension_elements)
+
+ cdef free_context(self):
+ self._cleanup_context()
+ self._release_context()
+ if self._xsltCtxt is not NULL:
+ xslt.xsltFreeTransformContext(self._xsltCtxt)
+ self._xsltCtxt = NULL
+ self._release_temp_refs()
+
+
+@cython.final
+@cython.internal
+@cython.freelist(8)
+cdef class _XSLTQuotedStringParam:
+ u"""A wrapper class for literal XSLT string parameters that require
+ quote escaping.
+ """
+ cdef bytes strval
+ def __cinit__(self, strval):
+ self.strval = _utf8(strval)
+
+
+@cython.no_gc_clear
+cdef class XSLT:
+ u"""XSLT(self, xslt_input, extensions=None, regexp=True, access_control=None)
+
+ Turn an XSL document into an XSLT object.
+
+ Calling this object on a tree or Element will execute the XSLT::
+
+ transform = etree.XSLT(xsl_tree)
+ result = transform(xml_tree)
+
+ Keyword arguments of the constructor:
+
+ - extensions: a dict mapping ``(namespace, name)`` pairs to
+ extension functions or extension elements
+ - regexp: enable exslt regular expression support in XPath
+ (default: True)
+ - access_control: access restrictions for network or file
+ system (see `XSLTAccessControl`)
+
+ Keyword arguments of the XSLT call:
+
+ - profile_run: enable XSLT profiling (default: False)
+
+ Other keyword arguments of the call are passed to the stylesheet
+ as parameters.
+ """
+ cdef _XSLTContext _context
+ cdef xslt.xsltStylesheet* _c_style
+ cdef _XSLTResolverContext _xslt_resolver_context
+ cdef XSLTAccessControl _access_control
+ cdef _ErrorLog _error_log
+
+ def __cinit__(self):
+ self._c_style = NULL
+
+ def __init__(self, xslt_input, *, extensions=None, regexp=True,
+ access_control=None):
+ cdef xslt.xsltStylesheet* c_style = NULL
+ cdef xmlDoc* c_doc
+ cdef _Document doc
+ cdef _Element root_node
+
+ doc = _documentOrRaise(xslt_input)
+ root_node = _rootNodeOrRaise(xslt_input)
+
+ # set access control or raise TypeError
+ self._access_control = access_control
+
+ # make a copy of the document as stylesheet parsing modifies it
+ c_doc = _copyDocRoot(doc._c_doc, root_node._c_node)
+
+ # make sure we always have a stylesheet URL
+ if c_doc.URL is NULL:
+ doc_url_utf = python.PyUnicode_AsASCIIString(
+ f"string://__STRING__XSLT__/{id(self)}.xslt")
+ c_doc.URL = tree.xmlStrdup(_xcstr(doc_url_utf))
+
+ self._error_log = _ErrorLog()
+ self._xslt_resolver_context = _XSLTResolverContext()
+ _initXSLTResolverContext(self._xslt_resolver_context, doc._parser)
+ # keep a copy in case we need to access the stylesheet via 'document()'
+ self._xslt_resolver_context._c_style_doc = _copyDoc(c_doc, 1)
+ c_doc._private = <python.PyObject*>self._xslt_resolver_context
+
+ with self._error_log:
+ orig_loader = _register_document_loader()
+ c_style = xslt.xsltParseStylesheetDoc(c_doc)
+ _reset_document_loader(orig_loader)
+
+ if c_style is NULL or c_style.errors:
+ tree.xmlFreeDoc(c_doc)
+ if c_style is not NULL:
+ xslt.xsltFreeStylesheet(c_style)
+ self._xslt_resolver_context._raise_if_stored()
+ # last error seems to be the most accurate here
+ if self._error_log.last_error is not None and \
+ self._error_log.last_error.message:
+ raise XSLTParseError(self._error_log.last_error.message,
+ self._error_log)
+ else:
+ raise XSLTParseError(
+ self._error_log._buildExceptionMessage(
+ u"Cannot parse stylesheet"),
+ self._error_log)
+
+ c_doc._private = NULL # no longer used!
+ self._c_style = c_style
+ self._context = _XSLTContext(None, extensions, self._error_log, regexp, True)
+
+ def __dealloc__(self):
+ if self._xslt_resolver_context is not None and \
+ self._xslt_resolver_context._c_style_doc is not NULL:
+ tree.xmlFreeDoc(self._xslt_resolver_context._c_style_doc)
+ # this cleans up the doc copy as well
+ if self._c_style is not NULL:
+ xslt.xsltFreeStylesheet(self._c_style)
+
+ @property
+ def error_log(self):
+ """The log of errors and warnings of an XSLT execution."""
+ return self._error_log.copy()
+
+ @staticmethod
+ def strparam(strval):
+ u"""strparam(strval)
+
+ Mark an XSLT string parameter that requires quote escaping
+ before passing it into the transformation. Use it like this::
+
+ result = transform(doc, some_strval = XSLT.strparam(
+ '''it's \"Monty Python's\" ...'''))
+
+ Escaped string parameters can be reused without restriction.
+ """
+ return _XSLTQuotedStringParam(strval)
+
+ @staticmethod
+ def set_global_max_depth(int max_depth):
+ u"""set_global_max_depth(max_depth)
+
+ The maximum traversal depth that the stylesheet engine will allow.
+ This counts not only the template recursion depth but also the number
+ of variables/parameters. The required setting for a run depends on
+ both the stylesheet and the input data.
+
+ Example::
+
+ XSLT.set_global_max_depth(5000)
+
+ Note that this is currently a global, module-wide setting because
+ libxslt does not support it at a per-stylesheet level.
+ """
+ if max_depth < 0:
+ raise ValueError("cannot set a maximum stylesheet traversal depth < 0")
+ xslt.xsltMaxDepth = max_depth
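+
+ # Illustrative usage sketch (not part of the class); assumes `xsl_tree` and
+ # `xml_tree` are previously parsed trees and that the stylesheet declares a
+ # parameter named `some_param`:
+ #
+ # transform = etree.XSLT(xsl_tree)
+ # result = transform(xml_tree, profile_run=True,
+ # some_param=XSLT.strparam("it's a \"quoted\" value"))
+ # print(str(result)) # serialised result document
+ # profile = result.xslt_profile # ElementTree with profiling data
+ # log = transform.error_log # warnings/errors of the last run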
+
+ def apply(self, _input, *, profile_run=False, **kw):
+ u"""apply(self, _input, profile_run=False, **kw)
+
+ :deprecated: call the object, not this method."""
+ return self(_input, profile_run=profile_run, **kw)
+
+ def tostring(self, _ElementTree result_tree):
+ u"""tostring(self, result_tree)
+
+ Save result doc to string based on stylesheet output method.
+
+ :deprecated: use str(result_tree) instead.
+ """
+ return str(result_tree)
+
+ def __deepcopy__(self, memo):
+ return self.__copy__()
+
+ def __copy__(self):
+ return _copyXSLT(self)
+
+ def __call__(self, _input, *, profile_run=False, **kw):
+ u"""__call__(self, _input, profile_run=False, **kw)
+
+ Execute the XSL transformation on a tree or Element.
+
+ Pass the ``profile_run`` option to get profile information
+ about the XSLT. The result of the XSLT will have a property
+ xslt_profile that holds an XML tree with profiling data.
+ """
+ cdef _XSLTContext context = None
+ cdef _XSLTResolverContext resolver_context
+ cdef _Document input_doc
+ cdef _Element root_node
+ cdef _Document result_doc
+ cdef _Document profile_doc = None
+ cdef xmlDoc* c_profile_doc
+ cdef xslt.xsltTransformContext* transform_ctxt
+ cdef xmlDoc* c_result = NULL
+ cdef xmlDoc* c_doc
+ cdef tree.xmlDict* c_dict
+ cdef const_char** params = NULL
+
+ assert self._c_style is not NULL, "XSLT stylesheet not initialised"
+ input_doc = _documentOrRaise(_input)
+ root_node = _rootNodeOrRaise(_input)
+
+ c_doc = _fakeRootDoc(input_doc._c_doc, root_node._c_node)
+
+ transform_ctxt = xslt.xsltNewTransformContext(self._c_style, c_doc)
+ if transform_ctxt is NULL:
+ _destroyFakeDoc(input_doc._c_doc, c_doc)
+ raise MemoryError()
+
+ # using the stylesheet dict is safer than using a possibly
+ # unrelated dict from the current thread. Almost all
+ # non-input tag/attr names will come from the stylesheet
+ # anyway.
+ if transform_ctxt.dict is not NULL:
+ xmlparser.xmlDictFree(transform_ctxt.dict)
+ if kw:
+ # parameter values are stored in the dict
+ # => avoid unnecessarily cluttering the global dict
+ transform_ctxt.dict = xmlparser.xmlDictCreateSub(self._c_style.doc.dict)
+ if transform_ctxt.dict is NULL:
+ xslt.xsltFreeTransformContext(transform_ctxt)
+ raise MemoryError()
+ else:
+ transform_ctxt.dict = self._c_style.doc.dict
+ xmlparser.xmlDictReference(transform_ctxt.dict)
+
+ xslt.xsltSetCtxtParseOptions(
+ transform_ctxt, input_doc._parser._parse_options)
+
+ if profile_run:
+ transform_ctxt.profile = 1
+
+ try:
+ context = self._context._copy()
+ context.register_context(transform_ctxt, input_doc)
+
+ resolver_context = self._xslt_resolver_context._copy()
+ transform_ctxt._private = <python.PyObject*>resolver_context
+
+ _convert_xslt_parameters(transform_ctxt, kw, &params)
+ c_result = self._run_transform(
+ c_doc, params, context, transform_ctxt)
+ if params is not NULL:
+ # deallocate space for parameters
+ python.lxml_free(params)
+
+ if transform_ctxt.state != xslt.XSLT_STATE_OK:
+ if c_result is not NULL:
+ tree.xmlFreeDoc(c_result)
+ c_result = NULL
+
+ if transform_ctxt.profile:
+ c_profile_doc = xslt.xsltGetProfileInformation(transform_ctxt)
+ if c_profile_doc is not NULL:
+ profile_doc = _documentFactory(
+ c_profile_doc, input_doc._parser)
+ finally:
+ if context is not None:
+ context.free_context()
+ _destroyFakeDoc(input_doc._c_doc, c_doc)
+
+ try:
+ if resolver_context is not None and resolver_context._has_raised():
+ if c_result is not NULL:
+ tree.xmlFreeDoc(c_result)
+ c_result = NULL
+ resolver_context._raise_if_stored()
+
+ if context._exc._has_raised():
+ if c_result is not NULL:
+ tree.xmlFreeDoc(c_result)
+ c_result = NULL
+ context._exc._raise_if_stored()
+
+ if c_result is NULL:
+ # last error seems to be the most accurate here
+ error = self._error_log.last_error
+ if error is not None and error.message:
+ if error.line > 0:
+ message = f"{error.message}, line {error.line}"
+ else:
+ message = error.message
+ elif error is not None and error.line > 0:
+ message = f"Error applying stylesheet, line {error.line}"
+ else:
+ message = u"Error applying stylesheet"
+ raise XSLTApplyError(message, self._error_log)
+ finally:
+ if resolver_context is not None:
+ resolver_context.clear()
+
+ result_doc = _documentFactory(c_result, input_doc._parser)
+
+ c_dict = c_result.dict
+ xmlparser.xmlDictReference(c_dict)
+ __GLOBAL_PARSER_CONTEXT.initThreadDictRef(&c_result.dict)
+ if c_dict is not c_result.dict or \
+ self._c_style.doc.dict is not c_result.dict or \
+ input_doc._c_doc.dict is not c_result.dict:
+ with nogil:
+ if c_dict is not c_result.dict:
+ fixThreadDictNames(<xmlNode*>c_result,
+ c_dict, c_result.dict)
+ if self._c_style.doc.dict is not c_result.dict:
+ fixThreadDictNames(<xmlNode*>c_result,
+ self._c_style.doc.dict, c_result.dict)
+ if input_doc._c_doc.dict is not c_result.dict:
+ fixThreadDictNames(<xmlNode*>c_result,
+ input_doc._c_doc.dict, c_result.dict)
+ xmlparser.xmlDictFree(c_dict)
+
+ return _xsltResultTreeFactory(result_doc, self, profile_doc)
+
+ cdef xmlDoc* _run_transform(self, xmlDoc* c_input_doc,
+ const_char** params, _XSLTContext context,
+ xslt.xsltTransformContext* transform_ctxt):
+ cdef xmlDoc* c_result
+ xslt.xsltSetTransformErrorFunc(transform_ctxt, <void*>self._error_log,
+ <xmlerror.xmlGenericErrorFunc>_receiveXSLTError)
+ if self._access_control is not None:
+ self._access_control._register_in_context(transform_ctxt)
+ with self._error_log, nogil:
+ orig_loader = _register_document_loader()
+ c_result = xslt.xsltApplyStylesheetUser(
+ self._c_style, c_input_doc, params, NULL, NULL, transform_ctxt)
+ _reset_document_loader(orig_loader)
+ return c_result
+
+
+cdef _convert_xslt_parameters(xslt.xsltTransformContext* transform_ctxt,
+ dict parameters, const_char*** params_ptr):
+ cdef Py_ssize_t i, parameter_count
+ cdef const_char** params
+ cdef tree.xmlDict* c_dict = transform_ctxt.dict
+ params_ptr[0] = NULL
+ parameter_count = len(parameters)
+ if parameter_count == 0:
+ return
+ # allocate space for parameters
+ # * 2 as we want an entry for both key and value,
+ # and + 1 as array is NULL terminated
+ params = <const_char**>python.lxml_malloc(parameter_count * 2 + 1, sizeof(const_char*))
+ if not params:
+ raise MemoryError()
+ try:
+ i = 0
+ for key, value in parameters.iteritems():
+ k = _utf8(key)
+ if isinstance(value, _XSLTQuotedStringParam):
+ v = (<_XSLTQuotedStringParam>value).strval
+ xslt.xsltQuoteOneUserParam(
+ transform_ctxt, _xcstr(k), _xcstr(v))
+ else:
+ if isinstance(value, XPath):
+ v = (<XPath>value)._path
+ else:
+ v = _utf8(value)
+ params[i] = <const_char*>tree.xmlDictLookup(c_dict, _xcstr(k), len(k))
+ i += 1
+ params[i] = <const_char*>tree.xmlDictLookup(c_dict, _xcstr(v), len(v))
+ i += 1
+ except:
+ python.lxml_free(params)
+ raise
+ params[i] = NULL
+ params_ptr[0] = params
+
+cdef XSLT _copyXSLT(XSLT stylesheet):
+ cdef XSLT new_xslt
+ cdef xmlDoc* c_doc
+ assert stylesheet._c_style is not NULL, "XSLT stylesheet not initialised"
+ new_xslt = XSLT.__new__(XSLT)
+ new_xslt._access_control = stylesheet._access_control
+ new_xslt._error_log = _ErrorLog()
+ new_xslt._context = stylesheet._context._copy()
+
+ new_xslt._xslt_resolver_context = stylesheet._xslt_resolver_context._copy()
+ new_xslt._xslt_resolver_context._c_style_doc = _copyDoc(
+ stylesheet._xslt_resolver_context._c_style_doc, 1)
+
+ c_doc = _copyDoc(stylesheet._c_style.doc, 1)
+ new_xslt._c_style = xslt.xsltParseStylesheetDoc(c_doc)
+ if new_xslt._c_style is NULL:
+ tree.xmlFreeDoc(c_doc)
+ raise MemoryError()
+
+ return new_xslt
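+
+# Illustrative note: copy.copy()/copy.deepcopy() of an XSLT object go through
+# _copyXSLT() above; assuming `transform` is an existing XSLT instance, each
+# worker thread can get its own independent copy:
+#
+# import copy
+# local_transform = copy.deepcopy(transform)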
+
+@cython.final
+cdef class _XSLTResultTree(_ElementTree):
+ """The result of an XSLT evaluation.
+
+ Use ``str()`` or ``bytes()`` (or ``unicode()`` in Python 2.x) to serialise to a string,
+ and the ``.write_output()`` method to serialise directly to a file.
+ """
+ cdef XSLT _xslt
+ cdef _Document _profile
+ cdef xmlChar* _buffer
+ cdef Py_ssize_t _buffer_len
+ cdef Py_ssize_t _buffer_refcnt
+
+ def write_output(self, file, *, compression=0):
+ """write_output(self, file, *, compression=0)
+
+ Serialise the XSLT output to a file or file-like object.
+
+ As opposed to the generic ``.write()`` method, ``.write_output()`` serialises
+ the result as defined by the ``<xsl:output>`` tag.
+ """
+ cdef _FilelikeWriter writer = None
+ cdef _Document doc
+ cdef int r, rclose, c_compression
+ cdef const_xmlChar* c_encoding = NULL
+ cdef tree.xmlOutputBuffer* c_buffer
+
+ if self._context_node is not None:
+ doc = self._context_node._doc
+ else:
+ doc = None
+ if doc is None:
+ doc = self._doc
+ if doc is None:
+ raise XSLTSaveError("No document to serialise")
+ c_compression = compression or 0
+ xslt.LXML_GET_XSLT_ENCODING(c_encoding, self._xslt._c_style)
+ writer = _create_output_buffer(file, <const_char*>c_encoding, compression, &c_buffer, close=False)
+ if writer is None:
+ with nogil:
+ r = xslt.xsltSaveResultTo(c_buffer, doc._c_doc, self._xslt._c_style)
+ rclose = tree.xmlOutputBufferClose(c_buffer)
+ else:
+ r = xslt.xsltSaveResultTo(c_buffer, doc._c_doc, self._xslt._c_style)
+ rclose = tree.xmlOutputBufferClose(c_buffer)
+ if writer is not None:
+ writer._exc_context._raise_if_stored()
+ if r < 0 or rclose == -1:
+ python.PyErr_SetFromErrno(IOError) # raises IOError
+
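+ # Illustrative sketch (assumes `result` is an _XSLTResultTree and the
+ # stylesheet carries e.g. <xsl:output method="html" encoding="utf-8"/>):
+ #
+ # with open('out.html', 'wb') as f:
+ # result.write_output(f) # honours the <xsl:output> settings
+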
+ cdef _saveToStringAndSize(self, xmlChar** s, int* l):
+ cdef _Document doc
+ cdef int r
+ if self._context_node is not None:
+ doc = self._context_node._doc
+ else:
+ doc = None
+ if doc is None:
+ doc = self._doc
+ if doc is None:
+ s[0] = NULL
+ return
+ with nogil:
+ r = xslt.xsltSaveResultToString(s, l, doc._c_doc,
+ self._xslt._c_style)
+ if r == -1:
+ raise MemoryError()
+
+ def __str__(self):
+ cdef xmlChar* s = NULL
+ cdef int l = 0
+ if not python.IS_PYTHON2:
+ return self.__unicode__()
+ self._saveToStringAndSize(&s, &l)
+ if s is NULL:
+ return ''
+ # we must not use 'funicode()' here as this is not always UTF-8
+ try:
+ result = <bytes>s[:l]
+ finally:
+ tree.xmlFree(s)
+ return result
+
+ def __unicode__(self):
+ cdef xmlChar* encoding
+ cdef xmlChar* s = NULL
+ cdef int l = 0
+ self._saveToStringAndSize(&s, &l)
+ if s is NULL:
+ return u''
+ encoding = self._xslt._c_style.encoding
+ try:
+ if encoding is NULL:
+ result = s[:l].decode('UTF-8')
+ else:
+ result = s[:l].decode(encoding)
+ finally:
+ tree.xmlFree(s)
+ return _stripEncodingDeclaration(result)
+
+ def __getbuffer__(self, Py_buffer* buffer, int flags):
+ cdef int l = 0
+ if buffer is NULL:
+ return
+ if self._buffer is NULL or flags & python.PyBUF_WRITABLE:
+ self._saveToStringAndSize(<xmlChar**>&buffer.buf, &l)
+ buffer.len = l
+ if self._buffer is NULL and not flags & python.PyBUF_WRITABLE:
+ self._buffer = <xmlChar*>buffer.buf
+ self._buffer_len = l
+ self._buffer_refcnt = 1
+ else:
+ buffer.buf = self._buffer
+ buffer.len = self._buffer_len
+ self._buffer_refcnt += 1
+ if flags & python.PyBUF_WRITABLE:
+ buffer.readonly = 0
+ else:
+ buffer.readonly = 1
+ if flags & python.PyBUF_FORMAT:
+ buffer.format = "B"
+ else:
+ buffer.format = NULL
+ buffer.ndim = 0
+ buffer.shape = NULL
+ buffer.strides = NULL
+ buffer.suboffsets = NULL
+ buffer.itemsize = 1
+ buffer.internal = NULL
+ if buffer.obj is not self: # set by Cython?
+ buffer.obj = self
+
+ def __releasebuffer__(self, Py_buffer* buffer):
+ if buffer is NULL:
+ return
+ if <xmlChar*>buffer.buf is self._buffer:
+ self._buffer_refcnt -= 1
+ if self._buffer_refcnt == 0:
+ tree.xmlFree(<char*>self._buffer)
+ self._buffer = NULL
+ else:
+ tree.xmlFree(<char*>buffer.buf)
+ buffer.buf = NULL
+
+ property xslt_profile:
+ """Return an ElementTree with profiling data for the stylesheet run.
+ """
+ def __get__(self):
+ cdef object root
+ if self._profile is None:
+ return None
+ root = self._profile.getroot()
+ if root is None:
+ return None
+ return ElementTree(root)
+
+ def __del__(self):
+ self._profile = None
+
+cdef _xsltResultTreeFactory(_Document doc, XSLT xslt, _Document profile):
+ cdef _XSLTResultTree result
+ result = <_XSLTResultTree>_newElementTree(doc, None, _XSLTResultTree)
+ result._xslt = xslt
+ result._profile = profile
+ return result
+
+# functions like "output" and "write" are a potential security risk, but we
+# rely on the user to configure XSLTAccessControl as needed
+xslt.xsltRegisterAllExtras()
+
+# enable EXSLT support for XSLT
+xslt.exsltRegisterAll()
+
+
+################################################################################
+# XSLT PI support
+
+cdef object _RE_PI_HREF = re.compile(ur'\s+href\s*=\s*(?:\'([^\']*)\'|"([^"]*)")')
+cdef object _FIND_PI_HREF = _RE_PI_HREF.findall
+cdef object _REPLACE_PI_HREF = _RE_PI_HREF.sub
+cdef XPath __findStylesheetByID = None
+
+cdef _findStylesheetByID(_Document doc, id):
+ global __findStylesheetByID
+ if __findStylesheetByID is None:
+ __findStylesheetByID = XPath(
+ u"//xsl:stylesheet[@xml:id = $id]",
+ namespaces={u"xsl" : u"http://www.w3.org/1999/XSL/Transform"})
+ return __findStylesheetByID(doc, id=id)
+
+cdef class _XSLTProcessingInstruction(PIBase):
+ def parseXSL(self, parser=None):
+ u"""parseXSL(self, parser=None)
+
+ Try to parse the stylesheet referenced by this PI and return
+ an ElementTree for it. If the stylesheet is embedded in the
+ same document (referenced via xml:id), find and return an
+ ElementTree for the stylesheet Element.
+
+ The optional ``parser`` keyword argument can be passed to specify the
+ parser used to read from external stylesheet URLs.
+ """
+ cdef _Document result_doc
+ cdef _Element result_node
+ cdef bytes href_utf
+ cdef const_xmlChar* c_href
+ cdef xmlAttr* c_attr
+ _assertValidNode(self)
+ if self._c_node.content is NULL:
+ raise ValueError, u"PI lacks content"
+ hrefs = _FIND_PI_HREF(u' ' + (<unsigned char*>self._c_node.content).decode('UTF-8'))
+ if len(hrefs) != 1:
+ raise ValueError, u"malformed PI attributes"
+ hrefs = hrefs[0]
+ href_utf = _utf8(hrefs[0] or hrefs[1])
+ c_href = _xcstr(href_utf)
+
+ if c_href[0] != c'#':
+ # normal URL, try to parse from it
+ c_href = tree.xmlBuildURI(
+ c_href,
+ tree.xmlNodeGetBase(self._c_node.doc, self._c_node))
+ if c_href is not NULL:
+ try:
+ href_utf = <unsigned char*>c_href
+ finally:
+ tree.xmlFree(<char*>c_href)
+ result_doc = _parseDocumentFromURL(href_utf, parser)
+ return _elementTreeFactory(result_doc, None)
+
+ # ID reference to embedded stylesheet
+ # try XML:ID lookup
+ _assertValidDoc(self._doc)
+ c_href += 1 # skip leading '#'
+ c_attr = tree.xmlGetID(self._c_node.doc, c_href)
+ if c_attr is not NULL and c_attr.doc is self._c_node.doc:
+ result_node = _elementFactory(self._doc, c_attr.parent)
+ return _elementTreeFactory(result_node._doc, result_node)
+
+ # try XPath search
+ root = _findStylesheetByID(self._doc, funicode(c_href))
+ if not root:
+ raise ValueError, u"reference to non-existing embedded stylesheet"
+ elif len(root) > 1:
+ raise ValueError, u"ambiguous reference to embedded stylesheet"
+ result_node = root[0]
+ return _elementTreeFactory(result_node._doc, result_node)
+
+ def set(self, key, value):
+ u"""set(self, key, value)
+
+ Supports setting the 'href' pseudo-attribute in the text of
+ the processing instruction.
+ """
+ if key != u"href":
+ raise AttributeError, \
+ u"only setting the 'href' attribute is supported on XSLT-PIs"
+ if value is None:
+ attrib = u""
+ elif u'"' in value or u'>' in value:
+ raise ValueError, u"Invalid URL, must not contain '\"' or '>'"
+ else:
+ attrib = f' href="{value}"'
+ text = u' ' + self.text
+ if _FIND_PI_HREF(text):
+ self.text = _REPLACE_PI_HREF(attrib, text)
+ else:
+ self.text = text + attrib
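+
+# Illustrative sketch of the PI support above; assuming `doc` is an ElementTree
+# whose document starts with <?xml-stylesheet type="text/xsl" href="style.xsl"?>:
+#
+# pi = doc.getroot().getprevious() # the xml-stylesheet processing instruction
+# xsl_tree = pi.parseXSL() # parse the referenced stylesheet
+# result = etree.XSLT(xsl_tree)(doc)
+# pi.set('href', 'other.xsl') # rewrite the href pseudo-attribute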
diff --git a/src/lxml/xsltext.pxi b/src/lxml/xsltext.pxi
new file mode 100644
index 0000000..c98ae1f
--- /dev/null
+++ b/src/lxml/xsltext.pxi
@@ -0,0 +1,242 @@
+# XSLT extension elements
+
+cdef class XSLTExtension:
+ u"""Base class of an XSLT extension element.
+ """
+ def execute(self, context, self_node, input_node, output_parent):
+ u"""execute(self, context, self_node, input_node, output_parent)
+ Execute this extension element.
+
+ Subclasses must override this method. They may append
+ elements to the `output_parent` element here, or set its text
+ content. To this end, the `input_node` provides read-only
+ access to the current node in the input document, and the
+ `self_node` points to the extension element in the stylesheet.
+
+ Note that the `output_parent` parameter may be `None` if there
+ is no parent element in the current context (e.g. no content
+ was added to the output tree yet).
+ """
+ pass
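+
+ # Illustrative sketch of a minimal subclass (namespace URI and element name
+ # are made up for the example):
+ #
+ # class MyExtElement(etree.XSLTExtension):
+ # def execute(self, context, self_node, input_node, output_parent):
+ # output_parent.text = "X" * len(input_node)
+ #
+ # extensions = {('http://example.com/myns', 'ext'): MyExtElement()}
+ # transform = etree.XSLT(xsl_tree, extensions=extensions)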
+
+ def apply_templates(self, _XSLTContext context not None, node, output_parent=None,
+ *, elements_only=False, remove_blank_text=False):
+ u"""apply_templates(self, context, node, output_parent=None, elements_only=False, remove_blank_text=False)
+
+ Call this method to retrieve the result of applying templates
+ to an element.
+
+ The return value is a list of elements or text strings that
+ were generated by the XSLT processor. If you pass
+ ``elements_only=True``, strings will be discarded from the result
+ list. The option ``remove_blank_text=True`` will only discard
+ strings that consist entirely of whitespace (e.g. formatting).
+ These options do not apply to Elements, only to bare string results.
+
+ If you pass an Element as `output_parent` parameter, the result
+ will instead be appended to the element (including attributes
+ etc.) and the return value will be `None`. This is a safe way
+ to generate content into the output document directly, without
+ having to take care of special values like text or attributes.
+ Note that the string discarding options will be ignored in this
+ case.
+ """
+ cdef xmlNode* c_parent
+ cdef xmlNode* c_node
+ cdef xmlNode* c_context_node
+ assert context._xsltCtxt is not NULL, "XSLT context not initialised"
+ c_context_node = _roNodeOf(node)
+ #assert c_context_node.doc is context._xsltContext.node.doc, \
+ # "switching input documents during transformation is not currently supported"
+
+ if output_parent is not None:
+ c_parent = _nonRoNodeOf(output_parent)
+ else:
+ c_parent = tree.xmlNewDocNode(
+ context._xsltCtxt.output, NULL, <unsigned char*>"fake-parent", NULL)
+
+ c_node = context._xsltCtxt.insert
+ context._xsltCtxt.insert = c_parent
+ xslt.xsltProcessOneNode(
+ context._xsltCtxt, c_context_node, NULL)
+ context._xsltCtxt.insert = c_node
+
+ if output_parent is not None:
+ return None
+
+ try:
+ return self._collectXSLTResultContent(
+ context, c_parent, elements_only, remove_blank_text)
+ finally:
+ # free all intermediate nodes that will not be freed by proxies
+ tree.xmlFreeNode(c_parent)
+
+ def process_children(self, _XSLTContext context not None, output_parent=None,
+ *, elements_only=False, remove_blank_text=False):
+ u"""process_children(self, context, output_parent=None, elements_only=False, remove_blank_text=False)
+
+ Call this method to process the XSLT content of the extension
+ element itself.
+
+ The return value is a list of elements or text strings that
+ were generated by the XSLT processor. If you pass
+ ``elements_only=True``, strings will be discarded from the result
+ list. The option ``remove_blank_text=True`` will only discard
+ strings that consist entirely of whitespace (e.g. formatting).
+ These options do not apply to Elements, only to bare string results.
+
+ If you pass an Element as `output_parent` parameter, the result
+ will instead be appended to the element (including attributes
+ etc.) and the return value will be `None`. This is a safe way
+ to generate content into the output document directly, without
+ having to take care of special values like text or attributes.
+ Note that the string discarding options will be ignored in this
+ case.
+ """
+ cdef xmlNode* c_parent
+ cdef xslt.xsltTransformContext* c_ctxt = context._xsltCtxt
+ cdef xmlNode* c_old_output_parent = c_ctxt.insert
+ assert context._xsltCtxt is not NULL, "XSLT context not initialised"
+
+ # output_parent node is used for adding results instead of
+ # elements list used in apply_templates, that's easier and allows to
+ # use attributes added to extension element with <xsl:attribute>.
+
+ if output_parent is not None:
+ c_parent = _nonRoNodeOf(output_parent)
+ else:
+ c_parent = tree.xmlNewDocNode(
+ context._xsltCtxt.output, NULL, <unsigned char*>"fake-parent", NULL)
+
+ c_ctxt.insert = c_parent
+ xslt.xsltApplyOneTemplate(c_ctxt,
+ c_ctxt.node, c_ctxt.inst.children, NULL, NULL)
+ c_ctxt.insert = c_old_output_parent
+
+ if output_parent is not None:
+ return None
+
+ try:
+ return self._collectXSLTResultContent(
+ context, c_parent, elements_only, remove_blank_text)
+ finally:
+ # free all intermediate nodes that will not be freed by proxies
+ tree.xmlFreeNode(c_parent)
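+
+ # Illustrative sketch: inside a subclass's execute() one might call
+ #
+ # results = self.apply_templates(context, input_node, elements_only=True)
+ # self.process_children(context, output_parent=output_parent)
+ #
+ # to collect the template results for the input node, and to copy the
+ # extension element's own XSLT content straight into the output tree.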
+
+ cdef _collectXSLTResultContent(self, _XSLTContext context, xmlNode* c_parent,
+ bint elements_only, bint remove_blank_text):
+ cdef xmlNode* c_node
+ cdef xmlNode* c_next
+ cdef _ReadOnlyProxy proxy
+ cdef list results = [] # or maybe _collectAttributes(c_parent, 2) ?
+ c_node = c_parent.children
+ while c_node is not NULL:
+ c_next = c_node.next
+ if c_node.type == tree.XML_TEXT_NODE:
+ if not elements_only:
+ s = funicode(c_node.content)
+ if not remove_blank_text or s.strip():
+ results.append(s)
+ s = None
+ elif c_node.type == tree.XML_ELEMENT_NODE:
+ proxy = _newReadOnlyProxy(
+ context._extension_element_proxy, c_node)
+ results.append(proxy)
+ # unlink node and make sure it will be freed later on
+ tree.xmlUnlinkNode(c_node)
+ proxy.free_after_use()
+ else:
+ raise TypeError, \
+ f"unsupported XSLT result type: {c_node.type}"
+ c_node = c_next
+ return results
+
+
+cdef _registerXSLTExtensions(xslt.xsltTransformContext* c_ctxt,
+ extension_dict):
+ for ns_utf, name_utf in extension_dict:
+ xslt.xsltRegisterExtElement(
+ c_ctxt, _xcstr(name_utf), _xcstr(ns_utf),
+ <xslt.xsltTransformFunction>_callExtensionElement)
+
+cdef void _callExtensionElement(xslt.xsltTransformContext* c_ctxt,
+ xmlNode* c_context_node,
+ xmlNode* c_inst_node,
+ void* dummy) with gil:
+ cdef _XSLTContext context
+ cdef XSLTExtension extension
+ cdef python.PyObject* dict_result
+ cdef xmlNode* c_node
+ cdef _ReadOnlyProxy context_node = None, self_node = None
+ cdef object output_parent # not restricted to ro-nodes
+ c_uri = _getNs(c_inst_node)
+ if c_uri is NULL:
+ # not allowed, and should never happen
+ return
+ if c_ctxt.xpathCtxt.userData is NULL:
+ # just for safety, should never happen
+ return
+ context = <_XSLTContext>c_ctxt.xpathCtxt.userData
+ try:
+ try:
+ dict_result = python.PyDict_GetItem(
+ context._extension_elements, (c_uri, c_inst_node.name))
+ if dict_result is NULL:
+ raise KeyError, f"extension element {funicode(c_inst_node.name)} not found"
+ extension = <object>dict_result
+
+ try:
+ # build the context proxy nodes
+ self_node = _newReadOnlyProxy(None, c_inst_node)
+ if _isElement(c_ctxt.insert):
+ output_parent = _newAppendOnlyProxy(self_node, c_ctxt.insert)
+ else:
+ # may be the document node or other stuff
+ output_parent = _newOpaqueAppendOnlyNodeWrapper(c_ctxt.insert)
+ if c_context_node.type in (tree.XML_DOCUMENT_NODE,
+ tree.XML_HTML_DOCUMENT_NODE):
+ c_node = tree.xmlDocGetRootElement(<xmlDoc*>c_context_node)
+ if c_node is not NULL:
+ context_node = _newReadOnlyProxy(self_node, c_node)
+ else:
+ context_node = None
+ elif c_context_node.type in (tree.XML_ATTRIBUTE_NODE,
+ tree.XML_TEXT_NODE,
+ tree.XML_CDATA_SECTION_NODE):
+ # this isn't easy to support using read-only
+ # nodes, as the smart-string factory must
+ # instantiate the parent proxy somehow...
+ raise TypeError(f"Unsupported element type: {c_context_node.type}")
+ else:
+ context_node = _newReadOnlyProxy(self_node, c_context_node)
+
+ # run the XSLT extension
+ context._extension_element_proxy = self_node
+ extension.execute(context, self_node, context_node, output_parent)
+ finally:
+ context._extension_element_proxy = None
+ if self_node is not None:
+ _freeReadOnlyProxies(self_node)
+ except Exception as e:
+ try:
+ e = unicode(e).encode(u"UTF-8")
+ except:
+ e = repr(e).encode(u"UTF-8")
+ message = python.PyBytes_FromFormat(
+ "Error executing extension element '%s': %s",
+ c_inst_node.name, _cstr(e))
+ xslt.xsltTransformError(c_ctxt, NULL, c_inst_node, "%s", message)
+ context._exc._store_raised()
+ except:
+ # just in case
+ message = python.PyBytes_FromFormat(
+ "Error executing extension element '%s'", c_inst_node.name)
+ xslt.xsltTransformError(c_ctxt, NULL, c_inst_node, "%s", message)
+ context._exc._store_raised()
+ except:
+ # no Python functions here - everything can fail...
+ xslt.xsltTransformError(c_ctxt, NULL, c_inst_node,
+ "Error during XSLT extension element evaluation")
+ context._exc._store_raised()
+ finally:
+ return # swallow any further exceptions
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..dd05cf8
--- /dev/null
+++ b/test.py
@@ -0,0 +1,621 @@
+#!/usr/bin/env python
+#
+# SchoolTool - common information systems platform for school administration
+# Copyright (c) 2003 Shuttleworth Foundation
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+#
+"""
+SchoolTool test runner.
+
+Syntax: test.py [options] [pathname-regexp [test-regexp]]
+
+There are two kinds of tests:
+ - unit tests (or programmer tests) test the internal workings of various
+ components of the system
+ - functional tests (acceptance tests, customer tests) test only externally
+ visible system behaviour
+
+You can choose to run unit tests (this is the default mode), functional tests
+(by giving a -f option to test.py) or both (by giving both -u and -f options).
+
+Test cases are located in the directory tree starting at the location of this
+script, in subdirectories named 'tests' for unit tests and 'ftests' for
+functional tests, in Python modules named 'test*.py'. They are then filtered
+according to pathname and test regexes. Alternatively, packages may just have
+'tests.py' and 'ftests.py' instead of subpackages 'tests' and 'ftests'
+respectively.
+
+A leading "!" in a regexp is stripped and negates the regexp. Pathname
+regexp is applied to the whole path (package/package/module.py). Test regexp
+is applied to a full test id (package.package.module.class.test_method).
+
+Options:
+ -h print this help message
+ -v verbose (print dots for each test run)
+ -vv very verbose (print test names)
+ -q quiet (do not print anything on success)
+ -w enable warnings about omitted test cases
+ -p show progress bar (can be combined with -v or -vv)
+ -u select unit tests (default)
+ -f select functional tests
+ --level n select only tests at level n or lower
+ --all-levels select all tests
+ --list-files list all selected test files
+ --list-tests list all selected test cases
+ --list-hooks list all loaded test hooks
+ --coverage create code coverage reports
+"""
+#
+# This script borrows ideas from Zope 3's test runner heavily. It is smaller
+# and cleaner though, at the expense of more limited functionality.
+#
+
+import re
+import os
+import sys
+import time
+import types
+import getopt
+import unittest
+import traceback
+
+try:
+ # Python >=2.7 and >=3.2
+ from unittest.runner import _TextTestResult
+except ImportError:
+ from unittest import _TextTestResult
+
+__metaclass__ = type
+
+def stderr(text):
+ sys.stderr.write(text)
+ sys.stderr.write("\n")
+
+class Options:
+ """Configurable properties of the test runner."""
+
+ # test location
+ basedir = '' # base directory for tests (defaults to
+ # basedir of argv[0] + 'src'), must be absolute
+ follow_symlinks = True # should symlinks to subdirectories be
+ # followed? (hardcoded, may cause loops)
+
+ # which tests to run
+ unit_tests = False # unit tests (default if both are false)
+ functional_tests = False # functional tests
+
+ # test filtering
+ level = 1 # run only tests at this or lower level
+ # (if None, runs all tests)
+ pathname_regex = '' # regexp for filtering filenames
+ test_regex = '' # regexp for filtering test cases
+
+ # actions to take
+ list_files = False # --list-files
+ list_tests = False # --list-tests
+ list_hooks = False # --list-hooks
+ run_tests = True # run tests (disabled by --list-foo)
+
+ # output verbosity
+ verbosity = 0 # verbosity level (-v)
+ quiet = 0 # do not print anything on success (-q)
+ warn_omitted = False # produce warnings when a test case is
+ # not included in a test suite (-w)
+ progress = False # show running progress (-p)
+ coverage = False # produce coverage reports (--coverage)
+ coverdir = 'coverage' # where to put them (currently hardcoded)
+ immediate_errors = False # show tracebacks twice (currently hardcoded)
+ screen_width = 80 # screen width (autodetected)
+
+
+def compile_matcher(regex):
+ """Returns a function that takes one argument and returns True or False.
+
+ Regex is a regular expression. Empty regex matches everything. There
+ is one exception: if the regex starts with "!", the meaning of the
+ match is reversed.
+ """
+ if not regex:
+ return lambda x: True
+ elif regex == '!':
+ return lambda x: False
+ elif regex.startswith('!'):
+ rx = re.compile(regex[1:])
+ return lambda x: rx.search(x) is None
+ else:
+ rx = re.compile(regex)
+ return lambda x: rx.search(x) is not None
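+
+# Worked examples (illustrative):
+# compile_matcher('')('anything') -> True (empty regex matches everything)
+# compile_matcher('bar$')('foobar') -> True
+# compile_matcher('!^foo')('foobar') -> False (leading "!" negates the match)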
+
+
+def walk_with_symlinks(top, func, arg):
+ """Like os.path.walk, but follows symlinks on POSIX systems.
+
+ If the symlinks create a loop, this function will never finish.
+ """
+ try:
+ names = os.listdir(top)
+ except os.error:
+ return
+ func(arg, top, names)
+ exceptions = ('.', '..')
+ for name in names:
+ if name not in exceptions:
+ name = os.path.join(top, name)
+ if os.path.isdir(name):
+ walk_with_symlinks(name, func, arg)
+
+
+def get_test_files(cfg):
+ """Returns a list of test module filenames."""
+ matcher = compile_matcher(cfg.pathname_regex)
+ results = []
+ test_names = []
+ if cfg.unit_tests:
+ test_names.append('tests')
+ if cfg.functional_tests:
+ test_names.append('ftests')
+ baselen = len(cfg.basedir) + 1
+ def visit(ignored, dir, files):
+ if os.path.basename(dir) not in test_names:
+ for name in test_names:
+ if name + '.py' in files:
+ path = os.path.join(dir, name + '.py')
+ if matcher(path[baselen:]):
+ results.append(path)
+ return
+ if '__init__.py' not in files:
+ stderr("%s is not a package" % dir)
+ return
+ for file in files:
+ if file.startswith('test') and file.endswith('.py'):
+ path = os.path.join(dir, file)
+ if matcher(path[baselen:]):
+ results.append(path)
+ if cfg.follow_symlinks:
+ walker = walk_with_symlinks
+ else:
+ walker = os.path.walk
+ walker(cfg.basedir, visit, None)
+ results.sort()
+ return results
+
+
+def import_module(filename, cfg, cov=None):
+ """Imports and returns a module."""
+ filename = os.path.splitext(filename)[0]
+ modname = filename[len(cfg.basedir):].replace(os.path.sep, '.')
+ if modname.startswith('.'):
+ modname = modname[1:]
+ if cov is not None:
+ cov.start()
+ mod = __import__(modname)
+ if cov is not None:
+ cov.stop()
+ components = modname.split('.')
+ for comp in components[1:]:
+ mod = getattr(mod, comp)
+ return mod
+
+
+def filter_testsuite(suite, matcher, level=None):
+ """Returns a flattened list of test cases that match the given matcher."""
+ if not isinstance(suite, unittest.TestSuite):
+ raise TypeError('not a TestSuite', suite)
+ results = []
+ for test in suite._tests:
+ if level is not None and getattr(test, 'level', 0) > level:
+ continue
+ if isinstance(test, unittest.TestCase):
+ testname = test.id() # package.module.class.method
+ if matcher(testname):
+ results.append(test)
+ else:
+ filtered = filter_testsuite(test, matcher, level)
+ results.extend(filtered)
+ return results
+
+
+def get_all_test_cases(module):
+ """Returns a list of all test case classes defined in a given module."""
+ results = []
+ for name in dir(module):
+ if not name.startswith('Test'):
+ continue
+ item = getattr(module, name)
+ # types.ClassType only exists on Python 2; fall back to 'type' on Python 3
+ if (isinstance(item, (type, getattr(types, 'ClassType', type))) and
+ issubclass(item, unittest.TestCase)):
+ results.append(item)
+ return results
+
+
+def get_test_classes_from_testsuite(suite):
+ """Returns a set of test case classes used in a test suite."""
+ if not isinstance(suite, unittest.TestSuite):
+ raise TypeError('not a TestSuite', suite)
+ results = set()
+ for test in suite._tests:
+ if isinstance(test, unittest.TestCase):
+ results.add(test.__class__)
+ else:
+ classes = get_test_classes_from_testsuite(test)
+ results.update(classes)
+ return results
+
+
+def get_test_cases(test_files, cfg, cov=None):
+ """Returns a list of test cases from a given list of test modules."""
+ matcher = compile_matcher(cfg.test_regex)
+ results = []
+ for file in test_files:
+ module = import_module(file, cfg, cov=cov)
+ if cov is not None:
+ cov.start()
+ test_suite = module.test_suite()
+ if cov is not None:
+ cov.stop()
+ if test_suite is None:
+ continue
+ if cfg.warn_omitted:
+ all_classes = set(get_all_test_cases(module))
+ classes_in_suite = get_test_classes_from_testsuite(test_suite)
+ difference = all_classes - classes_in_suite
+ for test_class in difference:
+ # surround the warning with blank lines, otherwise it tends
+ # to get lost in the noise
+ stderr("\n%s: WARNING: %s not in test suite\n"
+ % (file, test_class.__name__))
+ if (cfg.level is not None and
+ getattr(test_suite, 'level', 0) > cfg.level):
+ continue
+ filtered = filter_testsuite(test_suite, matcher, cfg.level)
+ results.extend(filtered)
+ return results
+
+
+def get_test_hooks(test_files, cfg, cov=None):
+ """Returns a list of test hooks from a given list of test modules."""
+ results = []
+ dirs = set(map(os.path.dirname, test_files))
+ for dir in list(dirs):
+ if os.path.basename(dir) == 'ftests':
+ dirs.add(os.path.join(os.path.dirname(dir), 'tests'))
+ dirs = list(dirs)
+ dirs.sort()
+ for dir in dirs:
+ filename = os.path.join(dir, 'checks.py')
+ if os.path.exists(filename):
+ module = import_module(filename, cfg, cov=cov)
+ if cov is not None:
+ cov.start()
+ hooks = module.test_hooks()
+ if cov is not None:
+ cov.stop()
+ results.extend(hooks)
+ return results
+
+
+class CustomTestResult(_TextTestResult):
+ """Customised TestResult.
+
+ It can show a progress bar, and displays tracebacks for errors and failures
+ as soon as they happen, in addition to listing them all at the end.
+ """
+
+ __super = _TextTestResult
+ __super_init = __super.__init__
+ __super_startTest = __super.startTest
+ __super_stopTest = __super.stopTest
+ __super_printErrors = __super.printErrors
+
+ def __init__(self, stream, descriptions, verbosity, count, cfg, hooks):
+ self.__super_init(stream, descriptions, verbosity)
+ self.count = count
+ self.cfg = cfg
+ self.hooks = hooks
+ if cfg.progress:
+ self.dots = False
+ self._lastWidth = 0
+ self._maxWidth = cfg.screen_width - len("xxxx/xxxx (xxx.x%): ") - 1
+
+ def startTest(self, test):
+ if self.cfg.progress:
+ # verbosity == 0: 'xxxx/xxxx (xxx.x%)'
+ # verbosity == 1: 'xxxx/xxxx (xxx.x%): test name'
+ # verbosity >= 2: 'xxxx/xxxx (xxx.x%): test name ... ok'
+ n = self.testsRun + 1
+ self.stream.write("\r%4d" % n)
+ if self.count:
+ self.stream.write("/%d (%5.1f%%)"
+ % (self.count, n * 100.0 / self.count))
+ if self.showAll: # self.cfg.verbosity == 1
+ self.stream.write(": ")
+ elif self.cfg.verbosity:
+ name = self.getShortDescription(test)
+ width = len(name)
+ if width < self._lastWidth:
+ name += " " * (self._lastWidth - width)
+ self.stream.write(": %s" % name)
+ self._lastWidth = width
+ self.stream.flush()
+ self.__super_startTest(test)
+ for hook in self.hooks:
+ hook.startTest(test)
+
+ def stopTest(self, test):
+ for hook in self.hooks:
+ hook.stopTest(test)
+ self.__super_stopTest(test)
+
+ def getShortDescription(self, test):
+ s = self.getDescription(test)
+ if len(s) > self._maxWidth:
+ # s is 'testname (package.module.class)'
+ # try to shorten it to 'testname (...age.module.class)'
+ # if it is still too long, shorten it to 'testnam...'
+ # limit case is 'testname (...)'
+ pos = s.find(" (")
+ if pos + len(" (...)") > self._maxWidth:
+ s = s[:self._maxWidth - 3] + "..."
+ else:
+ s = "%s...%s" % (s[:pos + 2], s[pos + 5 - self._maxWidth:])
+ return s
+
+ def printErrors(self):
+ if self.cfg.progress and not (self.dots or self.showAll):
+ self.stream.writeln()
+ self.__super_printErrors()
+
+ def formatError(self, err):
+ return "".join(traceback.format_exception(*err))
+
+ def printTraceback(self, kind, test, err):
+ self.stream.writeln()
+ self.stream.writeln()
+ self.stream.writeln("%s: %s" % (kind, test))
+ self.stream.writeln(self.formatError(err))
+ self.stream.writeln()
+
+ def addFailure(self, test, err):
+ if self.cfg.immediate_errors:
+ self.printTraceback("FAIL", test, err)
+ self.failures.append((test, self.formatError(err)))
+
+ def addError(self, test, err):
+ if self.cfg.immediate_errors:
+ self.printTraceback("ERROR", test, err)
+ self.errors.append((test, self.formatError(err)))
+
+
+class CustomTestRunner(unittest.TextTestRunner):
+ """Customised TestRunner.
+
+ See CustomTestResult for a list of extensions.
+ """
+
+ __super = unittest.TextTestRunner
+ __super_init = __super.__init__
+ __super_run = __super.run
+
+ def __init__(self, cfg, hooks=None):
+ self.__super_init(verbosity=cfg.verbosity)
+ self.cfg = cfg
+ if hooks is not None:
+ self.hooks = hooks
+ else:
+ self.hooks = []
+
+ def run(self, test):
+ """Run the given test case or test suite."""
+ self.count = test.countTestCases()
+ result = self._makeResult()
+ startTime = time.time()
+ test(result)
+ stopTime = time.time()
+ timeTaken = float(stopTime - startTime)
+ result.printErrors()
+ run = result.testsRun
+ if not self.cfg.quiet:
+ self.stream.writeln(result.separator2)
+ self.stream.writeln("Ran %d test%s in %.3fs" %
+ (run, run != 1 and "s" or "", timeTaken))
+ self.stream.writeln()
+ if not result.wasSuccessful():
+ self.stream.write("FAILED (")
+ failed, errored = list(map(len, (result.failures, result.errors)))
+ if failed:
+ self.stream.write("failures=%d" % failed)
+ if errored:
+ if failed: self.stream.write(", ")
+ self.stream.write("errors=%d" % errored)
+ self.stream.writeln(")")
+ elif not self.cfg.quiet:
+ self.stream.writeln("OK")
+ return result
+
+ def _makeResult(self):
+ return CustomTestResult(self.stream, self.descriptions, self.verbosity,
+ cfg=self.cfg, count=self.count,
+ hooks=self.hooks)
+
+
+def main(argv):
+ """Main program."""
+
+ # Environment
+ if sys.version_info < (2, 7):
+ stderr('%s: need Python 2.7 or later' % argv[0])
+ stderr('your python is %s' % sys.version)
+ return 1
+
+ # Defaults
+ cfg = Options()
+ cfg.basedir = os.path.join(os.path.dirname(argv[0]), 'src')
+ cfg.basedir = os.path.abspath(cfg.basedir)
+
+ # Figure out terminal size
+ try:
+ import curses
+ except ImportError:
+ pass
+ else:
+ try:
+ curses.setupterm()
+ cols = curses.tigetnum('cols')
+ if cols > 0:
+ cfg.screen_width = cols
+ except (curses.error, TypeError):
+ # tigetnum() is broken in PyPy3 and raises TypeError
+ pass
+
+ # Option processing
+ opts, args = getopt.gnu_getopt(argv[1:], 'hvpqufw',
+ ['list-files', 'list-tests', 'list-hooks',
+ 'level=', 'all-levels', 'coverage'])
+ for k, v in opts:
+ if k == '-h':
+ print(__doc__)
+ return 0
+ elif k == '-v':
+ cfg.verbosity += 1
+ cfg.quiet = False
+ elif k == '-p':
+ cfg.progress = True
+ cfg.quiet = False
+ elif k == '-q':
+ cfg.verbosity = 0
+ cfg.progress = False
+ cfg.quiet = True
+ elif k == '-u':
+ cfg.unit_tests = True
+ elif k == '-f':
+ cfg.functional_tests = True
+ elif k == '-w':
+ cfg.warn_omitted = True
+ elif k == '--list-files':
+ cfg.list_files = True
+ cfg.run_tests = False
+ elif k == '--list-tests':
+ cfg.list_tests = True
+ cfg.run_tests = False
+ elif k == '--list-hooks':
+ cfg.list_hooks = True
+ cfg.run_tests = False
+ elif k == '--coverage':
+ cfg.coverage = True
+ elif k == '--level':
+ try:
+ cfg.level = int(v)
+ except ValueError:
+ stderr('%s: invalid level: %s' % (argv[0], v))
+ stderr('run %s -h for help' % argv[0])
+ return 1
+ elif k == '--all-levels':
+ cfg.level = None
+ else:
+ stderr('%s: invalid option: %s' % (argv[0], k))
+ stderr('run %s -h for help' % argv[0])
+ return 1
+ if args:
+ cfg.pathname_regex = args[0]
+ if len(args) > 1:
+ cfg.test_regex = args[1]
+ if len(args) > 2:
+ stderr('%s: too many arguments: %s' % (argv[0], args[2]))
+ stderr('run %s -h for help' % argv[0])
+ return 1
+ if not cfg.unit_tests and not cfg.functional_tests:
+ cfg.unit_tests = True
+
+ # Set up the python path
+ sys.path[0] = cfg.basedir
+
+ # Set up tracing before we start importing things
+ cov = None
+ if cfg.run_tests and cfg.coverage:
+ from coverage import coverage
+ cov = coverage(omit=['test.py'])
+
+ # Finding and importing
+ test_files = get_test_files(cfg)
+
+ if cov is not None:
+ cov.start()
+ if cfg.list_tests or cfg.run_tests:
+ test_cases = get_test_cases(test_files, cfg, cov=cov)
+ if cfg.list_hooks or cfg.run_tests:
+ test_hooks = get_test_hooks(test_files, cfg, cov=cov)
+
+ # Configure the logging module
+ import logging
+ logging.basicConfig()
+ logging.root.setLevel(logging.CRITICAL)
+
+ # Running
+ success = True
+ if cfg.list_files:
+ baselen = len(cfg.basedir) + 1
+ print("\n".join([fn[baselen:] for fn in test_files]))
+ if cfg.list_tests:
+ print("\n".join([test.id() for test in test_cases]))
+ if cfg.list_hooks:
+ print("\n".join([str(hook) for hook in test_hooks]))
+ if cfg.run_tests:
+ runner = CustomTestRunner(cfg, test_hooks)
+ suite = unittest.TestSuite()
+ suite.addTests(test_cases)
+ if cov is not None:
+ cov.start()
+ run_result = runner.run(suite)
+ if cov is not None:
+ cov.stop()
+ success = run_result.wasSuccessful()
+ del run_result
+
+ if cov is not None:
+ traced_file_types = ('.py', '.pyx', '.pxi', '.pxd')
+ modules = []
+
+ def add_file(_, path, files):
+ if 'tests' in os.path.relpath(path, cfg.basedir).split(os.sep):
+ return
+ for filename in files:
+ if filename.endswith(traced_file_types):
+ modules.append(os.path.join(path, filename))
+
+ if cfg.follow_symlinks:
+ walker = walk_with_symlinks
+ else:
+ walker = os.path.walk
+ walker(os.path.abspath(cfg.basedir), add_file, None)
+
+ try:
+ cov.xml_report(modules, outfile='coverage.xml')
+ if cfg.coverdir:
+ cov.html_report(modules, directory=cfg.coverdir)
+ finally:
+ # test runs can take a while, so at least try to print something
+ cov.report()
+
+ # That's all
+ if success:
+ return 0
+ else:
+ return 1
+
+
+if __name__ == '__main__':
+ exitcode = main(sys.argv)
+ sys.exit(exitcode)
diff --git a/tools/manylinux/build-wheels.sh b/tools/manylinux/build-wheels.sh
new file mode 100755
index 0000000..65d7602
--- /dev/null
+++ b/tools/manylinux/build-wheels.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+#
+# Called inside the manylinux image
+echo "Started $0 $@"
+
+set -e -x
+REQUIREMENTS=/io/requirements.txt
+[ -n "$WHEELHOUSE" ] || WHEELHOUSE=wheelhouse
+SDIST=$1
+PACKAGE=$(basename ${SDIST%-*})
+SDIST_PREFIX=$(basename ${SDIST%%.tar.gz})
+
+build_wheel() {
+ pybin="$1"
+ source="$2"
+ [ -n "$source" ] || source=/io
+
+ env STATIC_DEPS=true \
+ LDFLAGS="$LDFLAGS -fPIC" \
+ CFLAGS="$CFLAGS -fPIC" \
+ ${pybin}/pip \
+ wheel \
+ "$source" \
+ -w /io/$WHEELHOUSE
+}
+
+run_tests() {
+ # Install packages and test
+ for PYBIN in /opt/python/*/bin/; do
+ ${PYBIN}/python -m pip install $PACKAGE --no-index -f /io/$WHEELHOUSE || exit 1
+
+ # check import as a quick test
+ (cd $HOME; ${PYBIN}/python -c 'import lxml.etree, lxml.objectify')
+ done
+}
+
+prepare_system() {
+ #yum install -y zlib-devel
+ #rm -fr /opt/python/cp34-*
+ echo "Python versions found: $(cd /opt/python && echo cp* | sed -e 's|[^ ]*-||g')"
+ ${CC:-gcc} --version
+}
+
+build_wheels() {
+ # Compile wheels for all python versions
+ test -e "$SDIST" && source="$SDIST" || source=
+ FIRST=
+ SECOND=
+ THIRD=
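+ # Throttle parallelism: each loop iteration starts one build in the
+ # background and then waits for the build started two iterations earlier
+ # (one iteration earlier on aarch64), so at most three (two on aarch64)
+ # wheel builds run concurrently.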
+ for PYBIN in /opt/python/*/bin; do
+ # Install build requirements if we need them and file exists
+ test -n "$source" -o ! -e "$REQUIREMENTS" \
+ || ${PYBIN}/python -m pip install -r "$REQUIREMENTS"
+
+ echo "Starting build with $($PYBIN/python -V)"
+ build_wheel "$PYBIN" "$source" &
+ THIRD=$!
+
+ [ -z "$FIRST" ] || wait ${FIRST}
+ if [ "$(uname -m)" == "aarch64" ]; then FIRST=$THIRD; else FIRST=$SECOND; fi
+ SECOND=$THIRD
+ done
+ wait || exit 1
+}
+
+repair_wheels() {
+ # Bundle external shared libraries into the wheels
+ for whl in /io/$WHEELHOUSE/${SDIST_PREFIX}-*.whl; do
+ auditwheel repair $whl -w /io/$WHEELHOUSE || exit 1
+ done
+}
+
+show_wheels() {
+ ls -l /io/$WHEELHOUSE/${SDIST_PREFIX}-*.whl
+}
+
+prepare_system
+build_wheels
+repair_wheels
+run_tests
+show_wheels
diff --git a/tools/xpathgrep.py b/tools/xpathgrep.py
new file mode 100644
index 0000000..b566e5f
--- /dev/null
+++ b/tools/xpathgrep.py
@@ -0,0 +1,334 @@
+#!/usr/bin/env python
+
+import sys
+import os.path
+
+def error(message, *args):
+ if args:
+ message = message % args
+ sys.stderr.write('ERROR: %s\n' % message)
+
+try:
+ import lxml.etree as et
+except ImportError:
+ error(sys.exc_info()[1])
+ sys.exit(5)
+
+try:
+ basestring
+except NameError:
+ basestring = (str, bytes)
+
+try:
+ unicode
+except NameError:
+ unicode = str
+
+SHORT_DESCRIPTION = "An XPath file finder for XML files."
+
+__doc__ = SHORT_DESCRIPTION + '''
+
+Evaluates an XPath expression against a series of files and prints the
+matching subtrees to stdout.
+
+Examples::
+
+ $ cat test.xml
+ <root>
+ <a num="1234" notnum="1234abc"/>
+ <b text="abc"/>
+ <c text="aBc"/>
+ <d xmlns="http://www.example.org/ns/example" num="2"/>
+ <d xmlns="http://www.example.org/ns/example" num="4"/>
+ </root>
+
+ # find all leaf elements:
+ $ SCRIPT '//*[not(*)]' test.xml
+ <a num="1234" notnum="1234abc"/>
+ <b text="abc"/>
+ <c text="aBc"/>
+
+ # find all elements with attribute values containing "abc" ignoring case:
+ $ SCRIPT '//*[@*[contains(py:lower(.), "abc")]]' test.xml
+ <a num="1234" notnum="1234abc"/>
+ <b text="abc"/>
+ <c text="aBc"/>
+
+ # find all numeric attribute values:
+ $ SCRIPT '//@*[re:match(., "^[0-9]+$")]' test.xml
+ 1234
+
+ # find all elements with numeric attribute values:
+ $ SCRIPT '//*[@*[re:match(., "^[0-9]+$")]]' test.xml
+ <a num="1234" notnum="1234abc"/>
+
+ # find all elements with numeric attribute values in more than one file:
+ $ SCRIPT '//*[@*[re:match(., "^[0-9]+$")]]' test.xml test.xml test.xml
+ >> test.xml
+ <a num="1234" notnum="1234abc"/>
+ >> test.xml
+ <a num="1234" notnum="1234abc"/>
+ >> test.xml
+ <a num="1234" notnum="1234abc"/>
+
+ # find XML files that have non-empty root nodes:
+ $ SCRIPT -q '*' test.xml test.xml test.xml
+ >> test.xml
+ >> test.xml
+ >> test.xml
+
+ # find out if an XML file has at most depth three:
+ $ SCRIPT 'not(/*/*/*)' test.xml
+ True
+
+ # find all elements that belong to a specific namespace and have @num=2:
+ $ SCRIPT --ns e=http://www.example.org/ns/example '//e:*[@num="2"]' test.xml
+ <d xmlns="http://www.example.org/ns/example" num="2"/>
+
+By default, all Python builtins and string methods are available as
+XPath functions through the ``py`` prefix. There is also a string
+comparison function ``py:within(x, a, b)`` that tests the string x for
+being lexicographically within the interval ``a <= x <= b``.
+'''.replace('SCRIPT', os.path.basename(sys.argv[0]))
+
+REGEXP_NS = "http://exslt.org/regular-expressions"
+PYTHON_BUILTINS_NS = "PYTHON-BUILTINS"
+
+def make_parser(remove_blank_text=True, **kwargs):
+ return et.XMLParser(remove_blank_text=remove_blank_text, **kwargs)
+
+def print_result(result, pretty_print, encoding=None, _is_py3=sys.version_info[0] >= 3):
+ stdout = sys.stdout
+ if not stdout.isatty() and not encoding:
+ encoding = 'utf8'
+ if et.iselement(result):
+ result = et.tostring(result, xml_declaration=False, with_tail=False,
+ pretty_print=pretty_print, encoding=encoding)
+ if not pretty_print:
+ # pretty printing appends newline, otherwise we do it
+ if isinstance(result, unicode):
+ result += '\n'
+ else:
+ result += '\n'.encode('ascii')
+ elif isinstance(result, basestring):
+ result += '\n'
+ else:
+ result = '%r\n' % result # '%r' for better number formatting
+
+ if encoding and encoding != 'unicode' and isinstance(result, unicode):
+ result = result.encode(encoding)
+
+ if _is_py3 and not isinstance(result, unicode):
+ stdout.buffer.write(result)
+ else:
+ stdout.write(result)
+
+def print_results(results, pretty_print):
+ if isinstance(results, list):
+ for result in results:
+ print_result(result, pretty_print)
+ else:
+ print_result(results, pretty_print)
+
+def iter_input(input, filename, parser, line_by_line):
+ if isinstance(input, basestring):
+ with open(input, 'rb') as f:
+ for tree in iter_input(f, filename, parser, line_by_line):
+ yield tree
+ else:
+ try:
+ if line_by_line:
+ for line in input:
+ if line:
+ yield et.ElementTree(et.fromstring(line, parser))
+ else:
+ yield et.parse(input, parser)
+ except IOError:
+ e = sys.exc_info()[1]
+ error("parsing %r failed: %s: %s",
+ filename, e.__class__.__name__, e)
+
+def find_in_file(f, xpath, print_name=True, xinclude=False, pretty_print=True, line_by_line=False,
+ encoding=None, verbose=True):
+ try:
+ filename = f.name
+ except AttributeError:
+ filename = f
+
+ xml_parser = et.XMLParser(encoding=encoding)
+
+ try:
+ if not callable(xpath):
+ xpath = et.XPath(xpath)
+
+ found = False
+ for tree in iter_input(f, filename, xml_parser, line_by_line):
+ try:
+ if xinclude:
+ tree.xinclude()
+ except IOError:
+ e = sys.exc_info()[1]
+ error("XInclude for %r failed: %s: %s",
+ filename, e.__class__.__name__, e)
+
+ results = xpath(tree)
+ if results is not None and results != []:
+ found = True
+ if verbose:
+ print_results(results, pretty_print)
+
+ if not found:
+ return False
+ if not verbose and print_name:
+ print(filename)
+ return True
+ except Exception:
+ e = sys.exc_info()[1]
+ error("%r: %s: %s",
+ filename, e.__class__.__name__, e)
+ return False
+
+def register_builtins():
+ ns = et.FunctionNamespace(PYTHON_BUILTINS_NS)
+ tostring = et.tostring
+
+ def make_string(s):
+ if isinstance(s, list):
+ if not s:
+ return ''
+ s = s[0]
+ if not isinstance(s, unicode):
+ if et.iselement(s):
+ s = tostring(s, method="text", encoding='unicode')
+ else:
+ s = unicode(s)
+ return s
+
+ def wrap_builtin(b):
+ def wrapped_builtin(_, *args):
+ return b(*args)
+ return wrapped_builtin
+
+ # __builtins__ is a module in __main__ but a plain dict when imported
+ builtin_dict = __builtins__ if isinstance(__builtins__, dict) else vars(__builtins__)
+ for (name, builtin) in builtin_dict.items():
+ if callable(builtin):
+ if not name.startswith('_') and name == name.lower():
+ ns[name] = wrap_builtin(builtin)
+
+ def wrap_str_method(b):
+ def wrapped_method(_, *args):
+ args = tuple(map(make_string, args))
+ return b(*args)
+ return wrapped_method
+
+ for (name, method) in vars(unicode).items():
+ if callable(method):
+ if not name.startswith('_'):
+ ns[name] = wrap_str_method(method)
+
+ def within(_, s, a, b):
+ return make_string(a) <= make_string(s) <= make_string(b)
+ ns["within"] = within
+
+
+def parse_options():
+ from optparse import OptionParser
+
+ usage = "usage: %prog [options] XPATH [FILE ...]"
+
+ parser = OptionParser(
+ usage = usage,
+ version = "%prog using lxml.etree " + et.__version__,
+ description = SHORT_DESCRIPTION)
+ parser.add_option("-H", "--long-help",
+ action="store_true", dest="long_help", default=False,
+ help="a longer help text including usage examples")
+ parser.add_option("-i", "--xinclude",
+ action="store_true", dest="xinclude", default=False,
+ help="run XInclude on the file before XPath")
+ parser.add_option("--no-python",
+ action="store_false", dest="python", default=True,
+ help="disable Python builtins and functions (prefix 'py')")
+ parser.add_option("--no-regexp",
+ action="store_false", dest="regexp", default=True,
+ help="disable regular expressions (prefix 're')")
+ parser.add_option("-q", "--quiet",
+ action="store_false", dest="verbose", default=True,
+ help="don't print status messages to stdout")
+ parser.add_option("-t", "--root-tag",
+ dest="root_tag", metavar="TAG",
+ help="surround output with <TAG>...</TAG> to produce a well-formed XML document")
+ parser.add_option("-p", "--plain",
+ action="store_false", dest="pretty_print", default=True,
+ help="do not pretty-print the output")
+ parser.add_option("-l", "--lines",
+ action="store_true", dest="line_by_line", default=False,
+ help="parse each line of input separately (e.g. grep output)")
+ parser.add_option("-e", "--encoding",
+ dest="encoding",
+ help="use a specific encoding for parsing (may be required with --lines)")
+ parser.add_option("-N", "--ns", metavar="PREFIX=NS",
+ action="append", dest="namespaces", default=[],
+ help="add a namespace declaration")
+
+ options, args = parser.parse_args()
+
+ if options.long_help:
+ parser.print_help()
+ print(__doc__[__doc__.find('\n\n')+1:])
+ sys.exit(0)
+
+ if len(args) < 1:
+ parser.error("first argument must be an XPath expression")
+
+ return options, args
+
+
+def main(options, args):
+ namespaces = {}
+ if options.regexp:
+ namespaces["re"] = REGEXP_NS
+ if options.python:
+ register_builtins()
+ namespaces["py"] = PYTHON_BUILTINS_NS
+
+ for ns in options.namespaces:
+ prefix, NS = ns.split("=", 1)
+ namespaces[prefix.strip()] = NS.strip()
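+ # e.g. --ns "x=http://www.w3.org/1999/xhtml" (an example prefix/URI pair)
+ # makes //x:p usable in the XPath expression given as the first argument.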
+
+ xpath = et.XPath(args[0], namespaces=namespaces)
+ files = args[1:] or [sys.stdin]
+
+ if options.root_tag and options.verbose:
+ print('<%s>' % options.root_tag)
+
+ found = False
+ print_name = len(files) > 1 and not options.root_tag
+ for input in files:
+ found |= find_in_file(
+ input, xpath,
+ print_name=print_name,
+ xinclude=options.xinclude,
+ pretty_print=options.pretty_print,
+ line_by_line=options.line_by_line,
+ encoding=options.encoding,
+ verbose=options.verbose,
+ )
+
+ if options.root_tag and options.verbose:
+ print('</%s>' % options.root_tag)
+
+ return found
+
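+# Exit status: 0 if at least one match was found, 1 if none was found,
+# 4 if the XPath expression could not be parsed.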
+if __name__ == "__main__":
+ try:
+ options, args = parse_options()
+ found = main(options, args)
+ if found:
+ sys.exit(0)
+ else:
+ sys.exit(1)
+ except et.XPathSyntaxError:
+ error(sys.exc_info()[1])
+ sys.exit(4)
+ except KeyboardInterrupt:
+ pass
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..575d7a1
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,19 @@
+# Tox (http://tox.testrun.org/) is a tool for running tests
+# in multiple virtualenvs. This configuration file will run the
+# test suite on all supported Python versions. To use it, "pip install tox"
+# and then run "tox" from this directory.
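+# A single environment can be run with e.g. "tox -e py38".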
+
+[tox]
+envlist = py27, py35, py36, py37, py38
+
+[testenv]
+setenv =
+ CFLAGS = -g -O0
+commands =
+ {envpython} setup.py clean
+ {envpython} setup.py build_ext --inplace
+ make test
+install_command = pip install {opts} {packages}
+deps =
+ -r{toxinidir}/requirements.txt
+ html5lib
diff --git a/update-error-constants.py b/update-error-constants.py
new file mode 100644
index 0000000..8a83685
--- /dev/null
+++ b/update-error-constants.py
@@ -0,0 +1,157 @@
+#!/usr/bin/env python
+
+from __future__ import print_function, absolute_import
+
+import sys, os, os.path, re, codecs
+
+BUILD_SOURCE_FILE = os.path.join("src", "lxml", "xmlerror.pxi")
+BUILD_DEF_FILE = os.path.join("src", "lxml", "includes", "xmlerror.pxd")
+
+if len(sys.argv) < 2 or sys.argv[1].lower() in ('-h', '--help'):
+ print("This script generates the constants in file %s" % BUILD_SOURCE_FILE)
+ print("Call as")
+ print(sys.argv[0], "/path/to/libxml2-doc-dir")
+ sys.exit(0 if len(sys.argv) > 1 else 1)  # 0 for explicit --help, 1 for missing argument
+
+HTML_DIR = os.path.join(sys.argv[1], 'html')
+os.stat(HTML_DIR) # raise an error if we can't find it
+
+sys.path.insert(0, 'src')
+from lxml import etree
+
+# map enum name to Python variable name and alignment for constant name
+ENUM_MAP = {
+ 'xmlErrorLevel' : ('__ERROR_LEVELS', 'XML_ERR_'),
+ 'xmlErrorDomain' : ('__ERROR_DOMAINS', 'XML_FROM_'),
+ 'xmlParserErrors' : ('__PARSER_ERROR_TYPES', 'XML_'),
+# 'xmlXPathError' : ('__XPATH_ERROR_TYPES', ''),
+# 'xmlSchemaValidError' : ('__XMLSCHEMA_ERROR_TYPES', 'XML_'),
+ 'xmlRelaxNGValidErr' : ('__RELAXNG_ERROR_TYPES', 'XML_'),
+ }
+
+ENUM_ORDER = (
+ 'xmlErrorLevel',
+ 'xmlErrorDomain',
+ 'xmlParserErrors',
+# 'xmlXPathError',
+# 'xmlSchemaValidError',
+ 'xmlRelaxNGValidErr')
+
+COMMENT = """
+# This section is generated by the script '%s'.
+
+""" % os.path.basename(sys.argv[0])
+
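+# Both generated files are expected to contain marker comments of the form
+#   # ... BEGIN: GENERATED CONSTANTS ...
+#   # ... END: GENERATED CONSTANTS ...
+# split() keeps everything outside the markers so that only the generated
+# block in between is rewritten by regenerate_file().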
+def split(lines):
+ lines = iter(lines)
+ pre = []
+ for line in lines:
+ pre.append(line)
+ if line.startswith('#') and "BEGIN: GENERATED CONSTANTS" in line:
+ break
+ pre.append('')
+ for line in lines:
+ if line.startswith('#') and "END: GENERATED CONSTANTS" in line:
+ break
+ post = ['', line]
+ post.extend(lines)
+ post.append('')
+ return pre, post
+
+def regenerate_file(filename, result):
+ # read .pxi source file
+ f = codecs.open(filename, 'r', encoding="utf-8")
+ pre, post = split(f)
+ f.close()
+
+ # write .pxi source file
+ f = codecs.open(filename, 'w', encoding="utf-8")
+ f.write(''.join(pre))
+ f.write(COMMENT)
+ f.write('\n'.join(result))
+ f.write(''.join(post))
+ f.close()
+
+collect_text = etree.XPath("string()")
+find_enums = etree.XPath(
+ "//html:pre[@class = 'programlisting' and contains(text(), 'Enum')]",
+ namespaces = {'html' : 'http://www.w3.org/1999/xhtml'})
+
+def parse_enums(html_dir, html_filename, enum_dict):
+ PARSE_ENUM_NAME = re.compile(r'\s*enum\s+(\w+)\s*{', re.I).match
+ PARSE_ENUM_VALUE = re.compile(r'\s*=\s+([0-9]+)\s*(?::\s*(.*))?').match
+ tree = etree.parse(os.path.join(html_dir, html_filename))
+ enums = find_enums(tree)
+ for enum in enums:
+ enum_name = PARSE_ENUM_NAME(collect_text(enum))
+ if not enum_name:
+ continue
+ enum_name = enum_name.group(1)
+ if enum_name not in ENUM_MAP:
+ continue
+ print("Found enum", enum_name)
+ entries = []
+ for child in enum:
+ name = child.text
+ match = PARSE_ENUM_VALUE(child.tail)
+ if not match:
+ print("Ignoring enum %s (failed to parse field '%s')" % (
+ enum_name, name))
+ break
+ value, descr = match.groups()
+ entries.append((name, int(value), descr))
+ else:
+ enum_dict[enum_name] = entries
+ return enum_dict
+
+enum_dict = {}
+parse_enums(HTML_DIR, 'libxml-xmlerror.html', enum_dict)
+#parse_enums(HTML_DIR, 'libxml-xpath.html', enum_dict)
+#parse_enums(HTML_DIR, 'libxml-xmlschemas.html', enum_dict)
+parse_enums(HTML_DIR, 'libxml-relaxng.html', enum_dict)
+
+# regenerate source files
+pxi_result = []
+append_pxi = pxi_result.append
+pxd_result = []
+append_pxd = pxd_result.append
+
+append_pxd('cdef extern from "libxml/xmlerror.h":')
+
+ctypedef_indent = ' '*4
+constant_indent = ctypedef_indent*2
+
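+# Each enum entry produces an aligned constant line for the .pxd declaration
+# and a compact "NAME=value" line (common prefix stripped) for the .pxi string.
+# For example, the xmlParserErrors entry XML_ERR_INTERNAL_ERROR (value 1) would
+# roughly become "XML_ERR_INTERNAL_ERROR = 1" in the .pxd and
+# "ERR_INTERNAL_ERROR=1" in the .pxi block; descriptions from the parsed
+# documentation are appended as comments in the .pxd file.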
+for enum_name in ENUM_ORDER:
+ constants = enum_dict[enum_name]
+ pxi_name, prefix = ENUM_MAP[enum_name]
+
+ append_pxd(ctypedef_indent + 'ctypedef enum %s:' % enum_name)
+ append_pxi('cdef object %s = """\\' % pxi_name)
+
+ prefix_len = len(prefix)
+ length = 2 # each string ends with '\n\0'
+ for name, val, descr in constants:
+ if descr and descr != str(val):
+ line = '%-50s = %7d # %s' % (name, val, descr)
+ else:
+ line = '%-50s = %7d' % (name, val)
+ append_pxd(constant_indent + line)
+
+ if name[:prefix_len] == prefix and len(name) > prefix_len:
+ name = name[prefix_len:]
+ line = '%s=%d' % (name, val)
+ append_pxi(line)
+ length += len(line) + 2 # + '\n\0'
+
+ append_pxd('')
+ append_pxi('"""')
+ append_pxi('')
+
+# write source files
+print("Updating file %s" % BUILD_SOURCE_FILE)
+regenerate_file(BUILD_SOURCE_FILE, pxi_result)
+
+print("Updating file %s" % BUILD_DEF_FILE)
+regenerate_file(BUILD_DEF_FILE, pxd_result)
+
+print("Done")
diff --git a/valgrind-python.supp b/valgrind-python.supp
new file mode 100644
index 0000000..4c5050d
--- /dev/null
+++ b/valgrind-python.supp
@@ -0,0 +1,480 @@
+#
+# This is a valgrind suppression file that should be used when using valgrind.
+#
+# Here's an example of running valgrind:
+#
+# cd python/dist/src
+# valgrind --tool=memcheck --suppressions=Misc/valgrind-python.supp \
+# ./python -E ./Lib/test/regrtest.py -u gui,network
+#
+# You must edit Objects/obmalloc.c and uncomment Py_USING_MEMORY_DEBUGGER
+# to use the preferred suppressions with address_in_range.
+#
+# If you do not want to recompile Python, you can uncomment
+# suppressions for _PyObject_Free and _PyObject_Realloc.
+#
+# See Misc/README.valgrind for more information.
+
+# all tool names: Addrcheck,Memcheck,cachegrind,helgrind,massif
+{
+ ADDRESS_IN_RANGE/Invalid read of size 4
+ Memcheck:Addr4
+ fun:address_in_range
+}
+
+{
+ ADDRESS_IN_RANGE/Invalid read of size 4
+ Memcheck:Value4
+ fun:address_in_range
+}
+
+{
+ ADDRESS_IN_RANGE/Invalid read of size 8 (x86_64 aka amd64)
+ Memcheck:Value8
+ fun:address_in_range
+}
+
+{
+ ADDRESS_IN_RANGE/Conditional jump or move depends on uninitialised value
+ Memcheck:Cond
+ fun:address_in_range
+}
+
+#
+# Leaks (including possible leaks)
+# Hmmm, I wonder if this masks some real leaks. I think it does.
+# Will need to fix that.
+#
+
+{
+ Suppress leaking the GIL. Happens once per process, see comment in ceval.c.
+ Memcheck:Leak
+ fun:malloc
+ fun:PyThread_allocate_lock
+ fun:PyEval_InitThreads
+}
+
+{
+ Suppress leaking the GIL after a fork.
+ Memcheck:Leak
+ fun:malloc
+ fun:PyThread_allocate_lock
+ fun:PyEval_ReInitThreads
+}
+
+{
+ Suppress leaking the autoTLSkey. This looks like it shouldn't leak though.
+ Memcheck:Leak
+ fun:malloc
+ fun:PyThread_create_key
+ fun:_PyGILState_Init
+ fun:Py_InitializeEx
+ fun:Py_Main
+}
+
+{
+ Hmmm, is this a real leak or like the GIL?
+ Memcheck:Leak
+ fun:malloc
+ fun:PyThread_ReInitTLS
+}
+
+{
+ Handle PyMalloc confusing valgrind (possibly leaked)
+ Memcheck:Leak
+ fun:realloc
+ fun:_PyObject_GC_Resize
+ fun:COMMENT_THIS_LINE_TO_DISABLE_LEAK_WARNING
+}
+
+{
+ Handle PyMalloc confusing valgrind (possibly leaked)
+ Memcheck:Leak
+ fun:malloc
+ fun:_PyObject_GC_New
+ fun:COMMENT_THIS_LINE_TO_DISABLE_LEAK_WARNING
+}
+
+{
+ Handle PyMalloc confusing valgrind (possibly leaked)
+ Memcheck:Leak
+ fun:malloc
+ fun:_PyObject_GC_NewVar
+ fun:COMMENT_THIS_LINE_TO_DISABLE_LEAK_WARNING
+}
+
+#
+# Non-python specific leaks
+#
+
+{
+ Handle pthread issue (possibly leaked)
+ Memcheck:Leak
+ fun:calloc
+ fun:allocate_dtv
+ fun:_dl_allocate_tls_storage
+ fun:_dl_allocate_tls
+}
+
+{
+ Handle pthread issue (possibly leaked)
+ Memcheck:Leak
+ fun:memalign
+ fun:_dl_allocate_tls_storage
+ fun:_dl_allocate_tls
+}
+
+{
+ ADDRESS_IN_RANGE/Invalid read of size 4
+ Memcheck:Addr4
+ fun:_PyObject_Free
+}
+
+{
+ ADDRESS_IN_RANGE/Invalid read of size 4
+ Memcheck:Value4
+ fun:_PyObject_Free
+}
+
+{
+ ADDRESS_IN_RANGE/Use of uninitialised value of size 8
+ Memcheck:Addr8
+ fun:_PyObject_Free
+}
+
+{
+ ADDRESS_IN_RANGE/Use of uninitialised value of size 8
+ Memcheck:Value8
+ fun:_PyObject_Free
+}
+
+{
+ ADDRESS_IN_RANGE/Conditional jump or move depends on uninitialised value
+ Memcheck:Cond
+ fun:_PyObject_Free
+}
+
+{
+ ADDRESS_IN_RANGE/Invalid read of size 4
+ Memcheck:Addr4
+ fun:_PyObject_Realloc
+}
+
+{
+ ADDRESS_IN_RANGE/Invalid read of size 4
+ Memcheck:Value4
+ fun:_PyObject_Realloc
+}
+
+{
+ ADDRESS_IN_RANGE/Use of uninitialised value of size 8
+ Memcheck:Addr8
+ fun:_PyObject_Realloc
+}
+
+{
+ ADDRESS_IN_RANGE/Use of uninitialised value of size 8
+ Memcheck:Value8
+ fun:_PyObject_Realloc
+}
+
+{
+ ADDRESS_IN_RANGE/Conditional jump or move depends on uninitialised value
+ Memcheck:Cond
+ fun:_PyObject_Realloc
+}
+
+###
+### All the suppressions below are for errors that occur within libraries
+### that Python uses. The problems do not appear to be related to Python's
+### use of the libraries.
+###
+
+{
+ Generic ubuntu ld problems
+ Memcheck:Addr8
+ obj:/lib/ld-2.4.so
+ obj:/lib/ld-2.4.so
+ obj:/lib/ld-2.4.so
+ obj:/lib/ld-2.4.so
+}
+
+{
+ Generic gentoo ld problems
+ Memcheck:Cond
+ obj:/lib/ld-2.3.4.so
+ obj:/lib/ld-2.3.4.so
+ obj:/lib/ld-2.3.4.so
+ obj:/lib/ld-2.3.4.so
+}
+
+{
+ DBM problems, see test_dbm
+ Memcheck:Param
+ write(buf)
+ fun:write
+ obj:/usr/lib/libdb1.so.2
+ obj:/usr/lib/libdb1.so.2
+ obj:/usr/lib/libdb1.so.2
+ obj:/usr/lib/libdb1.so.2
+ fun:dbm_close
+}
+
+{
+ DBM problems, see test_dbm
+ Memcheck:Value8
+ fun:memmove
+ obj:/usr/lib/libdb1.so.2
+ obj:/usr/lib/libdb1.so.2
+ obj:/usr/lib/libdb1.so.2
+ obj:/usr/lib/libdb1.so.2
+ fun:dbm_store
+ fun:dbm_ass_sub
+}
+
+{
+ DBM problems, see test_dbm
+ Memcheck:Cond
+ obj:/usr/lib/libdb1.so.2
+ obj:/usr/lib/libdb1.so.2
+ obj:/usr/lib/libdb1.so.2
+ fun:dbm_store
+ fun:dbm_ass_sub
+}
+
+{
+ DBM problems, see test_dbm
+ Memcheck:Cond
+ fun:memmove
+ obj:/usr/lib/libdb1.so.2
+ obj:/usr/lib/libdb1.so.2
+ obj:/usr/lib/libdb1.so.2
+ obj:/usr/lib/libdb1.so.2
+ fun:dbm_store
+ fun:dbm_ass_sub
+}
+
+{
+ GDBM problems, see test_gdbm
+ Memcheck:Param
+ write(buf)
+ fun:write
+ fun:gdbm_open
+}
+
+{
+ ZLIB problems, see test_gzip
+ Memcheck:Cond
+ obj:/lib/libz.so.1.2.3
+ obj:/lib/libz.so.1.2.3
+ fun:deflate
+}
+
+{
+ Avoid problems w/readline doing a putenv and leaking on exit
+ Memcheck:Leak
+ fun:malloc
+ fun:xmalloc
+ fun:sh_set_lines_and_columns
+ fun:_rl_get_screen_size
+ fun:_rl_init_terminal_io
+ obj:/lib/libreadline.so.4.3
+ fun:rl_initialize
+}
+
+###
+### These occur from somewhere within the SSL, when running
+### test_socket_ssl. They are too general to leave on by default.
+###
+###{
+### somewhere in SSL stuff
+### Memcheck:Cond
+### fun:memset
+###}
+###{
+### somewhere in SSL stuff
+### Memcheck:Value4
+### fun:memset
+###}
+###
+###{
+### somewhere in SSL stuff
+### Memcheck:Cond
+### fun:MD5_Update
+###}
+###
+###{
+### somewhere in SSL stuff
+### Memcheck:Value4
+### fun:MD5_Update
+###}
+
+# Fedora's package "openssl-1.0.1-0.1.beta2.fc17.x86_64" on x86_64
+# See http://bugs.python.org/issue14171
+{
+ openssl 1.0.1 prng 1
+ Memcheck:Cond
+ fun:bcmp
+ fun:fips_get_entropy
+ fun:FIPS_drbg_instantiate
+ fun:RAND_init_fips
+ fun:OPENSSL_init_library
+ fun:SSL_library_init
+ fun:init_hashlib
+}
+
+{
+ openssl 1.0.1 prng 2
+ Memcheck:Cond
+ fun:fips_get_entropy
+ fun:FIPS_drbg_instantiate
+ fun:RAND_init_fips
+ fun:OPENSSL_init_library
+ fun:SSL_library_init
+ fun:init_hashlib
+}
+
+{
+ openssl 1.0.1 prng 3
+ Memcheck:Value8
+ fun:_x86_64_AES_encrypt_compact
+ fun:AES_encrypt
+}
+
+#
+# All of these problems come from using test_socket_ssl
+#
+{
+ from test_socket_ssl
+ Memcheck:Cond
+ fun:BN_bin2bn
+}
+
+{
+ from test_socket_ssl
+ Memcheck:Cond
+ fun:BN_num_bits_word
+}
+
+{
+ from test_socket_ssl
+ Memcheck:Value4
+ fun:BN_num_bits_word
+}
+
+{
+ from test_socket_ssl
+ Memcheck:Cond
+ fun:BN_mod_exp_mont_word
+}
+
+{
+ from test_socket_ssl
+ Memcheck:Cond
+ fun:BN_mod_exp_mont
+}
+
+{
+ from test_socket_ssl
+ Memcheck:Param
+ write(buf)
+ fun:write
+ obj:/usr/lib/libcrypto.so.0.9.7
+}
+
+{
+ from test_socket_ssl
+ Memcheck:Cond
+ fun:RSA_verify
+}
+
+{
+ from test_socket_ssl
+ Memcheck:Value4
+ fun:RSA_verify
+}
+
+{
+ from test_socket_ssl
+ Memcheck:Value4
+ fun:DES_set_key_unchecked
+}
+
+{
+ from test_socket_ssl
+ Memcheck:Value4
+ fun:DES_encrypt2
+}
+
+{
+ from test_socket_ssl
+ Memcheck:Cond
+ obj:/usr/lib/libssl.so.0.9.7
+}
+
+{
+ from test_socket_ssl
+ Memcheck:Value4
+ obj:/usr/lib/libssl.so.0.9.7
+}
+
+{
+ from test_socket_ssl
+ Memcheck:Cond
+ fun:BUF_MEM_grow_clean
+}
+
+{
+ from test_socket_ssl
+ Memcheck:Cond
+ fun:memcpy
+ fun:ssl3_read_bytes
+}
+
+{
+ from test_socket_ssl
+ Memcheck:Cond
+ fun:SHA1_Update
+}
+
+{
+ from test_socket_ssl
+ Memcheck:Value4
+ fun:SHA1_Update
+}
+
+{
+ test_buffer_non_debug
+ Memcheck:Addr4
+ fun:PyUnicodeUCS2_FSConverter
+}
+
+{
+ test_buffer_non_debug
+ Memcheck:Addr4
+ fun:PyUnicode_FSConverter
+}
+
+{
+ wcscmp_false_positive
+ Memcheck:Addr8
+ fun:wcscmp
+ fun:_PyOS_GetOpt
+ fun:Py_Main
+ fun:main
+}
+
+# Additional suppressions for the unified decimal tests:
+{
+ test_decimal
+ Memcheck:Addr4
+ fun:PyUnicodeUCS2_FSConverter
+}
+
+{
+ test_decimal2
+ Memcheck:Addr4
+ fun:PyUnicode_FSConverter
+}
+
diff --git a/versioninfo.py b/versioninfo.py
new file mode 100644
index 0000000..34c273f
--- /dev/null
+++ b/versioninfo.py
@@ -0,0 +1,81 @@
+import io
+import os
+import re
+import sys
+
+__LXML_VERSION = None
+
+
+def version():
+ global __LXML_VERSION
+ if __LXML_VERSION is None:
+ with open(os.path.join(get_base_dir(), 'src', 'lxml', '__init__.py')) as f:
+ __LXML_VERSION = re.search(r'__version__\s*=\s*"([^"]+)"', f.read(250)).group(1)
+ assert __LXML_VERSION
+ return __LXML_VERSION
+
+
+def branch_version():
+ return version()[:3]
+
+
+def is_pre_release():
+ version_string = version()
+ return "a" in version_string or "b" in version_string
+
+
+def dev_status():
+ _version = version()
+ if 'a' in _version:
+ return 'Development Status :: 3 - Alpha'
+ elif 'b' in _version or 'c' in _version:
+ return 'Development Status :: 4 - Beta'
+ else:
+ return 'Development Status :: 5 - Production/Stable'
+
+
+def changes():
+ """Extract part of changelog pertaining to version.
+ """
+ _version = version()
+ with io.open(os.path.join(get_base_dir(), "CHANGES.txt"), 'r', encoding='utf8') as f:
+ lines = []
+ for line in f:
+ if line.startswith('====='):
+ if len(lines) > 1:
+ break
+ if lines:
+ lines.append(line)
+ elif line.startswith(_version):
+ lines.append(line)
+ return ''.join(lines[:-1])
+
+
+def create_version_h():
+ """Create lxml-version.h
+ """
+ lxml_version = version()
+ # make sure we have a triple part version number
+ parts = lxml_version.split('-')
+ while parts[0].count('.') < 2:
+ parts[0] += '.0'
+ lxml_version = '-'.join(parts).replace('a', '.alpha').replace('b', '.beta')
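+ # e.g. "4.5" becomes "4.5.0" and "4.5.0a1" becomes "4.5.0.alpha1"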
+
+ file_path = os.path.join(get_base_dir(), 'src', 'lxml', 'includes', 'lxml-version.h')
+
+ # Avoid changing file timestamp if content didn't change.
+ if os.path.isfile(file_path):
+ with open(file_path, 'r') as version_h:
+ if ('"%s"' % lxml_version) in version_h.read(100):
+ return
+
+ with open(file_path, 'w') as version_h:
+ version_h.write('''\
+#ifndef LXML_VERSION_STRING
+#define LXML_VERSION_STRING "%s"
+#endif
+''' % lxml_version)
+
+
+def get_base_dir():
+ return os.path.abspath(os.path.dirname(sys.argv[0]))