lxml.tests.test_unicode
Package lxml :: Package tests :: Module test_unicode
[hide private]
[frames | no frames]

Source Code for Module lxml.tests.test_unicode

# -*- coding: utf-8 -*-
import unittest
import sys
import os.path

# Put this file's directory on sys.path so the sibling ``common_imports``
# module imported below can be resolved.
this_dir = os.path.dirname(__file__)
if this_dir not in sys.path:
    sys.path.insert(0, this_dir)  # needed for Py3

from common_imports import StringIO, etree, SillyFileLike, HelperTestCase
from common_imports import _str, _bytes, _chr

# Fall back to ``str`` when the ``unicode`` builtin does not exist.
try:
    unicode
except NameError:
    unicode = str

# Shared fixtures.  They are built by decoding byte strings so that the same
# source text yields unicode strings regardless of how the interpreter
# treats plain string literals.
ascii_uni = _bytes('a').decode('utf8')

klingon = _bytes("\\uF8D2").decode("unicode_escape") # not valid for XML names

# An otherwise plain tag name made invalid by the trailing character above.
invalid_tag = _bytes("test").decode('utf8') + klingon

uni = _bytes('\\xc3\\u0680\\u3120').decode("unicode_escape") # some non-ASCII characters

# A small document with non-ASCII text; only referenced by the disabled
# file-object test in this view of the module.
uxml = _bytes("<test><title>test \\xc3\\xa1\\u3120</title><h1>page \\xc3\\xa1\\u3120 title</h1></test>"
              ).decode("unicode_escape")
 28   
 29   
class UnicodeTestCase(HelperTestCase):
    """Tests for unicode handling in tags, attributes, text and parsing.

    Method-level notes are written as ``#`` comments rather than docstrings
    so that unittest keeps reporting tests by their method names.
    """

    def test__str(self):
        # test the testing framework, namely _str from common_imports
        self.assertEqual(_str('\x10'), _str('\u0010'))
        self.assertEqual(_str('\x10'), _str('\U00000010'))
        self.assertEqual(_str('\u1234'), _str('\U00001234'))

    def test_unicode_xml(self):
        # Non-ASCII text must survive a parse round trip unchanged.
        tree = etree.XML('<p>%s</p>' % uni)
        self.assertEqual(uni, tree.text)

    def test_wide_unicode_xml(self):
        # 1114111 == 0x10FFFF: interpreters with a smaller maxunicode cannot
        # hold the character below as a single code point.
        if sys.maxunicode < 1114111:
            return # skip test
        tree = etree.XML(_bytes('<p>\\U00026007</p>').decode('unicode_escape'))
        self.assertEqual(1, len(tree.text))
        self.assertEqual(_bytes('\\U00026007').decode('unicode_escape'),
                         tree.text)

    def test_unicode_xml_broken(self):
        # A unicode string that carries an encoding declaration is rejected.
        uxml = ('<?xml version="1.0" encoding="UTF-8"?>' +
                '<p>%s</p>' % uni)
        self.assertRaises(ValueError, etree.XML, uxml)

    def test_unicode_tag(self):
        el = etree.Element(uni)
        self.assertEqual(uni, el.tag)

    def test_unicode_tag_invalid(self):
        # sadly, Klingon is not well-formed
        self.assertRaises(ValueError, etree.Element, invalid_tag)

    def test_unicode_nstag(self):
        tag = "{http://abc/}%s" % uni
        el = etree.Element(tag)
        self.assertEqual(tag, el.tag)

    def test_unicode_ns_invalid(self):
        # namespace URIs must conform to RFC 3986
        tag = "{http://%s/}abc" % uni
        self.assertRaises(ValueError, etree.Element, tag)

    def test_unicode_nstag_invalid(self):
        # sadly, Klingon is not well-formed
        tag = "{http://abc/}%s" % invalid_tag
        self.assertRaises(ValueError, etree.Element, tag)

    def test_unicode_qname(self):
        qname = etree.QName(uni, uni)
        tag = "{%s}%s" % (uni, uni)
        self.assertEqual(qname.text, tag)
        self.assertEqual(unicode(qname), tag)

    def test_unicode_qname_invalid(self):
        self.assertRaises(ValueError, etree.QName, invalid_tag)

    def test_unicode_attr(self):
        el = etree.Element('foo', {'bar': uni})
        self.assertEqual(uni, el.attrib['bar'])

    def test_unicode_comment(self):
        el = etree.Comment(uni)
        self.assertEqual(uni, el.text)

    def test_unicode_repr1(self):
        x = etree.Element(_str('å'))
        # must not raise UnicodeEncodeError
        repr(x)

    def test_unicode_repr2(self):
        x = etree.Comment(_str('ö'))
        repr(x)

    def test_unicode_repr3(self):
        x = etree.ProcessingInstruction(_str('Å'), _str('\u0131'))
        repr(x)

    def test_unicode_repr4(self):
        x = etree.Entity(_str('ä'))
        repr(x)

    def test_unicode_text(self):
        e = etree.Element('e')

        def settext(text):
            # Wrap the attribute assignment so assertRaises can call it.
            e.text = text

        self.assertRaises(ValueError, settext, _str('ab\ufffe'))
        # '\uffff', not '\ffff': the latter is a form feed followed by the
        # letters "fff" and never exercised the U+FFFF case.
        self.assertRaises(ValueError, settext, _str('ö\uffff'))
        self.assertRaises(ValueError, settext, _str('\u0123\ud800'))
        self.assertRaises(ValueError, settext, _str('x\ud8ff'))
        self.assertRaises(ValueError, settext, _str('\U00010000\udfff'))
        self.assertRaises(ValueError, settext, _str('abd\x00def'))
        # should not Raise
        settext(_str('\ud7ff\ue000\U00010000\U0010FFFFäöas'))

        # Every code point in 0xD800..0xDFFF must be rejected wherever it
        # appears in the string.
        for char_val in range(0xD800, 0xDFFF+1):
            self.assertRaises(ValueError, settext, 'abc' + _chr(char_val))
            self.assertRaises(ValueError, settext, _chr(char_val))
            self.assertRaises(ValueError, settext, _chr(char_val) + 'abc')

        # Byte strings: non-ASCII bytes and control characters are rejected.
        self.assertRaises(ValueError, settext, _bytes('\xe4'))
        self.assertRaises(ValueError, settext, _bytes('\x80'))
        self.assertRaises(ValueError, settext, _bytes('\xff'))
        self.assertRaises(ValueError, settext, _bytes('\x08'))
        self.assertRaises(ValueError, settext, _bytes('\x19'))
        self.assertRaises(ValueError, settext, _bytes('\x20\x00'))
        # should not Raise
        settext(_bytes('\x09\x0A\x0D\x20\x60\x7f'))

    def test_uniname(self):
        Element = etree.Element
        def el(name):
            return Element(name)

        self.assertRaises(ValueError, el, ':')
        self.assertRaises(ValueError, el, '0a')
        self.assertRaises(ValueError, el, _str('\u203f'))
        # should not Raise
        el(_str('\u0132'))

    def test_unicode_parse_stringio(self):
        el = etree.parse(StringIO('<p>%s</p>' % uni)).getroot()
        self.assertEqual(uni, el.text)

##     def test_parse_fileobject_unicode(self):
##         # parse unicode from unnamed file object (not supported by ElementTree)
##         f = SillyFileLike(uxml)
##         root = etree.parse(f).getroot()
##         self.assertEqual(unicode(etree.tostring(root, 'UTF-8'), 'UTF-8'),
##                          uxml)
165 -class EncodingsTestCase(HelperTestCase):
166 - def test_illegal_utf8(self):
167 data = _bytes('<test>\x80\x80\x80</test>', encoding='iso8859-1') 168 self.assertRaises(etree.XMLSyntaxError, etree.fromstring, data)
169
171 data = _bytes('<test>\x80\x80\x80</test>', encoding='iso8859-1') 172 parser = etree.XMLParser(recover=True) 173 self.assertRaises(etree.XMLSyntaxError, etree.fromstring, data, parser)
174
175 - def _test_encoding(self, encoding, xml_encoding_name=None):
176 foo = """<?xml version='1.0' encoding='%s'?>\n<tag attrib='123'></tag>""" % ( 177 xml_encoding_name or encoding) 178 root = etree.fromstring(foo.encode(encoding)) 179 self.assertEqual('tag', root.tag) 180 181 doc_encoding = root.getroottree().docinfo.encoding 182 self.assertTrue( 183 doc_encoding.lower().rstrip('lbe'), 184 (xml_encoding_name or encoding).lower().rstrip('lbe'))
185
186 - def test_utf8_fromstring(self):
187 self._test_encoding('utf-8')
188
189 - def test_utf8sig_fromstring(self):
190 self._test_encoding('utf_8_sig', 'utf-8')
191
192 - def test_utf16_fromstring(self):
193 self._test_encoding('utf-16')
194
195 - def test_utf16LE_fromstring(self):
196 self._test_encoding('utf-16le', 'utf-16')
197
198 - def test_utf16BE_fromstring(self):
199 self._test_encoding('utf-16be', 'utf-16')
200
201 - def test_utf32_fromstring(self):
202 self._test_encoding('utf-32', 'utf-32')
203
204 - def test_utf32LE_fromstring(self):
205 self._test_encoding('utf-32le', 'utf-32')
206
207 - def test_utf32BE_fromstring(self):
208 self._test_encoding('utf-32be', 'utf-32')
209 210
def test_suite():
    """Return a suite holding every test case class defined in this module.

    Uses the default test loader instead of ``unittest.makeSuite``, which is
    deprecated and no longer available in recent Python versions.
    """
    load_tests_from = unittest.defaultTestLoader.loadTestsFromTestCase
    suite = unittest.TestSuite()
    suite.addTests([load_tests_from(UnicodeTestCase)])
    suite.addTests([load_tests_from(EncodingsTestCase)])
    return suite
216