请转到http://iMath.cnblogs.com

■■■■■■■■■■本博客打算放弃,请转到http://iMath.cnblogs.com

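Python自带的html、xml解析器和lxml,跟BeautifulSoup比起来,感觉还是BeautifulSoup比较宽松好用、容易入门,下面这些就是例证:示例XML里flashvars中的URL含有未转义的“&”,所以标准库的ElementTree、minidom以及lxml都会报解析错误;html.escape会把“<”和“>”也一并转义,结果连起始标签都找不到;只有先用replace把“&”换成“&amp;”才能解析成功(注意这样也会改动CDATA里的内容,因为CDATA中的“&amp;”不会被还原成“&”)。Python通过BeautifulSoup获取CData的方法看这里http://stackoverflow.com/questions/2032172/how-can-i-grab-cdata-out-of-beautifulsoup/41364528#41364528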


>>> import xml.etree.ElementTree as ElementTree

>>> xmlstring = '''

<ckplayer>

    <flashvars>

{lv->0}{v->80}{e->0}{p->1}{q->start}{h->3}{f->http://www.vipjiexi.com/api.php?url=fV51p8qrobgO4P0hA/CDlizUV9BAS7fu_A0OmtsfCoaCydcITVGG3SBZllJUIEPQYIAOHORWuX_rbjY0v0a4IgtyCiicwCdRLXo9l1Q&[$pat]}{a->hd=3}{defa->hd=1|hd=2|hd=3|hd=4}{deft->标清|高清|超清|原画}

</flashvars>

    <video>

        <file>

            <![CDATA[

http://wgdcnccdn.inter.qiyi.com/videos/v0/20160412/17/d6/720751061cd521d59cb2878cb50732f0.mp4?key=00b542c96c1c885045b2d094a549eec83&dis_k=1c8a22706d5da8228d3b715720a7a3fc&dis_t=1482761545&src=iqiyi.com&v=1033870602&qd_sc=0d65cf15bb30b80960b2852937a2171a&qd_tm=1482761545000&qd_ip=42.123.83.108&qd_src=app&mbd=20210202f5452f04d6d63114a334b6e8_5.7.0_from_nebula=0&qyid=264612700_32&uuid=7a72985c-58612549-c6

]]>

        </file>

    </video>

</ckplayer>

'''

>>> ElementTree.fromstring(xmlstring)

Traceback (most recent call last):

  File "<pyshell#37>", line 1, in <module>

    ElementTree.fromstring(xmlstring)

  File "C:\Users\i\AppData\Local\Programs\Python\Python35-32\lib\xml\etree\ElementTree.py", line 1333, in XML

    parser.feed(text)

  File "<string>", line None

xml.etree.ElementTree.ParseError: not well-formed (invalid token): line 4, column 186

>>> from lxml import etree

>>> etree.fromstring(xmlstring)

Traceback (most recent call last):

  File "<pyshell#39>", line 1, in <module>

    etree.fromstring(xmlstring)

  File "src\lxml\lxml.etree.pyx", line 3213, in lxml.etree.fromstring (src\lxml\lxml.etree.c:78990)

  File "src\lxml\parser.pxi", line 1848, in lxml.etree._parseMemoryDocument (src\lxml\lxml.etree.c:118321)

  File "src\lxml\parser.pxi", line 1729, in lxml.etree._parseDoc (src\lxml\lxml.etree.c:116879)

  File "src\lxml\parser.pxi", line 1063, in lxml.etree._BaseParser._parseUnicodeDoc (src\lxml\lxml.etree.c:110866)

  File "src\lxml\parser.pxi", line 595, in lxml.etree._ParserContext._handleParseResultDoc (src\lxml\lxml.etree.c:105089)

  File "src\lxml\parser.pxi", line 706, in lxml.etree._handleParseResult (src\lxml\lxml.etree.c:106797)

  File "src\lxml\parser.pxi", line 635, in lxml.etree._raiseParseError (src\lxml\lxml.etree.c:105651)

  File "<string>", line 4

lxml.etree.XMLSyntaxError: xmlParseEntityRef: no name, line 4, column 187

>>> import html

>>> etree.fromstring(html.escape(xmlstring))

Traceback (most recent call last):

  File "<pyshell#41>", line 1, in <module>

    etree.fromstring(html.escape(xmlstring))

  File "src\lxml\lxml.etree.pyx", line 3213, in lxml.etree.fromstring (src\lxml\lxml.etree.c:78990)

  File "src\lxml\parser.pxi", line 1848, in lxml.etree._parseMemoryDocument (src\lxml\lxml.etree.c:118321)

  File "src\lxml\parser.pxi", line 1729, in lxml.etree._parseDoc (src\lxml\lxml.etree.c:116879)

  File "src\lxml\parser.pxi", line 1063, in lxml.etree._BaseParser._parseUnicodeDoc (src\lxml\lxml.etree.c:110866)

  File "src\lxml\parser.pxi", line 595, in lxml.etree._ParserContext._handleParseResultDoc (src\lxml\lxml.etree.c:105089)

  File "src\lxml\parser.pxi", line 706, in lxml.etree._handleParseResult (src\lxml\lxml.etree.c:106797)

  File "src\lxml\parser.pxi", line 635, in lxml.etree._raiseParseError (src\lxml\lxml.etree.c:105651)

  File "<string>", line 2

lxml.etree.XMLSyntaxError: Start tag expected, '<' not found, line 2, column 1

>>> etree.fromstring(html.escape(xmlstring))

Traceback (most recent call last):

  File "<pyshell#42>", line 1, in <module>

    etree.fromstring(html.escape(xmlstring))

  File "src\lxml\lxml.etree.pyx", line 3213, in lxml.etree.fromstring (src\lxml\lxml.etree.c:78990)

  File "src\lxml\parser.pxi", line 1848, in lxml.etree._parseMemoryDocument (src\lxml\lxml.etree.c:118321)

  File "src\lxml\parser.pxi", line 1729, in lxml.etree._parseDoc (src\lxml\lxml.etree.c:116879)

  File "src\lxml\parser.pxi", line 1063, in lxml.etree._BaseParser._parseUnicodeDoc (src\lxml\lxml.etree.c:110866)

  File "src\lxml\parser.pxi", line 595, in lxml.etree._ParserContext._handleParseResultDoc (src\lxml\lxml.etree.c:105089)

  File "src\lxml\parser.pxi", line 706, in lxml.etree._handleParseResult (src\lxml\lxml.etree.c:106797)

  File "src\lxml\parser.pxi", line 635, in lxml.etree._raiseParseError (src\lxml\lxml.etree.c:105651)

  File "<string>", line 2

lxml.etree.XMLSyntaxError: Start tag expected, '<' not found, line 2, column 1

>>> etree.fromstring(xmlstring.replace('&','&amp;'))

<Element ckplayer at 0x24ed378>

>>> 

>>> from xml.dom import minidom

>>> minidom.parseString(xmlstring)

Traceback (most recent call last):

  File "<pyshell#51>", line 1, in <module>

    minidom.parseString(xmlstring)

  File "C:\Users\i\AppData\Local\Programs\Python\Python35-32\lib\xml\dom\minidom.py", line 1968, in parseString

    return expatbuilder.parseString(string)

  File "C:\Users\i\AppData\Local\Programs\Python\Python35-32\lib\xml\dom\expatbuilder.py", line 925, in parseString

    return builder.parseString(string)

  File "C:\Users\i\AppData\Local\Programs\Python\Python35-32\lib\xml\dom\expatbuilder.py", line 223, in parseString

    parser.Parse(string, True)

xml.parsers.expat.ExpatError: not well-formed (invalid token): line 4, column 186

>>> 


评论

© 请转到http://iMath.cnblogs.com | Powered by LOFTER