""" Manipulate HTML or XHTML documents. Version 1.1.1. This source code has been placed in the public domain by Connelly Barnes. Features: - Translate HTML back and forth to data structures. This allows you to read and write HTML documents programmably, with much flexibility. - Extract and modify URLs in an HTML document. - Compatible with Python 3+ See the L{examples} for a quick start. Moved to Python3 by Jack Thomson May 2020 """ __version__ = '1.1.2' __all__ = ['examples', 'tagextract', 'tagjoin', 'urlextract', 'urljoin', 'URLMatch'] # ------------------------------------------------------------------- # Globals # ------------------------------------------------------------------- import re import shlex import string import urllib.request, urllib.parse, urllib.error import urllib.parse import types # Translate text between these strings as plain text (not HTML). _IGNORE_TAGS = [('script', '/script'), ('style', '/style')] # Special tags where we have to look for _END_X as part of the # HTML/XHTML parsing rules. _BEGIN_COMMENT = '' _BEGIN_CDATA = '' # Mime types that can be parsed as HTML or HTML-like. _HTML_MIMETYPES = ['text/html', 'application/xhtml', 'application/xhtml+xml', 'text/xml', 'application/xml'] # Mime types that can be parsed as CSS. _CSS_MIMETYPES = ['text/css'] # ------------------------------------------------------------------- # HTML <-> Data structure # ------------------------------------------------------------------- def tagextract(doc): """ Convert HTML to data structure. Returns a list. HTML tags become C{(name, keyword_dict)} tuples within the list, while plain text becomes strings within the list. All tag names are lowercased and stripped of whitespace. Tags which end with forward slashes have a single forward slash placed at the end of their name, to indicate that they are XML unclosed tags. Example: >>> tagextract('hifoo

') [('img', {'src': 'hi.gif', 'alt': 'hi'}), 'foo', ('br', {}), ('br/', {}), ('/body', {})] Text between C{'') [('script', {'type': 'a'}), 'var x; ', ('/script', {})] Comment strings and XML directives are rendered as a single long tag with no attributes. The case of the tag "name" is not changed: >>> tagextract('') [('!-- blah --', {})] >>> tagextract('') [('?xml version="1.0" encoding="utf-8" ?', {})] >>> tagextract('') [('!DOCTYPE html PUBLIC etc...', {})] Greater-than and less-than characters occurring inside comments or CDATA blocks are correctly kept as part of the block: >>> tagextract('') [('!-- <><><><>>..> --', {})] >>> tagextract('<>><>]<> ]]>') [('!CDATA[[><>><>]<> ]]', {})] Note that if one modifies these tags, it is important to retain the C{"--"} (for comments) or C{"]]"} (for C{CDATA}) at the end of the tag name, so that output from L{tagjoin} will be correct HTML/XHTML. """ L = _full_tag_extract(doc) for i in range(len(L)): if isinstance(L[i], _TextTag): # _TextTag object. L[i] = L[i].text else: # _HTMLTag object. L[i] = (L[i].name, L[i].attrs) return L def _is_str(s): """ True iff s is a string (checks via duck typing). """ return hasattr(s, 'capitalize') def tagjoin(L): """ Convert data structure back to HTML. This reverses the L{tagextract} function. More precisely, if an HTML string is turned into a data structure, then back into HTML, the resulting string will be functionally equivalent to the original HTML. >>> tagjoin(tagextract(s)) (string that is functionally equivalent to s) Three changes are made to the HTML by L{tagjoin}: tags are lowercased, C{key=value} pairs are sorted, and values are placed in double-quotes. """ if _is_str(L): raise ValueError('got string arg, expected non-string iterable') ans = [] for item in L: # Check for string using duck typing. if _is_str(item): # Handle plain text. ans.append(item) elif item[0] == '--': # Handle closing comment. ans.append('-->') elif item[0] == '!--': # Handle opening comment. 
ans.append('' + ... ' end') [' blah', '', '', ' ', '', 'end'] """ s_lower = s.lower() L = [] i = 0 # Index of char being processed while i < len(s): c = s[i] if c == '<': # Left bracket, handle various cases. if s[i:i + len(_BEGIN_COMMENT)].startswith(_BEGIN_COMMENT): # HTML begin comment tag, ''. i2 = s.find(_END_COMMENT, i) if i2 < 0: # No '-->'. Append the remaining malformed content and stop. L.append(s[i:]) break else: # Append the comment. L.append(s[i:i2 + len(_END_COMMENT)]) i = i2 + len(_END_COMMENT) elif s[i:i + len(_BEGIN_CDATA)].startswith(_BEGIN_CDATA): # XHTML begin CDATA tag. Scan for ']]>'. i2 = s.find(_END_CDATA, i) if i2 < 0: # No ']]>'. Append the remaining malformed content and stop. L.append(s[i:]) break else: # Append the CDATA. L.append(s[i:i2 + len(_END_CDATA)]) i = i2 + len(_END_CDATA) else: # Regular HTML tag. Scan for '>'. orig_i = i found = False in_quot1 = False in_quot2 = False for i2 in range(i + 1, len(s)): c2 = s[i2] if c2 == '"' and not in_quot1: in_quot2 = not in_quot2 # Only turn on double quote if it's in a realistic place. if in_quot2 and not in_quot1: if i2 > 0 and s[i2 - 1] not in [' ', '\t', '=']: in_quot2 = False elif c2 == "'" and not in_quot2: in_quot1 = not in_quot1 # Only turn on single quote if it's in a realistic place. if in_quot1 and not in_quot2: if i2 > 0 and s[i2 - 1] not in [' ', '\t', '=']: in_quot1 = False elif c2 == '>' and (not in_quot2 and not in_quot1): found = True break if not found: # No end '>'. Append the rest as text. L.append(s[i:]) break else: # Append the tag. L.append(s[i:i2 + 1]) i = i2 + 1 # Check whether we found a special ignore tag, eg '') doc3 = f('\r\t< html >< tag> ' + '') doc4 = f('' + ' ] ][]]>' + '') # ----------------------------------------------------------------- # Test _html_split() # ----------------------------------------------------------------- s = doc1 assert s == f('').join(_html_split(s)) assert _html_split(s) == f( ['\n\n', '', '', 'Hi', '

', 'Ho', '

', '
', '
', '', '', '', '', '', '\nBye!\n']) s = doc2 assert s == f('').join(_html_split(s)) # Test single quotes s = doc2.replace(f('"'), f("'")) assert s == f('').join(_html_split(s)) s = f('

Header' + '

') assert s == f('').join(_html_split(s)) assert _html_split(s) == f( ['', ' ', '

', 'Header', '

']) s = f(' blah ok whata') assert s == f('').join(_html_split(s)) assert _html_split(s) == f( ['', ' blah ok ', '', ' what', '', '', 'a']) s = f('! -' + '') assert s == f('').join(_html_split(s)) assert _html_split(s) == f( ['', '!', '', '', ' ', '', ' ', '', ' ', '', '-', '', '']) s = doc4 assert s == f('').join(_html_split(s)) assert _html_split(s) == f( ['', '', '', '', ' ] ][]]>', '', '', '') == [('a', {'href': 'a.png', 'size': '10'})]) s = doc1 s2 = doc1.replace(f('"'), f("'")) # Test single quotes, too. assert tagextract(f('')) == [] assert tagextract(s) == tagextract(s2) == \ f(['\n\n', ('html', {}), ('body', {'bgcolor': '#ffffff'}), 'Hi', ('h1', {}), 'Ho', ('/h1', {}), ('br', {}), ('br/', {}), ('img', {'src': 'text%5f.gif'}), ('tag', {'noshow': None}), ('img/', {'test': '5%ff'}), ('/body', {}), ('/html', {}), '\nBye!\n']) s2 = f('\n\nHi

Ho


' + '
' + '\nBye!\n') assert tagjoin(tagextract(s)) == s2 doc2old = doc2 doc2 = f('\r' + '' + 'end ' + '') assert doc2old == doc2 s = doc2 assert tagextract(s) == f( ['\r', ('html', {}), ('!-- Comment
--', {}), ('hiya', {}), ('foo', {}), ('test', {'content': '6', 'tag': '5'}), ('is', {'broken': 'False'}), ('yay', {}), ('style', {}), '<><>><', ('/style', {}), ('foo', {'bar': '5'}), 'end', ('!-- !_-', {}), ('/script', {})]) assert tagjoin(tagextract(s)) == f( '\rend ' + '') s = doc5 assert tagextract(s) == f( [('a', {'href':'foobar/ \t=', 'base':'10', 'x':'15'}), ('a', {'x':'9', 't':'20'})]) assert tagjoin(tagextract(s)) == f( '') # ----------------------------------------------------------------- # Test _full_tag_extract() # ----------------------------------------------------------------- for s in [doc1, doc2, doc3, doc1.replace(f('"'), f("'")), doc2.replace(f('"'), f("'")), doc3.replace(f('"'), f("'"))]: L = _full_tag_extract(s) for (i, item) in _enumerate(L): if isinstance(item, _HTMLTag): for key in list(item.attrs.keys()): assert s[item.key_pos[key][0]:item.key_pos[key][1]].lower()\ == key if item.attrs[key] != None: assert s[item.value_pos[key][0]:item.value_pos[key][1]] \ == item.attrs[key] n = 1000 doc4 = f('') doc4 *= n L = tagextract(doc4) assert len(L) == n for i in range(n): assert L[i] == f([('tag/', {'name':'5', 'value':'6afdjherknc4 cdk j', 'a':'7', 'b':'8'})])[0] # ----------------------------------------------------------------- # Test tagextract() and tagjoin() with XML directives. # ----------------------------------------------------------------- doc1 = f( 'a' + 'bc' + '' + 'zrx' + 'tt') doc1join = f( 'abczrxtt') ans1 = f( ['a', ('?xml version="1.0"?', {}), 'b', ('!DOCTYPE html' + 'PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"' + '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"', {}), 'c', ('html', {'a':'b'}), ('!-- Comment <><> hi! 
def _test_urlextract(str_class=str):
    """
    Unit tests for L{urlextract} and L{urljoin}.

    Every fixture and expected value is passed through
    C{_cast_to_str} together with C{str_class}, so the same assertions
    can be run against a chosen string class.

    NOTE(review): C{doc2}, C{doc5} and the expected L{urljoin} result
    below contain no HTML tags or attributes in this copy of the file,
    yet the assertions expect URLs such as C{'a.gif'} to be extracted
    from them.  The markup appears to have been stripped at some
    point; confirm these fixtures against the upstream source before
    relying on this test.

    @param str_class: string class that fixtures are cast to.
    @raise AssertionError: if any check fails.
    """

    def f(obj):
        # Cast a fixture to the string class under test.
        return _cast_to_str(obj, str_class)

    doc1 = f('urlblah, url ( blah2, url( blah3) url(blah4) ' +
             'url("blah5") hum("blah6") url)"blah7"( url ( " blah8 " );;')
    doc2 = f('b' +
             'http://www.ignore.us/' +
             '\nhttp://www.nowhere.com c')
    doc3 = f('@import foo;\n@import bar\n@import url(\'foo2\');' +
             '@import url(\'http://bar2\')\n@import\turl("foo!");' +
             '@import \'foo3\'\n@import "bar3";\n@importfails;' +
             '@import;@import\n;url(\'howdy!\')\n@import foo5 ;' +
             '@import \'foo6\' \n@import "foo7";')
    doc4 = f('@import foo handheld;\n@import \'bar\' handheld\n' +
             '@import url(\'foo2\') handheld; @import url(bar2) ha\n' +
             '@import url("foo3") handheld\n')
    doc5 = f('b' +
             '')
    doc6 = doc2.replace(f('"'), f("'"))  # Test single quotes, too.

    # Test CSS.
    s = doc1
    matches = urlextract(s, mimetype='text/css')
    urls = [m.url for m in matches]
    assert urls == f([' blah3', 'blah4', 'blah5', ' blah8 '])
    # Each match's [start:end) span must slice out exactly its URL.
    assert all(s[m.start:m.end] == m.url for m in matches)

    # Test CSS more.
    s = doc3
    matches = urlextract(s, mimetype='text/css')
    urls = [m.url for m in matches]
    assert urls == f(['foo', 'bar', 'foo2', 'http://bar2', 'foo!',
                      'foo3', 'bar3', 'howdy!', 'foo5', 'foo6', 'foo7'])
    assert all(s[m.start:m.end] == m.url for m in matches)

    # Test CSS even more.
    s = doc4
    matches = urlextract(s, mimetype='text/css')
    urls = [m.url for m in matches]
    assert urls == f(['foo', 'bar', 'foo2', 'bar2', 'foo3'])
    assert all(s[m.start:m.end] == m.url for m in matches)

    # Test HTML.
    s = doc2
    matches = urlextract(s)
    urls = [m.url for m in matches]
    urls_single_quoted = [m.url for m in urlextract(doc6)]
    ans = f(['a.gif', 'b.html', './c.png',
             'http://www.abc.edu/d.tga', 'h.gif',
             'http://www.testdomain.com/', 'a.gif', '/i.png'])
    assert urls == urls_single_quoted == ans
    for m in matches:
        assert s[m.start:m.end] == m.url

    # Test HTML more: the same document repeated n times must yield the
    # same URLs n times over, with correct offsets throughout.
    n = 100
    s2 = s * n
    repeated_matches = urlextract(s2)
    repeated_urls = [m.url for m in repeated_matches]
    assert repeated_urls == urls * n
    for m in repeated_matches:
        assert s2[m.start:m.end] == m.url

    # Test HTML w/ siteurl.
    base = f('http://www.python.org/~guido/')
    matches = urlextract(s, base)
    urls = [m.url for m in matches]
    assert urls == [urllib.parse.urljoin(base, x) for x in ans]

    # Test urljoin(): unmodified matches must round-trip the document.
    assert urljoin(doc1, urlextract(doc1, mimetype='text/css')) == doc1
    assert urljoin(doc2, urlextract(doc2)) == doc2

    s = doc2
    matches = urlextract(s)
    matches[3].url = f('FOO')
    matches[5].url = f('BAR')
    matches[7].url = f('F00!')
    assert urljoin(s, matches) == f(
        'b' +
        '' +
        'http://www.ignore.us/\nhttp://www.nowhere.com ' +
        '' +
        'c')

    # Test HTML yet more.
    s = doc5
    matches = urlextract(s)
    urls = [m.url for m in matches]
    assert urls == f(['foo', 'a.gif', 'bar.css', 'b.html'])
    assert all(s[m.start:m.end] == m.url for m in matches)


# -------------------------------------------------------------------
# Unit Test Main Routine
# -------------------------------------------------------------------

def _test():
    """
    Unit test main routine.

    Runs each unit-test function in turn and prints an "OK" line after
    it returns.  A failing test raises C{AssertionError}, which is not
    caught here, so the run stops at the first failure.

    The "(unicode)" runs pass C{str} explicitly.  For
    L{_test_urlextract} that is the same as the default argument, so
    that run repeats the previous one.
    """
    print('Unit tests:')
    _test_remove_comments()
    print(' _remove_comments: OK')
    _test_shlex_split()
    print(' _shlex_split: OK')
    _test_tag_dict()
    print(' _tag_dict: OK')
    _test_tuple_replace()
    print(' _tuple_replace: OK')

    _test_tagextract()
    print(' tagextract*: OK')

    _test_tagextract(str)
    print(' tagextract (unicode)*: OK')

    _test_urlextract()
    print(' urlextract*: OK')

    _test_urlextract(str)
    print(' urlextract (unicode)*: OK')

    print()
    print('* The corresponding join method has been tested as well.')


if __name__ == '__main__':
    _test()