""" Manipulate HTML or XHTML documents. Version 1.1.1. This source code has been placed in the public domain by Connelly Barnes. Features: - Translate HTML back and forth to data structures. This allows you to read and write HTML documents programmably, with much flexibility. - Extract and modify URLs in an HTML document. - Compatible with Python 3+ See the L{examples} for a quick start. Moved to Python3 by Jack Thomson May 2020 """ __version__ = '1.1.2' __all__ = ['examples', 'tagextract', 'tagjoin', 'urlextract', 'urljoin', 'URLMatch'] # ------------------------------------------------------------------- # Globals # ------------------------------------------------------------------- import re import shlex import string import urllib.request, urllib.parse, urllib.error import urllib.parse import types # Translate text between these strings as plain text (not HTML). _IGNORE_TAGS = [('script', '/script'), ('style', '/style')] # Special tags where we have to look for _END_X as part of the # HTML/XHTML parsing rules. _BEGIN_COMMENT = '' _BEGIN_CDATA = '' # Mime types that can be parsed as HTML or HTML-like. _HTML_MIMETYPES = ['text/html', 'application/xhtml', 'application/xhtml+xml', 'text/xml', 'application/xml'] # Mime types that can be parsed as CSS. _CSS_MIMETYPES = ['text/css'] # ------------------------------------------------------------------- # HTML <-> Data structure # ------------------------------------------------------------------- def tagextract(doc): """ Convert HTML to data structure. Returns a list. HTML tags become C{(name, keyword_dict)} tuples within the list, while plain text becomes strings within the list. All tag names are lowercased and stripped of whitespace. Tags which end with forward slashes have a single forward slash placed at the end of their name, to indicate that they are XML unclosed tags. Example: >>> tagextract('

foo

') [('img', {'src': 'hi.gif', 'alt': 'hi'}), 'foo', ('br', {}), ('br/', {}), ('/body', {})] Text between C{'') [('script', {'type': 'a'}), 'var x; ', ('/script', {})] Comment strings and XML directives are rendered as a single long tag with no attributes. The case of the tag "name" is not changed: >>> tagextract('') [('!-- blah --', {})] >>> tagextract('') [('?xml version="1.0" encoding="utf-8" ?', {})] >>> tagextract('') [('!DOCTYPE html PUBLIC etc...', {})] Greater-than and less-than characters occurring inside comments or CDATA blocks are correctly kept as part of the block: >>> tagextract('') [('!-- <><><><>>..> --', {})] >>> tagextract('<>><>]<> ]]>') [('!CDATA[[><>><>]<> ]]', {})] Note that if one modifies these tags, it is important to retain the C{"--"} (for comments) or C{"]]"} (for C{CDATA}) at the end of the tag name, so that output from L{tagjoin} will be correct HTML/XHTML. """ L = _full_tag_extract(doc) for i in range(len(L)): if isinstance(L[i], _TextTag): # _TextTag object. L[i] = L[i].text else: # _HTMLTag object. L[i] = (L[i].name, L[i].attrs) return L def _is_str(s): """ True iff s is a string (checks via duck typing). """ return hasattr(s, 'capitalize') def tagjoin(L): """ Convert data structure back to HTML. This reverses the L{tagextract} function. More precisely, if an HTML string is turned into a data structure, then back into HTML, the resulting string will be functionally equivalent to the original HTML. >>> tagjoin(tagextract(s)) (string that is functionally equivalent to s) Three changes are made to the HTML by L{tagjoin}: tags are lowercased, C{key=value} pairs are sorted, and values are placed in double-quotes. """ if _is_str(L): raise ValueError('got string arg, expected non-string iterable') ans = [] for item in L: # Check for string using duck typing. if _is_str(item): # Handle plain text. ans.append(item) elif item[0] == '--': # Handle closing comment. ans.append('-->') elif item[0] == '!--': # Handle opening comment. ans.append('' + ... ' end') [' blah', '', '', ' ', '', 'end'] """ s_lower = s.lower() L = [] i = 0 # Index of char being processed while i < len(s): c = s[i] if c == '<': # Left bracket, handle various cases. if s[i:i + len(_BEGIN_COMMENT)].startswith(_BEGIN_COMMENT): # HTML begin comment tag, ''. i2 = s.find(_END_COMMENT, i) if i2 < 0: # No '-->'. Append the remaining malformed content and stop. L.append(s[i:]) break else: # Append the comment. L.append(s[i:i2 + len(_END_COMMENT)]) i = i2 + len(_END_COMMENT) elif s[i:i + len(_BEGIN_CDATA)].startswith(_BEGIN_CDATA): # XHTML begin CDATA tag. Scan for ']]>'. i2 = s.find(_END_CDATA, i) if i2 < 0: # No ']]>'. Append the remaining malformed content and stop. L.append(s[i:]) break else: # Append the CDATA. L.append(s[i:i2 + len(_END_CDATA)]) i = i2 + len(_END_CDATA) else: # Regular HTML tag. Scan for '>'. orig_i = i found = False in_quot1 = False in_quot2 = False for i2 in range(i + 1, len(s)): c2 = s[i2] if c2 == '"' and not in_quot1: in_quot2 = not in_quot2 # Only turn on double quote if it's in a realistic place. if in_quot2 and not in_quot1: if i2 > 0 and s[i2 - 1] not in [' ', '\t', '=']: in_quot2 = False elif c2 == "'" and not in_quot2: in_quot1 = not in_quot1 # Only turn on single quote if it's in a realistic place. if in_quot1 and not in_quot2: if i2 > 0 and s[i2 - 1] not in [' ', '\t', '=']: in_quot1 = False elif c2 == '>' and (not in_quot2 and not in_quot1): found = True break if not found: # No end '>'. Append the rest as text. L.append(s[i:]) break else: # Append the tag. L.append(s[i:i2 + 1]) i = i2 + 1 # Check whether we found a special ignore tag, eg '') doc3 = f('\r\t< html >< tag> ' + '') doc4 = f('' + ' ] ][]]>' + '') # ----------------------------------------------------------------- # Test _html_split() # ----------------------------------------------------------------- s = doc1 assert s == f('').join(_html_split(s)) assert _html_split(s) == f( ['\n\n', '', '', 'Hi', '