html5lib · Mic92 · Sep 9, 2024 · Sep 9, 2024 · Sep 9, 2024 · Sep 9, 2024
diff --git a/.appveyor.yml b/.appveyor.yml
diff --git a/.github/workflows/python-tox.yml b/.github/workflows/python-tox.yml
@@ -12,9 +12,6 @@ jobs:
         os: [ubuntu-latest, windows-latest]
         deps: [base, optional]
         include:
-          - python: "pypy-2.7"
-            os: ubuntu-latest
-            deps: base
           - python: "pypy-3.10"
             os: ubuntu-latest
             deps: base

diff --git a/README.rst b/README.rst
@@ -29,7 +29,7 @@ or:
 
 By default, the ``document`` will be an ``xml.etree`` element instance.
 Whenever possible, html5lib chooses the accelerated ``ElementTree``
-implementation (i.e. ``xml.etree.cElementTree`` on Python 2.x).
+implementation.
 
 Two other tree types are supported: ``xml.dom.minidom`` and
 ``lxml.etree``. To use an alternative format, specify the name of
@@ -41,18 +41,6 @@ a treebuilder:
   with open("mydocument.html", "rb") as f:
       lxml_etree_document = html5lib.parse(f, treebuilder="lxml")
 
-When using with ``urllib2`` (Python 2), the charset from HTTP should be
-pass into html5lib as follows:
-
-.. code-block:: python
-
-  from contextlib import closing
-  from urllib2 import urlopen
-  import html5lib
-
-  with closing(urlopen("http://example.com/")) as f:
-      document = html5lib.parse(f, transport_encoding=f.info().getparam("charset"))
-
 When using with ``urllib.request`` (Python 3), the charset from HTTP
 should be pass into html5lib as follows:
 
@@ -90,7 +78,7 @@ More documentation is available at https://html5lib.readthedocs.io/.
 Installation
 ------------
 
-html5lib works on CPython 2.7+, CPython 3.5+ and PyPy. To install:
+html5lib works on CPython 3.8+ and PyPy. To install:
 
 .. code-block:: bash
 

diff --git a/debug-info.py b/debug-info.py
@@ -1,4 +1,3 @@
-from __future__ import print_function, unicode_literals
 
 import platform
 import sys
@@ -12,7 +11,7 @@
     "maxsize": sys.maxsize
 }
 
-search_modules = ["chardet", "genshi", "html5lib", "lxml", "six"]
+search_modules = ["chardet", "genshi", "html5lib", "lxml"]
 found_modules = []
 
 for m in search_modules:

diff --git a/doc/conf.py b/doc/conf.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python3
-# -*- coding: utf-8 -*-
 #
 # html5lib documentation build configuration file, created by
 # sphinx-quickstart on Wed May  8 00:04:49 2013.
@@ -100,7 +99,7 @@
 }
 
 
-class CExtMock(object):
+class CExtMock:
     """Required for autodoc on readthedocs.org where you cannot build C extensions."""
     def __init__(self, *args, **kwargs):
         pass

diff --git a/html5lib/__init__.py b/html5lib/__init__.py
@@ -20,7 +20,6 @@
 * :func:`~.serializer.serialize`
 """
 
-from __future__ import absolute_import, division, unicode_literals
 
 from .html5parser import HTMLParser, parse, parseFragment
 from .treebuilders import getTreeBuilder

diff --git a/html5lib/_ihatexml.py b/html5lib/_ihatexml.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
 
 import re
 import warnings
@@ -181,7 +180,7 @@ def escapeRegexp(string):
 nonPubidCharRegexp = re.compile("[^\x20\x0D\x0Aa-zA-Z0-9\\-'()+,./:=?;!*#@$_%]")
 
 
-class InfosetFilter(object):
+class InfosetFilter:
     replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
 
     def __init__(self,

diff --git a/html5lib/_inputstream.py b/html5lib/_inputstream.py
@@ -1,7 +1,6 @@
-from __future__ import absolute_import, division, unicode_literals
 
-from six import text_type
-from six.moves import http_client, urllib
+import http.client
+import urllib.response
 
 import codecs
 import re
@@ -48,7 +47,7 @@
 charsUntilRegEx = {}
 
 
-class BufferedStream(object):
+class BufferedStream:
     """Buffering for streams that do not have buffering of their own
 
     The buffer is implemented as a list of chunks on the assumption that
@@ -125,10 +124,10 @@ def _readFromBuffer(self, bytes):
 def HTMLInputStream(source, **kwargs):
     # Work around Python bug #20007: read(0) closes the connection.
     # http://bugs.python.org/issue20007
-    if (isinstance(source, http_client.HTTPResponse) or
+    if (isinstance(source, http.client.HTTPResponse) or
         # Also check for addinfourl wrapping HTTPResponse
         (isinstance(source, urllib.response.addbase) and
-         isinstance(source.fp, http_client.HTTPResponse))):
+         isinstance(source.fp, http.client.HTTPResponse))):
         isUnicode = False
     elif hasattr(source, "read"):
         isUnicode = isinstance(source.read(0), text_type)
@@ -145,7 +144,7 @@ def HTMLInputStream(source, **kwargs):
         return HTMLBinaryInputStream(source, **kwargs)
 
 
-class HTMLUnicodeInputStream(object):
+class HTMLUnicodeInputStream:
     """Provides a unicode stream of characters to the HTMLTokenizer.
 
     This class takes care of character encoding and removing or replacing
@@ -673,7 +672,7 @@ def jumpTo(self, bytes):
         return True
 
 
-class EncodingParser(object):
+class EncodingParser:
     """Mini parser for detecting character encoding from meta elements"""
 
     def __init__(self, data):
@@ -861,7 +860,7 @@ def getAttribute(self):
                 attrValue.append(c)
 
 
-class ContentAttrParser(object):
+class ContentAttrParser:
     def __init__(self, data):
         assert isinstance(data, bytes)
         self.data = data

diff --git a/html5lib/_tokenizer.py b/html5lib/_tokenizer.py
@@ -1,6 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
-
-from six import unichr as chr
 
 from collections import deque, OrderedDict
 from sys import version_info
@@ -24,7 +21,7 @@
     attributeMap = OrderedDict
 
 
-class HTMLTokenizer(object):
+class HTMLTokenizer:
     """ This class takes care of tokenizing HTML.
 
     * self.currentToken

diff --git a/html5lib/_trie/__init__.py b/html5lib/_trie/__init__.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
 
 from .py import Trie
 

diff --git a/html5lib/_trie/_base.py b/html5lib/_trie/_base.py
@@ -1,9 +1,5 @@
-from __future__ import absolute_import, division, unicode_literals
 
-try:
-    from collections.abc import Mapping
-except ImportError:  # Python 2.7
-    from collections import Mapping
+from collections.abc import Mapping
 
 
 class Trie(Mapping):

diff --git a/html5lib/_trie/py.py b/html5lib/_trie/py.py
@@ -1,14 +1,11 @@
-from __future__ import absolute_import, division, unicode_literals
-from six import text_type
-
 from bisect import bisect_left
 
 from ._base import Trie as ABCTrie
 
 
 class Trie(ABCTrie):
     def __init__(self, data):
-        if not all(isinstance(x, text_type) for x in data.keys()):
+        if not all(isinstance(x, str) for x in data.keys()):
             raise TypeError("All keys must be strings")
 
         self._data = data

diff --git a/html5lib/_utils.py b/html5lib/_utils.py
@@ -1,21 +1,9 @@
-from __future__ import absolute_import, division, unicode_literals
 
 from types import ModuleType
 
-try:
-    from collections.abc import Mapping
-except ImportError:
-    from collections import Mapping
-
-from six import text_type, PY3
+from collections.abc import Mapping
 
-if PY3:
-    import xml.etree.ElementTree as default_etree
-else:
-    try:
-        import xml.etree.cElementTree as default_etree
-    except ImportError:
-        import xml.etree.ElementTree as default_etree
+import xml.etree.ElementTree as default_etree
 
 
 __all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
@@ -31,10 +19,10 @@
 # escapes.
 try:
     _x = eval('"\\uD800"')  # pylint:disable=eval-used
-    if not isinstance(_x, text_type):
+    if not isinstance(_x, str):
         # We need this with u"" because of http://bugs.jython.org/issue2039
         _x = eval('u"\\uD800"')  # pylint:disable=eval-used
-        assert isinstance(_x, text_type)
+        assert isinstance(_x, str)
 except Exception:
     supports_lone_surrogates = False
 else:
@@ -122,7 +110,7 @@ def moduleFactoryFactory(factory):
     moduleCache = {}
 
     def moduleFactory(baseModule, *args, **kwargs):
-        if isinstance(ModuleType.__name__, type("")):
+        if isinstance(ModuleType.__name__, str):
             name = "_%s_factory" % baseModule.__name__
         else:
             name = b"_%s_factory" % baseModule.__name__

diff --git a/html5lib/constants.py b/html5lib/constants.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
 
 import string
 

diff --git a/html5lib/filters/alphabeticalattributes.py b/html5lib/filters/alphabeticalattributes.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
 
 from . import base
 

diff --git a/html5lib/filters/base.py b/html5lib/filters/base.py
@@ -1,7 +1,6 @@
-from __future__ import absolute_import, division, unicode_literals
 
 
-class Filter(object):
+class Filter:
     def __init__(self, source):
         self.source = source
 

diff --git a/html5lib/filters/inject_meta_charset.py b/html5lib/filters/inject_meta_charset.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
 
 from . import base
 

diff --git a/html5lib/filters/lint.py b/html5lib/filters/lint.py
@@ -1,6 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
-
-from six import text_type
 
 from . import base
 from ..constants import namespaces, voidElements
@@ -33,9 +30,9 @@ def __iter__(self):
             if type in ("StartTag", "EmptyTag"):
                 namespace = token["namespace"]
                 name = token["name"]
-                assert namespace is None or isinstance(namespace, text_type)
+                assert namespace is None or isinstance(namespace, str)
                 assert namespace != ""
-                assert isinstance(name, text_type)
+                assert isinstance(name, str)
                 assert name != ""
                 assert isinstance(token["data"], dict)
                 if (not namespace or namespace == namespaces["html"]) and name in voidElements:
@@ -45,18 +42,18 @@ def __iter__(self):
                 if type == "StartTag" and self.require_matching_tags:
                     open_elements.append((namespace, name))
                 for (namespace, name), value in token["data"].items():
-                    assert namespace is None or isinstance(namespace, text_type)
+                    assert namespace is None or isinstance(namespace, str)
                     assert namespace != ""
-                    assert isinstance(name, text_type)
+                    assert isinstance(name, str)
                     assert name != ""
-                    assert isinstance(value, text_type)
+                    assert isinstance(value, str)
 
             elif type == "EndTag":
                 namespace = token["namespace"]
                 name = token["name"]
-                assert namespace is None or isinstance(namespace, text_type)
+                assert namespace is None or isinstance(namespace, str)
                 assert namespace != ""
-                assert isinstance(name, text_type)
+                assert isinstance(name, str)
                 assert name != ""
                 if (not namespace or namespace == namespaces["html"]) and name in voidElements:
                     assert False, "Void element reported as EndTag token: %(tag)s" % {"tag": name}
@@ -66,26 +63,26 @@ def __iter__(self):
 
             elif type == "Comment":
                 data = token["data"]
-                assert isinstance(data, text_type)
+                assert isinstance(data, str)
 
             elif type in ("Characters", "SpaceCharacters"):
                 data = token["data"]
-                assert isinstance(data, text_type)
+                assert isinstance(data, str)
                 assert data != ""
                 if type == "SpaceCharacters":
                     assert data.strip(spaceCharacters) == ""
 
             elif type == "Doctype":
                 name = token["name"]
-                assert name is None or isinstance(name, text_type)
-                assert token["publicId"] is None or isinstance(name, text_type)
-                assert token["systemId"] is None or isinstance(name, text_type)
+                assert name is None or isinstance(name, str)
+                assert token["publicId"] is None or isinstance(name, str)
+                assert token["systemId"] is None or isinstance(name, str)
 
             elif type == "Entity":
-                assert isinstance(token["name"], text_type)
+                assert isinstance(token["name"], str)
 
             elif type == "SerializerError":
-                assert isinstance(token["data"], text_type)
+                assert isinstance(token["data"], str)
 
             else:
                 assert False, "Unknown token type: %(type)s" % {"type": type}

diff --git a/html5lib/filters/optionaltags.py b/html5lib/filters/optionaltags.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import, division, unicode_literals
 
 from . import base