Import python venv for stability
This commit is contained in:
WebServer/AIPython/python/lib/python3.11/site-packages/_cffi_backend.cpython-311-x86_64-linux-gnu.so
Executable
BIN
Binary file not shown.
@@ -0,0 +1,239 @@
|
||||
# don't import any costly modules
|
||||
import os
|
||||
import sys
|
||||
|
||||
report_url = (
|
||||
"https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml"
|
||||
)
|
||||
|
||||
|
||||
def warn_distutils_present():
|
||||
if 'distutils' not in sys.modules:
|
||||
return
|
||||
import warnings
|
||||
|
||||
warnings.warn(
|
||||
"Distutils was imported before Setuptools, but importing Setuptools "
|
||||
"also replaces the `distutils` module in `sys.modules`. This may lead "
|
||||
"to undesirable behaviors or errors. To avoid these issues, avoid "
|
||||
"using distutils directly, ensure that setuptools is installed in the "
|
||||
"traditional way (e.g. not an editable install), and/or make sure "
|
||||
"that setuptools is always imported before distutils."
|
||||
)
|
||||
|
||||
|
||||
def clear_distutils():
|
||||
if 'distutils' not in sys.modules:
|
||||
return
|
||||
import warnings
|
||||
|
||||
warnings.warn(
|
||||
"Setuptools is replacing distutils. Support for replacing "
|
||||
"an already imported distutils is deprecated. In the future, "
|
||||
"this condition will fail. "
|
||||
f"Register concerns at {report_url}"
|
||||
)
|
||||
mods = [
|
||||
name
|
||||
for name in sys.modules
|
||||
if name == "distutils" or name.startswith("distutils.")
|
||||
]
|
||||
for name in mods:
|
||||
del sys.modules[name]
|
||||
|
||||
|
||||
def enabled():
|
||||
"""
|
||||
Allow selection of distutils by environment variable.
|
||||
"""
|
||||
which = os.environ.get('SETUPTOOLS_USE_DISTUTILS', 'local')
|
||||
if which == 'stdlib':
|
||||
import warnings
|
||||
|
||||
warnings.warn(
|
||||
"Reliance on distutils from stdlib is deprecated. Users "
|
||||
"must rely on setuptools to provide the distutils module. "
|
||||
"Avoid importing distutils or import setuptools first, "
|
||||
"and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. "
|
||||
f"Register concerns at {report_url}"
|
||||
)
|
||||
return which == 'local'
|
||||
|
||||
|
||||
def ensure_local_distutils():
|
||||
import importlib
|
||||
|
||||
clear_distutils()
|
||||
|
||||
# With the DistutilsMetaFinder in place,
|
||||
# perform an import to cause distutils to be
|
||||
# loaded from setuptools._distutils. Ref #2906.
|
||||
with shim():
|
||||
importlib.import_module('distutils')
|
||||
|
||||
# check that submodules load as expected
|
||||
core = importlib.import_module('distutils.core')
|
||||
assert '_distutils' in core.__file__, core.__file__
|
||||
assert 'setuptools._distutils.log' not in sys.modules
|
||||
|
||||
|
||||
def do_override():
|
||||
"""
|
||||
Ensure that the local copy of distutils is preferred over stdlib.
|
||||
|
||||
See https://github.com/pypa/setuptools/issues/417#issuecomment-392298401
|
||||
for more motivation.
|
||||
"""
|
||||
if enabled():
|
||||
warn_distutils_present()
|
||||
ensure_local_distutils()
|
||||
|
||||
|
||||
class _TrivialRe:
|
||||
def __init__(self, *patterns) -> None:
|
||||
self._patterns = patterns
|
||||
|
||||
def match(self, string):
|
||||
return all(pat in string for pat in self._patterns)
|
||||
|
||||
|
||||
class DistutilsMetaFinder:
|
||||
def find_spec(self, fullname, path, target=None):
|
||||
# optimization: only consider top level modules and those
|
||||
# found in the CPython test suite.
|
||||
if path is not None and not fullname.startswith('test.'):
|
||||
return None
|
||||
|
||||
method_name = 'spec_for_{fullname}'.format(**locals())
|
||||
method = getattr(self, method_name, lambda: None)
|
||||
return method()
|
||||
|
||||
def spec_for_distutils(self):
|
||||
if self.is_cpython():
|
||||
return None
|
||||
|
||||
import importlib
|
||||
import importlib.abc
|
||||
import importlib.util
|
||||
|
||||
try:
|
||||
mod = importlib.import_module('setuptools._distutils')
|
||||
except Exception:
|
||||
# There are a couple of cases where setuptools._distutils
|
||||
# may not be present:
|
||||
# - An older Setuptools without a local distutils is
|
||||
# taking precedence. Ref #2957.
|
||||
# - Path manipulation during sitecustomize removes
|
||||
# setuptools from the path but only after the hook
|
||||
# has been loaded. Ref #2980.
|
||||
# In either case, fall back to stdlib behavior.
|
||||
return None
|
||||
|
||||
class DistutilsLoader(importlib.abc.Loader):
|
||||
def create_module(self, spec):
|
||||
mod.__name__ = 'distutils'
|
||||
return mod
|
||||
|
||||
def exec_module(self, module):
|
||||
pass
|
||||
|
||||
return importlib.util.spec_from_loader(
|
||||
'distutils', DistutilsLoader(), origin=mod.__file__
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def is_cpython():
|
||||
"""
|
||||
Suppress supplying distutils for CPython (build and tests).
|
||||
Ref #2965 and #3007.
|
||||
"""
|
||||
return os.path.isfile('pybuilddir.txt')
|
||||
|
||||
def spec_for_pip(self):
|
||||
"""
|
||||
Ensure stdlib distutils when running under pip.
|
||||
See pypa/pip#8761 for rationale.
|
||||
"""
|
||||
if sys.version_info >= (3, 12) or self.pip_imported_during_build():
|
||||
return
|
||||
clear_distutils()
|
||||
self.spec_for_distutils = lambda: None
|
||||
|
||||
@classmethod
|
||||
def pip_imported_during_build(cls):
|
||||
"""
|
||||
Detect if pip is being imported in a build script. Ref #2355.
|
||||
"""
|
||||
import traceback
|
||||
|
||||
return any(
|
||||
cls.frame_file_is_setup(frame) for frame, line in traceback.walk_stack(None)
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def frame_file_is_setup(frame):
|
||||
"""
|
||||
Return True if the indicated frame suggests a setup.py file.
|
||||
"""
|
||||
# some frames may not have __file__ (#2940)
|
||||
return frame.f_globals.get('__file__', '').endswith('setup.py')
|
||||
|
||||
def spec_for_sensitive_tests(self):
|
||||
"""
|
||||
Ensure stdlib distutils when running select tests under CPython.
|
||||
|
||||
python/cpython#91169
|
||||
"""
|
||||
clear_distutils()
|
||||
self.spec_for_distutils = lambda: None
|
||||
|
||||
sensitive_tests = (
|
||||
[
|
||||
'test.test_distutils',
|
||||
'test.test_peg_generator',
|
||||
'test.test_importlib',
|
||||
]
|
||||
if sys.version_info < (3, 10)
|
||||
else [
|
||||
'test.test_distutils',
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
for name in DistutilsMetaFinder.sensitive_tests:
|
||||
setattr(
|
||||
DistutilsMetaFinder,
|
||||
f'spec_for_{name}',
|
||||
DistutilsMetaFinder.spec_for_sensitive_tests,
|
||||
)
|
||||
|
||||
|
||||
DISTUTILS_FINDER = DistutilsMetaFinder()
|
||||
|
||||
|
||||
def add_shim():
|
||||
DISTUTILS_FINDER in sys.meta_path or insert_shim()
|
||||
|
||||
|
||||
class shim:
|
||||
def __enter__(self) -> None:
|
||||
insert_shim()
|
||||
|
||||
def __exit__(self, exc: object, value: object, tb: object) -> None:
|
||||
_remove_shim()
|
||||
|
||||
|
||||
def insert_shim():
|
||||
sys.meta_path.insert(0, DISTUTILS_FINDER)
|
||||
|
||||
|
||||
def _remove_shim():
|
||||
try:
|
||||
sys.meta_path.remove(DISTUTILS_FINDER)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
|
||||
if sys.version_info < (3, 12):
|
||||
# DistutilsMetaFinder can only be disabled in Python < 3.12 (PEP 632)
|
||||
remove_shim = _remove_shim
|
||||
@@ -0,0 +1 @@
|
||||
__import__('_distutils_hack').do_override()
|
||||
+1
@@ -0,0 +1 @@
|
||||
pip
|
||||
+123
@@ -0,0 +1,123 @@
|
||||
Metadata-Version: 2.4
|
||||
Name: beautifulsoup4
|
||||
Version: 4.14.3
|
||||
Summary: Screen-scraping library
|
||||
Project-URL: Download, https://www.crummy.com/software/BeautifulSoup/bs4/download/
|
||||
Project-URL: Homepage, https://www.crummy.com/software/BeautifulSoup/bs4/
|
||||
Author-email: Leonard Richardson <leonardr@segfault.org>
|
||||
License: MIT License
|
||||
License-File: AUTHORS
|
||||
License-File: LICENSE
|
||||
Keywords: HTML,XML,parse,soup
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: License :: OSI Approved :: MIT License
|
||||
Classifier: Programming Language :: Python
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
||||
Classifier: Topic :: Text Processing :: Markup :: HTML
|
||||
Classifier: Topic :: Text Processing :: Markup :: SGML
|
||||
Classifier: Topic :: Text Processing :: Markup :: XML
|
||||
Requires-Python: >=3.7.0
|
||||
Requires-Dist: soupsieve>=1.6.1
|
||||
Requires-Dist: typing-extensions>=4.0.0
|
||||
Provides-Extra: cchardet
|
||||
Requires-Dist: cchardet; extra == 'cchardet'
|
||||
Provides-Extra: chardet
|
||||
Requires-Dist: chardet; extra == 'chardet'
|
||||
Provides-Extra: charset-normalizer
|
||||
Requires-Dist: charset-normalizer; extra == 'charset-normalizer'
|
||||
Provides-Extra: html5lib
|
||||
Requires-Dist: html5lib; extra == 'html5lib'
|
||||
Provides-Extra: lxml
|
||||
Requires-Dist: lxml; extra == 'lxml'
|
||||
Description-Content-Type: text/markdown
|
||||
|
||||
Beautiful Soup is a library that makes it easy to scrape information
|
||||
from web pages. It sits atop an HTML or XML parser, providing Pythonic
|
||||
idioms for iterating, searching, and modifying the parse tree.
|
||||
|
||||
# Quick start
|
||||
|
||||
```
|
||||
>>> from bs4 import BeautifulSoup
|
||||
>>> soup = BeautifulSoup("<p>Some<b>bad<i>HTML")
|
||||
>>> print(soup.prettify())
|
||||
<html>
|
||||
<body>
|
||||
<p>
|
||||
Some
|
||||
<b>
|
||||
bad
|
||||
<i>
|
||||
HTML
|
||||
</i>
|
||||
</b>
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
||||
>>> soup.find(string="bad")
|
||||
'bad'
|
||||
>>> soup.i
|
||||
<i>HTML</i>
|
||||
#
|
||||
>>> soup = BeautifulSoup("<tag1>Some<tag2/>bad<tag3>XML", "xml")
|
||||
#
|
||||
>>> print(soup.prettify())
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<tag1>
|
||||
Some
|
||||
<tag2/>
|
||||
bad
|
||||
<tag3>
|
||||
XML
|
||||
</tag3>
|
||||
</tag1>
|
||||
```
|
||||
|
||||
To go beyond the basics, [comprehensive documentation is available](https://www.crummy.com/software/BeautifulSoup/bs4/doc/).
|
||||
|
||||
# Links
|
||||
|
||||
* [Homepage](https://www.crummy.com/software/BeautifulSoup/bs4/)
|
||||
* [Documentation](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)
|
||||
* [Discussion group](https://groups.google.com/group/beautifulsoup/)
|
||||
* [Development](https://code.launchpad.net/beautifulsoup/)
|
||||
* [Bug tracker](https://bugs.launchpad.net/beautifulsoup/)
|
||||
* [Complete changelog](https://git.launchpad.net/beautifulsoup/tree/CHANGELOG)
|
||||
|
||||
# Note on Python 2 sunsetting
|
||||
|
||||
Beautiful Soup's support for Python 2 was discontinued on December 31,
|
||||
2020: one year after the sunset date for Python 2 itself. From this
|
||||
point onward, new Beautiful Soup development will exclusively target
|
||||
Python 3. The final release of Beautiful Soup 4 to support Python 2
|
||||
was 4.9.3.
|
||||
|
||||
# Supporting the project
|
||||
|
||||
If you use Beautiful Soup as part of your professional work, please consider a
|
||||
[Tidelift subscription](https://tidelift.com/subscription/pkg/pypi-beautifulsoup4?utm_source=pypi-beautifulsoup4&utm_medium=referral&utm_campaign=readme).
|
||||
This will support many of the free software projects your organization
|
||||
depends on, not just Beautiful Soup.
|
||||
|
||||
If you use Beautiful Soup for personal projects, the best way to say
|
||||
thank you is to read
|
||||
[Tool Safety](https://www.crummy.com/software/BeautifulSoup/zine/), a zine I
|
||||
wrote about what Beautiful Soup has taught me about software
|
||||
development.
|
||||
|
||||
# Building the documentation
|
||||
|
||||
The bs4/doc/ directory contains full documentation in Sphinx
|
||||
format. Run `make html` in that directory to create HTML
|
||||
documentation.
|
||||
|
||||
# Running the unit tests
|
||||
|
||||
Beautiful Soup supports unit test discovery using Pytest:
|
||||
|
||||
```
|
||||
$ pytest
|
||||
```
|
||||
|
||||
+37
@@ -0,0 +1,37 @@
|
||||
beautifulsoup4-4.14.3.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||
beautifulsoup4-4.14.3.dist-info/METADATA,sha256=Ac93vA8Xp9FtgOcKXFM8ESfVdztimUfJ3WUpVlhKtsY,3812
|
||||
beautifulsoup4-4.14.3.dist-info/RECORD,,
|
||||
beautifulsoup4-4.14.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
||||
beautifulsoup4-4.14.3.dist-info/licenses/AUTHORS,sha256=uYkjiRjh_aweRnF8tAW2PpJJeickE68NmJwd9siry28,2201
|
||||
beautifulsoup4-4.14.3.dist-info/licenses/LICENSE,sha256=VbTY1LHlvIbRDvrJG3TIe8t3UmsPW57a-LnNKtxzl7I,1441
|
||||
bs4/__init__.py,sha256=E7wiVp7oQK0JhdAYxpehZa8drv3W_sJv5oeTFiBfR5o,44386
|
||||
bs4/__pycache__/__init__.cpython-311.pyc,,
|
||||
bs4/__pycache__/_deprecation.cpython-311.pyc,,
|
||||
bs4/__pycache__/_typing.cpython-311.pyc,,
|
||||
bs4/__pycache__/_warnings.cpython-311.pyc,,
|
||||
bs4/__pycache__/css.cpython-311.pyc,,
|
||||
bs4/__pycache__/dammit.cpython-311.pyc,,
|
||||
bs4/__pycache__/diagnose.cpython-311.pyc,,
|
||||
bs4/__pycache__/element.cpython-311.pyc,,
|
||||
bs4/__pycache__/exceptions.cpython-311.pyc,,
|
||||
bs4/__pycache__/filter.cpython-311.pyc,,
|
||||
bs4/__pycache__/formatter.cpython-311.pyc,,
|
||||
bs4/_deprecation.py,sha256=niHJCk37APg8KEuFOa57ZXaxLdBmc_2V6uuaJqu7r30,2408
|
||||
bs4/_typing.py,sha256=zNcx7R1yCTK8WwtumP28hc7CJ3pMyZXj_VAeYaNXMZA,7549
|
||||
bs4/_warnings.py,sha256=ZuOETgcnEbZgw2N0nnNXn6wvtrn2ut7AF0d98bvkMFc,4711
|
||||
bs4/builder/__init__.py,sha256=Rl4qjOXvdyyyjayOFqbkgoUoo81IgoyKD-RwWeVK59g,31194
|
||||
bs4/builder/__pycache__/__init__.cpython-311.pyc,,
|
||||
bs4/builder/__pycache__/_html5lib.cpython-311.pyc,,
|
||||
bs4/builder/__pycache__/_htmlparser.cpython-311.pyc,,
|
||||
bs4/builder/__pycache__/_lxml.cpython-311.pyc,,
|
||||
bs4/builder/_html5lib.py,sha256=hL6xUk4_I2i5CMguFoYFlrI26cY4Dut7fOEQrUctHIM,23607
|
||||
bs4/builder/_htmlparser.py,sha256=CnULPQV2rm4vLojJABpQ7Xm9diddnEZx2Wcz_VTC1Mg,17445
|
||||
bs4/builder/_lxml.py,sha256=ks1e8boA_nOA2oomAhxeudccR6ThbEE-EllFqHRoPLA,18969
|
||||
bs4/css.py,sha256=_m_l_4SGWHnY620VJ21j_qQH1RX3p91sYVemgKxaLsM,12713
|
||||
bs4/dammit.py,sha256=ZJWa9K32X6N2imFHleqUq0ekf592weU1lvULN_WYWYk,57024
|
||||
bs4/diagnose.py,sha256=at98iuxyOrqec4V8iwkTIbNUqBCsq9Lr3fDAQx2129Y,7846
|
||||
bs4/element.py,sha256=oXmj7LG_2NpsDK90mq73q0PMK0FjFBIGSeTTJLVwwTc,120237
|
||||
bs4/exceptions.py,sha256=Q9FOadNe8QRvzDMaKSXe2Wtl8JK_oAZW7mbFZBVP_GE,951
|
||||
bs4/filter.py,sha256=rw8ZNhTDLEJVCEiSifou5tZR_3zBLeuvAyouY82qU_E,29201
|
||||
bs4/formatter.py,sha256=uBT0k6W8O5kJ9PCuJYjra97yoUqC-dlM9D_v-oRM0r8,10478
|
||||
bs4/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
+4
@@ -0,0 +1,4 @@
|
||||
Wheel-Version: 1.0
|
||||
Generator: hatchling 1.27.0
|
||||
Root-Is-Purelib: true
|
||||
Tag: py3-none-any
|
||||
+49
@@ -0,0 +1,49 @@
|
||||
Behold, mortal, the origins of Beautiful Soup...
|
||||
================================================
|
||||
|
||||
Leonard Richardson is the primary maintainer.
|
||||
|
||||
Aaron DeVore, Isaac Muse and Chris Papademetrious have made
|
||||
significant contributions to the code base.
|
||||
|
||||
Mark Pilgrim provided the encoding detection code that forms the base
|
||||
of UnicodeDammit.
|
||||
|
||||
Thomas Kluyver and Ezio Melotti finished the work of getting Beautiful
|
||||
Soup 4 working under Python 3.
|
||||
|
||||
Simon Willison wrote soupselect, which was used to make Beautiful Soup
|
||||
support CSS selectors. Isaac Muse wrote SoupSieve, which made it
|
||||
possible to _remove_ the CSS selector code from Beautiful Soup.
|
||||
|
||||
Sam Ruby helped with a lot of edge cases.
|
||||
|
||||
Jonathan Ellis was awarded the prestigious Beau Potage D'Or for his
|
||||
work in solving the nestable tags conundrum.
|
||||
|
||||
An incomplete list of people have contributed patches to Beautiful
|
||||
Soup:
|
||||
|
||||
Istvan Albert, Andrew Lin, Anthony Baxter, Oliver Beattie, Andrew
|
||||
Boyko, Tony Chang, Francisco Canas, "Delong", Zephyr Fang, Fuzzy,
|
||||
Roman Gaufman, Yoni Gilad, Richie Hindle, Toshihiro Kamiya, Peteris
|
||||
Krumins, Kent Johnson, Marek Kapolka, Andreas Kostyrka, Roel Kramer,
|
||||
Ben Last, Robert Leftwich, Stefaan Lippens, "liquider", Staffan
|
||||
Malmgren, Ksenia Marasanova, JP Moins, Adam Monsen, John Nagle, "Jon",
|
||||
Ed Oskiewicz, Martijn Peters, Greg Phillips, Giles Radford, Stefano
|
||||
Revera, Arthur Rudolph, Marko Samastur, James Salter, Jouni Seppänen,
|
||||
Alexander Schmolck, Tim Shirley, Geoffrey Sneddon, Ville Skyttä,
|
||||
"Vikas", Jens Svalgaard, Andy Theyers, Eric Weiser, Glyn Webster, John
|
||||
Wiseman, Paul Wright, Danny Yoo
|
||||
|
||||
An incomplete list of people who made suggestions or found bugs or
|
||||
found ways to break Beautiful Soup:
|
||||
|
||||
Hanno Böck, Matteo Bertini, Chris Curvey, Simon Cusack, Bruce Eckel,
|
||||
Matt Ernst, Michael Foord, Tom Harris, Bill de hOra, Donald Howes,
|
||||
Matt Patterson, Scott Roberts, Steve Strassmann, Mike Williams,
|
||||
warchild at redho dot com, Sami Kuisma, Carlos Rocha, Bob Hutchison,
|
||||
Joren Mc, Michal Migurski, John Kleven, Tim Heaney, Tripp Lilley, Ed
|
||||
Summers, Dennis Sutch, Chris Smith, Aaron Swartz, Stuart
|
||||
Turner, Greg Edwards, Kevin J Kalupson, Nikos Kouremenos, Artur de
|
||||
Sousa Rocha, Yichun Wei, Per Vognsen
|
||||
+31
@@ -0,0 +1,31 @@
|
||||
Beautiful Soup is made available under the MIT license:
|
||||
|
||||
Copyright (c) Leonard Richardson
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
|
||||
Beautiful Soup incorporates code from the html5lib library, which is
|
||||
also made available under the MIT license. Copyright (c) James Graham
|
||||
and other contributors
|
||||
|
||||
Beautiful Soup has an optional dependency on the soupsieve library,
|
||||
which is also made available under the MIT license. Copyright (c)
|
||||
Isaac Muse
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,80 @@
|
||||
"""Helper functions for deprecation.
|
||||
|
||||
This interface is itself unstable and may change without warning. Do
|
||||
not use these functions yourself, even as a joke. The underscores are
|
||||
there for a reason. No support will be given.
|
||||
|
||||
In particular, most of this will go away without warning once
|
||||
Beautiful Soup drops support for Python 3.11, since Python 3.12
|
||||
defines a `@typing.deprecated()
|
||||
decorator. <https://peps.python.org/pep-0702/>`_
|
||||
"""
|
||||
|
||||
import functools
|
||||
import warnings
|
||||
|
||||
from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
)
|
||||
|
||||
|
||||
def _deprecated_alias(old_name: str, new_name: str, version: str):
|
||||
"""Alias one attribute name to another for backward compatibility
|
||||
|
||||
:meta private:
|
||||
"""
|
||||
|
||||
@property # type:ignore
|
||||
def alias(self) -> Any:
|
||||
":meta private:"
|
||||
warnings.warn(
|
||||
f"Access to deprecated property {old_name}. (Replaced by {new_name}) -- Deprecated since version {version}.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
return getattr(self, new_name)
|
||||
|
||||
@alias.setter
|
||||
def alias(self, value: str) -> None:
|
||||
":meta private:"
|
||||
warnings.warn(
|
||||
f"Write to deprecated property {old_name}. (Replaced by {new_name}) -- Deprecated since version {version}.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
return setattr(self, new_name, value)
|
||||
|
||||
return alias
|
||||
|
||||
|
||||
def _deprecated_function_alias(
|
||||
old_name: str, new_name: str, version: str
|
||||
) -> Callable[[Any], Any]:
|
||||
def alias(self, *args: Any, **kwargs: Any) -> Any:
|
||||
":meta private:"
|
||||
warnings.warn(
|
||||
f"Call to deprecated method {old_name}. (Replaced by {new_name}) -- Deprecated since version {version}.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
return getattr(self, new_name)(*args, **kwargs)
|
||||
|
||||
return alias
|
||||
|
||||
|
||||
def _deprecated(replaced_by: str, version: str) -> Callable:
|
||||
def deprecate(func: Callable) -> Callable:
|
||||
@functools.wraps(func)
|
||||
def with_warning(*args: Any, **kwargs: Any) -> Any:
|
||||
":meta private:"
|
||||
warnings.warn(
|
||||
f"Call to deprecated method {func.__name__}. (Replaced by {replaced_by}) -- Deprecated since version {version}.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
return func(*args, **kwargs)
|
||||
|
||||
return with_warning
|
||||
|
||||
return deprecate
|
||||
@@ -0,0 +1,205 @@
|
||||
# Custom type aliases used throughout Beautiful Soup to improve readability.
|
||||
|
||||
# Notes on improvements to the type system in newer versions of Python
|
||||
# that can be used once Beautiful Soup drops support for older
|
||||
# versions:
|
||||
#
|
||||
# * ClassVar can be put on class variables now.
|
||||
# * In 3.10, x|y is an accepted shorthand for Union[x,y].
|
||||
# * In 3.10, TypeAlias gains capabilities that can be used to
|
||||
# improve the tree matching types (I don't remember what, exactly).
|
||||
# * In 3.9 it's possible to specialize the re.Match type,
|
||||
# e.g. re.Match[str]. In 3.8 there's a typing.re namespace for this,
|
||||
# but it's removed in 3.12, so to support the widest possible set of
|
||||
# versions I'm not using it.
|
||||
|
||||
from typing_extensions import (
|
||||
runtime_checkable,
|
||||
Protocol,
|
||||
TypeAlias,
|
||||
)
|
||||
from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
Dict,
|
||||
IO,
|
||||
Iterable,
|
||||
Mapping,
|
||||
Optional,
|
||||
Pattern,
|
||||
TYPE_CHECKING,
|
||||
Union,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from bs4.element import (
|
||||
AttributeValueList,
|
||||
NamespacedAttribute,
|
||||
NavigableString,
|
||||
PageElement,
|
||||
ResultSet,
|
||||
Tag,
|
||||
)
|
||||
|
||||
|
||||
@runtime_checkable
|
||||
class _RegularExpressionProtocol(Protocol):
|
||||
"""A protocol object which can accept either Python's built-in
|
||||
`re.Pattern` objects, or the similar ``Regex`` objects defined by the
|
||||
third-party ``regex`` package.
|
||||
"""
|
||||
|
||||
def search(
|
||||
self, string: str, pos: int = ..., endpos: int = ...
|
||||
) -> Optional[Any]: ...
|
||||
|
||||
@property
|
||||
def pattern(self) -> str: ...
|
||||
|
||||
|
||||
# Aliases for markup in various stages of processing.
|
||||
#
|
||||
#: The rawest form of markup: either a string, bytestring, or an open filehandle.
|
||||
_IncomingMarkup: TypeAlias = Union[str, bytes, IO[str], IO[bytes]]
|
||||
|
||||
#: Markup that is in memory but has (potentially) yet to be converted
|
||||
#: to Unicode.
|
||||
_RawMarkup: TypeAlias = Union[str, bytes]
|
||||
|
||||
# Aliases for character encodings
|
||||
#
|
||||
|
||||
#: A data encoding.
|
||||
_Encoding: TypeAlias = str
|
||||
|
||||
#: One or more data encodings.
|
||||
_Encodings: TypeAlias = Iterable[_Encoding]
|
||||
|
||||
# Aliases for XML namespaces
|
||||
#
|
||||
|
||||
#: The prefix for an XML namespace.
|
||||
_NamespacePrefix: TypeAlias = str
|
||||
|
||||
#: The URL of an XML namespace
|
||||
_NamespaceURL: TypeAlias = str
|
||||
|
||||
#: A mapping of prefixes to namespace URLs.
|
||||
_NamespaceMapping: TypeAlias = Dict[_NamespacePrefix, _NamespaceURL]
|
||||
|
||||
#: A mapping of namespace URLs to prefixes
|
||||
_InvertedNamespaceMapping: TypeAlias = Dict[_NamespaceURL, _NamespacePrefix]
|
||||
|
||||
# Aliases for the attribute values associated with HTML/XML tags.
|
||||
#
|
||||
|
||||
#: The value associated with an HTML or XML attribute. This is the
|
||||
#: relatively unprocessed value Beautiful Soup expects to come from a
|
||||
#: `TreeBuilder`.
|
||||
_RawAttributeValue: TypeAlias = str
|
||||
|
||||
#: A dictionary of names to `_RawAttributeValue` objects. This is how
|
||||
#: Beautiful Soup expects a `TreeBuilder` to represent a tag's
|
||||
#: attribute values.
|
||||
_RawAttributeValues: TypeAlias = (
|
||||
"Mapping[Union[str, NamespacedAttribute], _RawAttributeValue]"
|
||||
)
|
||||
|
||||
#: An attribute value in its final form, as stored in the
|
||||
# `Tag` class, after it has been processed and (in some cases)
|
||||
# split into a list of strings.
|
||||
_AttributeValue: TypeAlias = Union[str, "AttributeValueList"]
|
||||
|
||||
#: A dictionary of names to :py:data:`_AttributeValue` objects. This is what
|
||||
#: a tag's attributes look like after processing.
|
||||
_AttributeValues: TypeAlias = Dict[str, _AttributeValue]
|
||||
|
||||
#: The methods that deal with turning :py:data:`_RawAttributeValue` into
|
||||
#: :py:data:`_AttributeValue` may be called several times, even after the values
|
||||
#: are already processed (e.g. when cloning a tag), so they need to
|
||||
#: be able to acommodate both possibilities.
|
||||
_RawOrProcessedAttributeValues: TypeAlias = Union[_RawAttributeValues, _AttributeValues]
|
||||
|
||||
#: A number of tree manipulation methods can take either a `PageElement` or a
|
||||
#: normal Python string (which will be converted to a `NavigableString`).
|
||||
_InsertableElement: TypeAlias = Union["PageElement", str]
|
||||
|
||||
# Aliases to represent the many possibilities for matching bits of a
|
||||
# parse tree.
|
||||
#
|
||||
# This is very complicated because we're applying a formal type system
|
||||
# to some very DWIM code. The types we end up with will be the types
|
||||
# of the arguments to the SoupStrainer constructor and (more
|
||||
# familiarly to Beautiful Soup users) the find* methods.
|
||||
|
||||
#: A function that takes a PageElement and returns a yes-or-no answer.
|
||||
_PageElementMatchFunction: TypeAlias = Callable[["PageElement"], bool]
|
||||
|
||||
#: A function that takes the raw parsed ingredients of a markup tag
|
||||
#: and returns a yes-or-no answer.
|
||||
# Not necessary at the moment.
|
||||
# _AllowTagCreationFunction:TypeAlias = Callable[[Optional[str], str, Optional[_RawAttributeValues]], bool]
|
||||
|
||||
#: A function that takes the raw parsed ingredients of a markup string node
|
||||
#: and returns a yes-or-no answer.
|
||||
# Not necessary at the moment.
|
||||
# _AllowStringCreationFunction:TypeAlias = Callable[[Optional[str]], bool]
|
||||
|
||||
#: A function that takes a `Tag` and returns a yes-or-no answer.
|
||||
#: A `TagNameMatchRule` expects this kind of function, if you're
|
||||
#: going to pass it a function.
|
||||
_TagMatchFunction: TypeAlias = Callable[["Tag"], bool]
|
||||
|
||||
#: A function that takes a string (or None) and returns a yes-or-no
|
||||
#: answer. An `AttributeValueMatchRule` expects this kind of function, if
|
||||
#: you're going to pass it a function.
|
||||
_NullableStringMatchFunction: TypeAlias = Callable[[Optional[str]], bool]
|
||||
|
||||
#: A function that takes a string and returns a yes-or-no answer. A
|
||||
# `StringMatchRule` expects this kind of function, if you're going to
|
||||
# pass it a function.
|
||||
_StringMatchFunction: TypeAlias = Callable[[str], bool]
|
||||
|
||||
#: Either a tag name, an attribute value or a string can be matched
|
||||
#: against a string, bytestring, regular expression, or a boolean.
|
||||
_BaseStrainable: TypeAlias = Union[str, bytes, Pattern[str], bool]
|
||||
|
||||
#: A tag can be matched either with the `_BaseStrainable` options, or
|
||||
#: using a function that takes the `Tag` as its sole argument.
|
||||
_BaseStrainableElement: TypeAlias = Union[_BaseStrainable, _TagMatchFunction]
|
||||
|
||||
#: A tag's attribute value can be matched either with the
|
||||
#: `_BaseStrainable` options, or using a function that takes that
|
||||
#: value as its sole argument.
|
||||
_BaseStrainableAttribute: TypeAlias = Union[_BaseStrainable, _NullableStringMatchFunction]
|
||||
|
||||
#: A tag can be matched using either a single criterion or a list of
|
||||
#: criteria.
|
||||
_StrainableElement: TypeAlias = Union[
|
||||
_BaseStrainableElement, Iterable[_BaseStrainableElement]
|
||||
]
|
||||
|
||||
#: An attribute value can be matched using either a single criterion
|
||||
#: or a list of criteria.
|
||||
_StrainableAttribute: TypeAlias = Union[
|
||||
_BaseStrainableAttribute, Iterable[_BaseStrainableAttribute]
|
||||
]
|
||||
|
||||
#: An string can be matched using the same techniques as
|
||||
#: an attribute value.
|
||||
_StrainableString: TypeAlias = _StrainableAttribute
|
||||
|
||||
#: A dictionary may be used to match against multiple attribute vlaues at once.
|
||||
_StrainableAttributes: TypeAlias = Dict[str, _StrainableAttribute]
|
||||
|
||||
#: Many Beautiful soup methods return a PageElement or an ResultSet of
|
||||
#: PageElements. A PageElement is either a Tag or a NavigableString.
|
||||
#: These convenience aliases make it easier for IDE users to see which methods
|
||||
#: are available on the objects they're dealing with.
|
||||
_OneElement: TypeAlias = Union["PageElement", "Tag", "NavigableString"]
|
||||
_AtMostOneElement: TypeAlias = Optional[_OneElement]
|
||||
_AtMostOneTag: TypeAlias = Optional["Tag"]
|
||||
_AtMostOneNavigableString: TypeAlias = Optional["NavigableString"]
|
||||
_QueryResults: TypeAlias = "ResultSet[_OneElement]"
|
||||
_SomeTags: TypeAlias = "ResultSet[Tag]"
|
||||
_SomeNavigableStrings: TypeAlias = "ResultSet[NavigableString]"
|
||||
@@ -0,0 +1,98 @@
|
||||
"""Define some custom warnings."""
|
||||
|
||||
|
||||
class GuessedAtParserWarning(UserWarning):
|
||||
"""The warning issued when BeautifulSoup has to guess what parser to
|
||||
use -- probably because no parser was specified in the constructor.
|
||||
"""
|
||||
|
||||
MESSAGE: str = """No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system ("%(parser)s"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.
|
||||
|
||||
The code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features="%(parser)s"' to the BeautifulSoup constructor.
|
||||
"""
|
||||
|
||||
|
||||
class UnusualUsageWarning(UserWarning):
|
||||
"""A superclass for warnings issued when Beautiful Soup sees
|
||||
something that is typically the result of a mistake in the calling
|
||||
code, but might be intentional on the part of the user. If it is
|
||||
in fact intentional, you can filter the individual warning class
|
||||
to get rid of the warning. If you don't like Beautiful Soup
|
||||
second-guessing what you are doing, you can filter the
|
||||
UnusualUsageWarningclass itself and get rid of these entirely.
|
||||
"""
|
||||
|
||||
|
||||
class MarkupResemblesLocatorWarning(UnusualUsageWarning):
|
||||
"""The warning issued when BeautifulSoup is given 'markup' that
|
||||
actually looks like a resource locator -- a URL or a path to a file
|
||||
on disk.
|
||||
"""
|
||||
|
||||
#: :meta private:
|
||||
GENERIC_MESSAGE: str = """
|
||||
|
||||
However, if you want to parse some data that happens to look like a %(what)s, then nothing has gone wrong: you are using Beautiful Soup correctly, and this warning is spurious and can be filtered. To make this warning go away, run this code before calling the BeautifulSoup constructor:
|
||||
|
||||
from bs4 import MarkupResemblesLocatorWarning
|
||||
import warnings
|
||||
|
||||
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
|
||||
"""
|
||||
|
||||
URL_MESSAGE: str = (
|
||||
"""The input passed in on this line looks more like a URL than HTML or XML.
|
||||
|
||||
If you meant to use Beautiful Soup to parse the web page found at a certain URL, then something has gone wrong. You should use an Python package like 'requests' to fetch the content behind the URL. Once you have the content as a string, you can feed that string into Beautiful Soup."""
|
||||
+ GENERIC_MESSAGE
|
||||
)
|
||||
|
||||
FILENAME_MESSAGE: str = (
|
||||
"""The input passed in on this line looks more like a filename than HTML or XML.
|
||||
|
||||
If you meant to use Beautiful Soup to parse the contents of a file on disk, then something has gone wrong. You should open the file first, using code like this:
|
||||
|
||||
filehandle = open(your filename)
|
||||
|
||||
You can then feed the open filehandle into Beautiful Soup instead of using the filename."""
|
||||
+ GENERIC_MESSAGE
|
||||
)
|
||||
|
||||
|
||||
class AttributeResemblesVariableWarning(UnusualUsageWarning, SyntaxWarning):
|
||||
"""The warning issued when Beautiful Soup suspects a provided
|
||||
attribute name may actually be the misspelled name of a Beautiful
|
||||
Soup variable. Generally speaking, this is only used in cases like
|
||||
"_class" where it's very unlikely the user would be referencing an
|
||||
XML attribute with that name.
|
||||
"""
|
||||
|
||||
MESSAGE: str = """%(original)r is an unusual attribute name and is a common misspelling for %(autocorrect)r.
|
||||
|
||||
If you meant %(autocorrect)r, change your code to use it, and this warning will go away.
|
||||
|
||||
If you really did mean to check the %(original)r attribute, this warning is spurious and can be filtered. To make it go away, run this code before creating your BeautifulSoup object:
|
||||
|
||||
from bs4 import AttributeResemblesVariableWarning
|
||||
import warnings
|
||||
|
||||
warnings.filterwarnings("ignore", category=AttributeResemblesVariableWarning)
|
||||
"""
|
||||
|
||||
|
||||
class XMLParsedAsHTMLWarning(UnusualUsageWarning):
|
||||
"""The warning issued when an HTML parser is used to parse
|
||||
XML that is not (as far as we can tell) XHTML.
|
||||
"""
|
||||
|
||||
MESSAGE: str = """It looks like you're using an HTML parser to parse an XML document.
|
||||
|
||||
Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.
|
||||
|
||||
If you want or need to use an HTML parser on this document, you can make this warning go away by filtering it. To do that, run this code before calling the BeautifulSoup constructor:
|
||||
|
||||
from bs4 import XMLParsedAsHTMLWarning
|
||||
import warnings
|
||||
|
||||
warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)
|
||||
"""
|
||||
@@ -0,0 +1,848 @@
|
||||
from __future__ import annotations
|
||||
|
||||
# Use of this source code is governed by the MIT license.
|
||||
__license__ = "MIT"
|
||||
|
||||
from collections import defaultdict
|
||||
import re
|
||||
from types import ModuleType
|
||||
from typing import (
|
||||
Any,
|
||||
cast,
|
||||
Dict,
|
||||
Iterable,
|
||||
List,
|
||||
Optional,
|
||||
Pattern,
|
||||
Set,
|
||||
Tuple,
|
||||
Type,
|
||||
TYPE_CHECKING,
|
||||
)
|
||||
import warnings
|
||||
import sys
|
||||
from bs4.element import (
|
||||
AttributeDict,
|
||||
AttributeValueList,
|
||||
CharsetMetaAttributeValue,
|
||||
ContentMetaAttributeValue,
|
||||
RubyParenthesisString,
|
||||
RubyTextString,
|
||||
Stylesheet,
|
||||
Script,
|
||||
TemplateString,
|
||||
nonwhitespace_re,
|
||||
)
|
||||
|
||||
# Exceptions were moved to their own module in 4.13. Import here for
|
||||
# backwards compatibility.
|
||||
from bs4.exceptions import ParserRejectedMarkup
|
||||
|
||||
from bs4._typing import (
|
||||
_AttributeValues,
|
||||
_RawAttributeValue,
|
||||
)
|
||||
|
||||
from bs4._warnings import XMLParsedAsHTMLWarning
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.element import (
|
||||
NavigableString,
|
||||
Tag,
|
||||
)
|
||||
from bs4._typing import (
|
||||
_AttributeValue,
|
||||
_Encoding,
|
||||
_Encodings,
|
||||
_RawOrProcessedAttributeValues,
|
||||
_RawMarkup,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"HTMLTreeBuilder",
|
||||
"SAXTreeBuilder",
|
||||
"TreeBuilder",
|
||||
"TreeBuilderRegistry",
|
||||
]
|
||||
|
||||
# Some useful features for a TreeBuilder to have.
|
||||
FAST = "fast"
|
||||
PERMISSIVE = "permissive"
|
||||
STRICT = "strict"
|
||||
XML = "xml"
|
||||
HTML = "html"
|
||||
HTML_5 = "html5"
|
||||
|
||||
__all__ = [
|
||||
"TreeBuilderRegistry",
|
||||
"TreeBuilder",
|
||||
"HTMLTreeBuilder",
|
||||
"DetectsXMLParsedAsHTML",
|
||||
|
||||
"ParserRejectedMarkup", # backwards compatibility only as of 4.13.0
|
||||
]
|
||||
|
||||
class TreeBuilderRegistry(object):
|
||||
"""A way of looking up TreeBuilder subclasses by their name or by desired
|
||||
features.
|
||||
"""
|
||||
|
||||
builders_for_feature: Dict[str, List[Type[TreeBuilder]]]
|
||||
builders: List[Type[TreeBuilder]]
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.builders_for_feature = defaultdict(list)
|
||||
self.builders = []
|
||||
|
||||
def register(self, treebuilder_class: type[TreeBuilder]) -> None:
|
||||
"""Register a treebuilder based on its advertised features.
|
||||
|
||||
:param treebuilder_class: A subclass of `TreeBuilder`. its
|
||||
`TreeBuilder.features` attribute should list its features.
|
||||
"""
|
||||
for feature in treebuilder_class.features:
|
||||
self.builders_for_feature[feature].insert(0, treebuilder_class)
|
||||
self.builders.insert(0, treebuilder_class)
|
||||
|
||||
def lookup(self, *features: str) -> Optional[Type[TreeBuilder]]:
|
||||
"""Look up a TreeBuilder subclass with the desired features.
|
||||
|
||||
:param features: A list of features to look for. If none are
|
||||
provided, the most recently registered TreeBuilder subclass
|
||||
will be used.
|
||||
:return: A TreeBuilder subclass, or None if there's no
|
||||
registered subclass with all the requested features.
|
||||
"""
|
||||
if len(self.builders) == 0:
|
||||
# There are no builders at all.
|
||||
return None
|
||||
|
||||
if len(features) == 0:
|
||||
# They didn't ask for any features. Give them the most
|
||||
# recently registered builder.
|
||||
return self.builders[0]
|
||||
|
||||
# Go down the list of features in order, and eliminate any builders
|
||||
# that don't match every feature.
|
||||
feature_list = list(features)
|
||||
feature_list.reverse()
|
||||
candidates = None
|
||||
candidate_set = None
|
||||
while len(feature_list) > 0:
|
||||
feature = feature_list.pop()
|
||||
we_have_the_feature = self.builders_for_feature.get(feature, [])
|
||||
if len(we_have_the_feature) > 0:
|
||||
if candidates is None:
|
||||
candidates = we_have_the_feature
|
||||
candidate_set = set(candidates)
|
||||
elif candidate_set is not None:
|
||||
# Eliminate any candidates that don't have this feature.
|
||||
candidate_set = candidate_set.intersection(set(we_have_the_feature))
|
||||
|
||||
# The only valid candidates are the ones in candidate_set.
|
||||
# Go through the original list of candidates and pick the first one
|
||||
# that's in candidate_set.
|
||||
if candidate_set is None or candidates is None:
|
||||
return None
|
||||
for candidate in candidates:
|
||||
if candidate in candidate_set:
|
||||
return candidate
|
||||
return None
|
||||
|
||||
|
||||
#: The `BeautifulSoup` constructor will take a list of features
|
||||
#: and use it to look up `TreeBuilder` classes in this registry.
|
||||
builder_registry: TreeBuilderRegistry = TreeBuilderRegistry()
|
||||
|
||||
|
||||
class TreeBuilder(object):
|
||||
"""Turn a textual document into a Beautiful Soup object tree.
|
||||
|
||||
This is an abstract superclass which smooths out the behavior of
|
||||
different parser libraries into a single, unified interface.
|
||||
|
||||
:param multi_valued_attributes: If this is set to None, the
|
||||
TreeBuilder will not turn any values for attributes like
|
||||
'class' into lists. Setting this to a dictionary will
|
||||
customize this behavior; look at :py:attr:`bs4.builder.HTMLTreeBuilder.DEFAULT_CDATA_LIST_ATTRIBUTES`
|
||||
for an example.
|
||||
|
||||
Internally, these are called "CDATA list attributes", but that
|
||||
probably doesn't make sense to an end-user, so the argument name
|
||||
is ``multi_valued_attributes``.
|
||||
|
||||
:param preserve_whitespace_tags: A set of tags to treat
|
||||
the way <pre> tags are treated in HTML. Tags in this set
|
||||
are immune from pretty-printing; their contents will always be
|
||||
output as-is.
|
||||
|
||||
:param string_containers: A dictionary mapping tag names to
|
||||
the classes that should be instantiated to contain the textual
|
||||
contents of those tags. The default is to use NavigableString
|
||||
for every tag, no matter what the name. You can override the
|
||||
default by changing :py:attr:`DEFAULT_STRING_CONTAINERS`.
|
||||
|
||||
:param store_line_numbers: If the parser keeps track of the line
|
||||
numbers and positions of the original markup, that information
|
||||
will, by default, be stored in each corresponding
|
||||
:py:class:`bs4.element.Tag` object. You can turn this off by
|
||||
passing store_line_numbers=False; then Tag.sourcepos and
|
||||
Tag.sourceline will always be None. If the parser you're using
|
||||
doesn't keep track of this information, then store_line_numbers
|
||||
is irrelevant.
|
||||
|
||||
:param attribute_dict_class: The value of a multi-valued attribute
|
||||
(such as HTML's 'class') willl be stored in an instance of this
|
||||
class. The default is Beautiful Soup's built-in
|
||||
`AttributeValueList`, which is a normal Python list, and you
|
||||
will probably never need to change it.
|
||||
"""
|
||||
|
||||
USE_DEFAULT: Any = object() #: :meta private:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
multi_valued_attributes: Dict[str, Set[str]] = USE_DEFAULT,
|
||||
preserve_whitespace_tags: Set[str] = USE_DEFAULT,
|
||||
store_line_numbers: bool = USE_DEFAULT,
|
||||
string_containers: Dict[str, Type[NavigableString]] = USE_DEFAULT,
|
||||
empty_element_tags: Set[str] = USE_DEFAULT,
|
||||
attribute_dict_class: Type[AttributeDict] = AttributeDict,
|
||||
attribute_value_list_class: Type[AttributeValueList] = AttributeValueList,
|
||||
):
|
||||
self.soup = None
|
||||
if multi_valued_attributes is self.USE_DEFAULT:
|
||||
multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
|
||||
self.cdata_list_attributes = multi_valued_attributes
|
||||
if preserve_whitespace_tags is self.USE_DEFAULT:
|
||||
preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
|
||||
self.preserve_whitespace_tags = preserve_whitespace_tags
|
||||
if empty_element_tags is self.USE_DEFAULT:
|
||||
self.empty_element_tags = self.DEFAULT_EMPTY_ELEMENT_TAGS
|
||||
else:
|
||||
self.empty_element_tags = empty_element_tags
|
||||
# TODO: store_line_numbers is probably irrelevant now that
|
||||
# the behavior of sourceline and sourcepos has been made consistent
|
||||
# everywhere.
|
||||
if store_line_numbers == self.USE_DEFAULT:
|
||||
store_line_numbers = self.TRACKS_LINE_NUMBERS
|
||||
self.store_line_numbers = store_line_numbers
|
||||
if string_containers == self.USE_DEFAULT:
|
||||
string_containers = self.DEFAULT_STRING_CONTAINERS
|
||||
self.string_containers = string_containers
|
||||
self.attribute_dict_class = attribute_dict_class
|
||||
self.attribute_value_list_class = attribute_value_list_class
|
||||
|
||||
NAME: str = "[Unknown tree builder]"
|
||||
ALTERNATE_NAMES: Iterable[str] = []
|
||||
features: Iterable[str] = []
|
||||
|
||||
is_xml: bool = False
|
||||
picklable: bool = False
|
||||
|
||||
soup: Optional[BeautifulSoup] #: :meta private:
|
||||
|
||||
#: A tag will be considered an empty-element
|
||||
#: tag when and only when it has no contents.
|
||||
empty_element_tags: Optional[Set[str]] = None #: :meta private:
|
||||
cdata_list_attributes: Dict[str, Set[str]] #: :meta private:
|
||||
preserve_whitespace_tags: Set[str] #: :meta private:
|
||||
string_containers: Dict[str, Type[NavigableString]] #: :meta private:
|
||||
tracks_line_numbers: bool #: :meta private:
|
||||
|
||||
#: A value for these tag/attribute combinations is a space- or
|
||||
#: comma-separated list of CDATA, rather than a single CDATA.
|
||||
DEFAULT_CDATA_LIST_ATTRIBUTES: Dict[str, Set[str]] = defaultdict(set)
|
||||
|
||||
#: Whitespace should be preserved inside these tags.
|
||||
DEFAULT_PRESERVE_WHITESPACE_TAGS: Set[str] = set()
|
||||
|
||||
#: The textual contents of tags with these names should be
|
||||
#: instantiated with some class other than `bs4.element.NavigableString`.
|
||||
DEFAULT_STRING_CONTAINERS: Dict[str, Type[bs4.element.NavigableString]] = {} # type:ignore
|
||||
|
||||
#: By default, tags are treated as empty-element tags if they have
|
||||
#: no contents--that is, using XML rules. HTMLTreeBuilder
|
||||
#: defines a different set of DEFAULT_EMPTY_ELEMENT_TAGS based on the
|
||||
#: HTML 4 and HTML5 standards.
|
||||
DEFAULT_EMPTY_ELEMENT_TAGS: Optional[Set[str]] = None
|
||||
|
||||
#: Most parsers don't keep track of line numbers.
|
||||
TRACKS_LINE_NUMBERS: bool = False
|
||||
|
||||
def initialize_soup(self, soup: BeautifulSoup) -> None:
|
||||
"""The BeautifulSoup object has been initialized and is now
|
||||
being associated with the TreeBuilder.
|
||||
|
||||
:param soup: A BeautifulSoup object.
|
||||
"""
|
||||
self.soup = soup
|
||||
|
||||
def reset(self) -> None:
|
||||
"""Do any work necessary to reset the underlying parser
|
||||
for a new document.
|
||||
|
||||
By default, this does nothing.
|
||||
"""
|
||||
pass
|
||||
|
||||
def can_be_empty_element(self, tag_name: str) -> bool:
|
||||
"""Might a tag with this name be an empty-element tag?
|
||||
|
||||
The final markup may or may not actually present this tag as
|
||||
self-closing.
|
||||
|
||||
For instance: an HTMLBuilder does not consider a <p> tag to be
|
||||
an empty-element tag (it's not in
|
||||
HTMLBuilder.empty_element_tags). This means an empty <p> tag
|
||||
will be presented as "<p></p>", not "<p/>" or "<p>".
|
||||
|
||||
The default implementation has no opinion about which tags are
|
||||
empty-element tags, so a tag will be presented as an
|
||||
empty-element tag if and only if it has no children.
|
||||
"<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
|
||||
be left alone.
|
||||
|
||||
:param tag_name: The name of a markup tag.
|
||||
"""
|
||||
if self.empty_element_tags is None:
|
||||
return True
|
||||
return tag_name in self.empty_element_tags
|
||||
|
||||
def feed(self, markup: _RawMarkup) -> None:
|
||||
"""Run incoming markup through some parsing process."""
|
||||
raise NotImplementedError()
|
||||
|
||||
def prepare_markup(
|
||||
self,
|
||||
markup: _RawMarkup,
|
||||
user_specified_encoding: Optional[_Encoding] = None,
|
||||
document_declared_encoding: Optional[_Encoding] = None,
|
||||
exclude_encodings: Optional[_Encodings] = None,
|
||||
) -> Iterable[Tuple[_RawMarkup, Optional[_Encoding], Optional[_Encoding], bool]]:
|
||||
"""Run any preliminary steps necessary to make incoming markup
|
||||
acceptable to the parser.
|
||||
|
||||
:param markup: The markup that's about to be parsed.
|
||||
:param user_specified_encoding: The user asked to try this encoding
|
||||
to convert the markup into a Unicode string.
|
||||
:param document_declared_encoding: The markup itself claims to be
|
||||
in this encoding. NOTE: This argument is not used by the
|
||||
calling code and can probably be removed.
|
||||
:param exclude_encodings: The user asked *not* to try any of
|
||||
these encodings.
|
||||
|
||||
:yield: A series of 4-tuples: (markup, encoding, declared encoding,
|
||||
has undergone character replacement)
|
||||
|
||||
Each 4-tuple represents a strategy that the parser can try
|
||||
to convert the document to Unicode and parse it. Each
|
||||
strategy will be tried in turn.
|
||||
|
||||
By default, the only strategy is to parse the markup
|
||||
as-is. See `LXMLTreeBuilderForXML` and
|
||||
`HTMLParserTreeBuilder` for implementations that take into
|
||||
account the quirks of particular parsers.
|
||||
|
||||
:meta private:
|
||||
|
||||
"""
|
||||
yield markup, None, None, False
|
||||
|
||||
def test_fragment_to_document(self, fragment: str) -> str:
|
||||
"""Wrap an HTML fragment to make it look like a document.
|
||||
|
||||
Different parsers do this differently. For instance, lxml
|
||||
introduces an empty <head> tag, and html5lib
|
||||
doesn't. Abstracting this away lets us write simple tests
|
||||
which run HTML fragments through the parser and compare the
|
||||
results against other HTML fragments.
|
||||
|
||||
This method should not be used outside of unit tests.
|
||||
|
||||
:param fragment: A fragment of HTML.
|
||||
:return: A full HTML document.
|
||||
:meta private:
|
||||
"""
|
||||
return fragment
|
||||
|
||||
def set_up_substitutions(self, tag: Tag) -> bool:
|
||||
"""Set up any substitutions that will need to be performed on
|
||||
a `Tag` when it's output as a string.
|
||||
|
||||
By default, this does nothing. See `HTMLTreeBuilder` for a
|
||||
case where this is used.
|
||||
|
||||
:return: Whether or not a substitution was performed.
|
||||
:meta private:
|
||||
"""
|
||||
return False
|
||||
|
||||
def _replace_cdata_list_attribute_values(
|
||||
self, tag_name: str, attrs: _RawOrProcessedAttributeValues
|
||||
) -> _AttributeValues:
|
||||
"""When an attribute value is associated with a tag that can
|
||||
have multiple values for that attribute, convert the string
|
||||
value to a list of strings.
|
||||
|
||||
Basically, replaces class="foo bar" with class=["foo", "bar"]
|
||||
|
||||
NOTE: This method modifies its input in place.
|
||||
|
||||
:param tag_name: The name of a tag.
|
||||
:param attrs: A dictionary containing the tag's attributes.
|
||||
Any appropriate attribute values will be modified in place.
|
||||
:return: The modified dictionary that was originally passed in.
|
||||
"""
|
||||
|
||||
# First, cast the attrs dict to _AttributeValues. This might
|
||||
# not be accurate yet, but it will be by the time this method
|
||||
# returns.
|
||||
modified_attrs = cast(_AttributeValues, attrs)
|
||||
if not modified_attrs or not self.cdata_list_attributes:
|
||||
# Nothing to do.
|
||||
return modified_attrs
|
||||
|
||||
# There is at least a possibility that we need to modify one of
|
||||
# the attribute values.
|
||||
universal: Set[str] = self.cdata_list_attributes.get("*", set())
|
||||
tag_specific = self.cdata_list_attributes.get(tag_name.lower(), None)
|
||||
for attr in list(modified_attrs.keys()):
|
||||
modified_value: _AttributeValue
|
||||
if attr in universal or (tag_specific and attr in tag_specific):
|
||||
# We have a "class"-type attribute whose string
|
||||
# value is a whitespace-separated list of
|
||||
# values. Split it into a list.
|
||||
original_value: _AttributeValue = modified_attrs[attr]
|
||||
if isinstance(original_value, _RawAttributeValue):
|
||||
# This is a _RawAttributeValue (a string) that
|
||||
# needs to be split and converted to a
|
||||
# AttributeValueList so it can be an
|
||||
# _AttributeValue.
|
||||
modified_value = self.attribute_value_list_class(
|
||||
nonwhitespace_re.findall(original_value)
|
||||
)
|
||||
else:
|
||||
# html5lib calls setAttributes twice for the
|
||||
# same tag when rearranging the parse tree. On
|
||||
# the second call the attribute value here is
|
||||
# already a list. This can also happen when a
|
||||
# Tag object is cloned. If this happens, leave
|
||||
# the value alone rather than trying to split
|
||||
# it again.
|
||||
modified_value = original_value
|
||||
modified_attrs[attr] = modified_value
|
||||
return modified_attrs
|
||||
|
||||
|
||||
class SAXTreeBuilder(TreeBuilder):
|
||||
"""A Beautiful Soup treebuilder that listens for SAX events.
|
||||
|
||||
This is not currently used for anything, and it will be removed
|
||||
soon. It was a good idea, but it wasn't properly integrated into the
|
||||
rest of Beautiful Soup, so there have been long stretches where it
|
||||
hasn't worked properly.
|
||||
"""
|
||||
|
||||
def __init__(self, *args: Any, **kwargs: Any) -> None:
|
||||
warnings.warn(
|
||||
"The SAXTreeBuilder class was deprecated in 4.13.0 and will be removed soon thereafter. It is completely untested and probably doesn't work; do not use it.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
super(SAXTreeBuilder, self).__init__(*args, **kwargs)
|
||||
|
||||
def feed(self, markup: _RawMarkup) -> None:
|
||||
raise NotImplementedError()
|
||||
|
||||
def close(self) -> None:
|
||||
pass
|
||||
|
||||
def startElement(self, name: str, attrs: Dict[str, str]) -> None:
|
||||
attrs = AttributeDict((key[1], value) for key, value in list(attrs.items()))
|
||||
# print("Start %s, %r" % (name, attrs))
|
||||
assert self.soup is not None
|
||||
self.soup.handle_starttag(name, None, None, attrs)
|
||||
|
||||
def endElement(self, name: str) -> None:
|
||||
# print("End %s" % name)
|
||||
assert self.soup is not None
|
||||
self.soup.handle_endtag(name)
|
||||
|
||||
def startElementNS(
|
||||
self, nsTuple: Tuple[str, str], nodeName: str, attrs: Dict[str, str]
|
||||
) -> None:
|
||||
# Throw away (ns, nodeName) for now.
|
||||
self.startElement(nodeName, attrs)
|
||||
|
||||
def endElementNS(self, nsTuple: Tuple[str, str], nodeName: str) -> None:
|
||||
# Throw away (ns, nodeName) for now.
|
||||
self.endElement(nodeName)
|
||||
# handler.endElementNS((ns, node.nodeName), node.nodeName)
|
||||
|
||||
def startPrefixMapping(self, prefix: str, nodeValue: str) -> None:
|
||||
# Ignore the prefix for now.
|
||||
pass
|
||||
|
||||
def endPrefixMapping(self, prefix: str) -> None:
|
||||
# Ignore the prefix for now.
|
||||
# handler.endPrefixMapping(prefix)
|
||||
pass
|
||||
|
||||
def characters(self, content: str) -> None:
|
||||
assert self.soup is not None
|
||||
self.soup.handle_data(content)
|
||||
|
||||
def startDocument(self) -> None:
|
||||
pass
|
||||
|
||||
def endDocument(self) -> None:
|
||||
pass
|
||||
|
||||
|
||||
class HTMLTreeBuilder(TreeBuilder):
|
||||
"""This TreeBuilder knows facts about HTML, such as which tags are treated
|
||||
specially by the HTML standard.
|
||||
"""
|
||||
|
||||
#: Some HTML tags are defined as having no contents. Beautiful Soup
|
||||
#: treats these specially.
|
||||
DEFAULT_EMPTY_ELEMENT_TAGS: Optional[Set[str]] = set(
|
||||
[
|
||||
# These are from HTML5.
|
||||
"area",
|
||||
"base",
|
||||
"br",
|
||||
"col",
|
||||
"embed",
|
||||
"hr",
|
||||
"img",
|
||||
"input",
|
||||
"keygen",
|
||||
"link",
|
||||
"menuitem",
|
||||
"meta",
|
||||
"param",
|
||||
"source",
|
||||
"track",
|
||||
"wbr",
|
||||
# These are from earlier versions of HTML and are removed in HTML5.
|
||||
"basefont",
|
||||
"bgsound",
|
||||
"command",
|
||||
"frame",
|
||||
"image",
|
||||
"isindex",
|
||||
"nextid",
|
||||
"spacer",
|
||||
]
|
||||
)
|
||||
|
||||
#: The HTML standard defines these tags as block-level elements. Beautiful
|
||||
#: Soup does not treat these elements differently from other elements,
|
||||
#: but it may do so eventually, and this information is available if
|
||||
#: you need to use it.
|
||||
DEFAULT_BLOCK_ELEMENTS: Set[str] = set(
|
||||
[
|
||||
"address",
|
||||
"article",
|
||||
"aside",
|
||||
"blockquote",
|
||||
"canvas",
|
||||
"dd",
|
||||
"div",
|
||||
"dl",
|
||||
"dt",
|
||||
"fieldset",
|
||||
"figcaption",
|
||||
"figure",
|
||||
"footer",
|
||||
"form",
|
||||
"h1",
|
||||
"h2",
|
||||
"h3",
|
||||
"h4",
|
||||
"h5",
|
||||
"h6",
|
||||
"header",
|
||||
"hr",
|
||||
"li",
|
||||
"main",
|
||||
"nav",
|
||||
"noscript",
|
||||
"ol",
|
||||
"output",
|
||||
"p",
|
||||
"pre",
|
||||
"section",
|
||||
"table",
|
||||
"tfoot",
|
||||
"ul",
|
||||
"video",
|
||||
]
|
||||
)
|
||||
|
||||
#: These HTML tags need special treatment so they can be
|
||||
#: represented by a string class other than `bs4.element.NavigableString`.
|
||||
#:
|
||||
#: For some of these tags, it's because the HTML standard defines
|
||||
#: an unusual content model for them. I made this list by going
|
||||
#: through the HTML spec
|
||||
#: (https://html.spec.whatwg.org/#metadata-content) and looking for
|
||||
#: "metadata content" elements that can contain strings.
|
||||
#:
|
||||
#: The Ruby tags (<rt> and <rp>) are here despite being normal
|
||||
#: "phrasing content" tags, because the content they contain is
|
||||
#: qualitatively different from other text in the document, and it
|
||||
#: can be useful to be able to distinguish it.
|
||||
#:
|
||||
#: TODO: Arguably <noscript> could go here but it seems
|
||||
#: qualitatively different from the other tags.
|
||||
DEFAULT_STRING_CONTAINERS: Dict[str, Type[bs4.element.NavigableString]] = { # type:ignore
|
||||
"rt": RubyTextString,
|
||||
"rp": RubyParenthesisString,
|
||||
"style": Stylesheet,
|
||||
"script": Script,
|
||||
"template": TemplateString,
|
||||
}
|
||||
|
||||
#: The HTML standard defines these attributes as containing a
|
||||
#: space-separated list of values, not a single value. That is,
|
||||
#: class="foo bar" means that the 'class' attribute has two values,
|
||||
#: 'foo' and 'bar', not the single value 'foo bar'. When we
|
||||
#: encounter one of these attributes, we will parse its value into
|
||||
#: a list of values if possible. Upon output, the list will be
|
||||
#: converted back into a string.
|
||||
DEFAULT_CDATA_LIST_ATTRIBUTES: Dict[str, Set[str]] = {
|
||||
"*": {"class", "accesskey", "dropzone"},
|
||||
"a": {"rel", "rev"},
|
||||
"link": {"rel", "rev"},
|
||||
"td": {"headers"},
|
||||
"th": {"headers"},
|
||||
"form": {"accept-charset"},
|
||||
"object": {"archive"},
|
||||
# These are HTML5 specific, as are *.accesskey and *.dropzone above.
|
||||
"area": {"rel"},
|
||||
"icon": {"sizes"},
|
||||
"iframe": {"sandbox"},
|
||||
"output": {"for"},
|
||||
}
|
||||
|
||||
#: By default, whitespace inside these HTML tags will be
|
||||
#: preserved rather than being collapsed.
|
||||
DEFAULT_PRESERVE_WHITESPACE_TAGS: set[str] = set(["pre", "textarea"])
|
||||
|
||||
def set_up_substitutions(self, tag: Tag) -> bool:
|
||||
"""Replace the declared encoding in a <meta> tag with a placeholder,
|
||||
to be substituted when the tag is output to a string.
|
||||
|
||||
An HTML document may come in to Beautiful Soup as one
|
||||
encoding, but exit in a different encoding, and the <meta> tag
|
||||
needs to be changed to reflect this.
|
||||
|
||||
:return: Whether or not a substitution was performed.
|
||||
|
||||
:meta private:
|
||||
"""
|
||||
# We are only interested in <meta> tags
|
||||
if tag.name != "meta":
|
||||
return False
|
||||
|
||||
# TODO: This cast will fail in the (very unlikely) scenario
|
||||
# that the programmer who instantiates the TreeBuilder
|
||||
# specifies meta['content'] or meta['charset'] as
|
||||
# cdata_list_attributes.
|
||||
content: Optional[str] = cast(Optional[str], tag.get("content"))
|
||||
charset: Optional[str] = cast(Optional[str], tag.get("charset"))
|
||||
|
||||
# But we can accommodate meta['http-equiv'] being made a
|
||||
# cdata_list_attribute (again, very unlikely) without much
|
||||
# trouble.
|
||||
http_equiv: List[str] = tag.get_attribute_list("http-equiv")
|
||||
|
||||
# We are interested in <meta> tags that say what encoding the
|
||||
# document was originally in. This means HTML 5-style <meta>
|
||||
# tags that provide the "charset" attribute. It also means
|
||||
# HTML 4-style <meta> tags that provide the "content"
|
||||
# attribute and have "http-equiv" set to "content-type".
|
||||
#
|
||||
# In both cases we will replace the value of the appropriate
|
||||
# attribute with a standin object that can take on any
|
||||
# encoding.
|
||||
substituted = False
|
||||
if charset is not None:
|
||||
# HTML 5 style:
|
||||
# <meta charset="utf8">
|
||||
tag["charset"] = CharsetMetaAttributeValue(charset)
|
||||
substituted = True
|
||||
|
||||
elif content is not None and any(
|
||||
x.lower() == "content-type" for x in http_equiv
|
||||
):
|
||||
# HTML 4 style:
|
||||
# <meta http-equiv="content-type" content="text/html; charset=utf8">
|
||||
tag["content"] = ContentMetaAttributeValue(content)
|
||||
substituted = True
|
||||
|
||||
return substituted
|
||||
|
||||
|
||||
class DetectsXMLParsedAsHTML(object):
|
||||
"""A mixin class for any class (a TreeBuilder, or some class used by a
|
||||
TreeBuilder) that's in a position to detect whether an XML
|
||||
document is being incorrectly parsed as HTML, and issue an
|
||||
appropriate warning.
|
||||
|
||||
This requires being able to observe an incoming processing
|
||||
instruction that might be an XML declaration, and also able to
|
||||
observe tags as they're opened. If you can't do that for a given
|
||||
`TreeBuilder`, there's a less reliable implementation based on
|
||||
examining the raw markup.
|
||||
"""
|
||||
|
||||
#: Regular expression for seeing if string markup has an <html> tag.
|
||||
LOOKS_LIKE_HTML: Pattern[str] = re.compile("<[^ +]html", re.I)
|
||||
|
||||
#: Regular expression for seeing if byte markup has an <html> tag.
|
||||
LOOKS_LIKE_HTML_B: Pattern[bytes] = re.compile(b"<[^ +]html", re.I)
|
||||
|
||||
#: The start of an XML document string.
|
||||
XML_PREFIX: str = "<?xml"
|
||||
|
||||
#: The start of an XML document bytestring.
|
||||
XML_PREFIX_B: bytes = b"<?xml"
|
||||
|
||||
# This is typed as str, not `ProcessingInstruction`, because this
|
||||
# check may be run before any Beautiful Soup objects are created.
|
||||
_first_processing_instruction: Optional[str] #: :meta private:
|
||||
_root_tag_name: Optional[str] #: :meta private:
|
||||
|
||||
@classmethod
|
||||
def warn_if_markup_looks_like_xml(
|
||||
cls, markup: Optional[_RawMarkup], stacklevel: int = 3
|
||||
) -> bool:
|
||||
"""Perform a check on some markup to see if it looks like XML
|
||||
that's not XHTML. If so, issue a warning.
|
||||
|
||||
This is much less reliable than doing the check while parsing,
|
||||
but some of the tree builders can't do that.
|
||||
|
||||
:param stacklevel: The stacklevel of the code calling this\
|
||||
function.
|
||||
|
||||
:return: True if the markup looks like non-XHTML XML, False
|
||||
otherwise.
|
||||
"""
|
||||
if markup is None:
|
||||
return False
|
||||
markup = markup[:500]
|
||||
if isinstance(markup, bytes):
|
||||
markup_b: bytes = markup
|
||||
looks_like_xml = markup_b.startswith(
|
||||
cls.XML_PREFIX_B
|
||||
) and not cls.LOOKS_LIKE_HTML_B.search(markup)
|
||||
else:
|
||||
markup_s: str = markup
|
||||
looks_like_xml = markup_s.startswith(
|
||||
cls.XML_PREFIX
|
||||
) and not cls.LOOKS_LIKE_HTML.search(markup)
|
||||
|
||||
if looks_like_xml:
|
||||
cls._warn(stacklevel=stacklevel + 2)
|
||||
return True
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def _warn(cls, stacklevel: int = 5) -> None:
|
||||
"""Issue a warning about XML being parsed as HTML."""
|
||||
warnings.warn(
|
||||
XMLParsedAsHTMLWarning.MESSAGE,
|
||||
XMLParsedAsHTMLWarning,
|
||||
stacklevel=stacklevel,
|
||||
)
|
||||
|
||||
def _initialize_xml_detector(self) -> None:
|
||||
"""Call this method before parsing a document."""
|
||||
self._first_processing_instruction = None
|
||||
self._root_tag_name = None
|
||||
|
||||
def _document_might_be_xml(self, processing_instruction: str) -> None:
|
||||
"""Call this method when encountering an XML declaration, or a
|
||||
"processing instruction" that might be an XML declaration.
|
||||
|
||||
This helps Beautiful Soup detect potential issues later, if
|
||||
the XML document turns out to be a non-XHTML document that's
|
||||
being parsed as XML.
|
||||
"""
|
||||
if (
|
||||
self._first_processing_instruction is not None
|
||||
or self._root_tag_name is not None
|
||||
):
|
||||
# The document has already started. Don't bother checking
|
||||
# anymore.
|
||||
return
|
||||
|
||||
self._first_processing_instruction = processing_instruction
|
||||
|
||||
# We won't know until we encounter the first tag whether or
|
||||
# not this is actually a problem.
|
||||
|
||||
def _root_tag_encountered(self, name: str) -> None:
|
||||
"""Call this when you encounter the document's root tag.
|
||||
|
||||
This is where we actually check whether an XML document is
|
||||
being incorrectly parsed as HTML, and issue the warning.
|
||||
"""
|
||||
if self._root_tag_name is not None:
|
||||
# This method was incorrectly called multiple times. Do
|
||||
# nothing.
|
||||
return
|
||||
|
||||
self._root_tag_name = name
|
||||
|
||||
if (
|
||||
name != "html"
|
||||
and self._first_processing_instruction is not None
|
||||
and self._first_processing_instruction.lower().startswith("xml ")
|
||||
):
|
||||
# We encountered an XML declaration and then a tag other
|
||||
# than 'html'. This is a reliable indicator that a
|
||||
# non-XHTML document is being parsed as XML.
|
||||
self._warn(stacklevel=10)
|
||||
|
||||
|
||||
def register_treebuilders_from(module: ModuleType) -> None:
|
||||
"""Copy TreeBuilders from the given module into this module."""
|
||||
this_module = sys.modules[__name__]
|
||||
for name in module.__all__:
|
||||
obj = getattr(module, name)
|
||||
|
||||
if issubclass(obj, TreeBuilder):
|
||||
setattr(this_module, name, obj)
|
||||
this_module.__all__.append(name)
|
||||
# Register the builder while we're at it.
|
||||
this_module.builder_registry.register(obj)
|
||||
|
||||
|
||||
# Builders are registered in reverse order of priority, so that custom
|
||||
# builder registrations will take precedence. In general, we want lxml
|
||||
# to take precedence over html5lib, because it's faster. And we only
|
||||
# want to use HTMLParser as a last resort.
|
||||
from . import _htmlparser # noqa: E402
|
||||
|
||||
register_treebuilders_from(_htmlparser)
|
||||
try:
|
||||
from . import _html5lib
|
||||
|
||||
register_treebuilders_from(_html5lib)
|
||||
except ImportError:
|
||||
# They don't have html5lib installed.
|
||||
pass
|
||||
try:
|
||||
from . import _lxml
|
||||
|
||||
register_treebuilders_from(_lxml)
|
||||
except ImportError:
|
||||
# They don't have lxml installed.
|
||||
pass
|
||||
@@ -0,0 +1,611 @@
|
||||
# Use of this source code is governed by the MIT license.
|
||||
__license__ = "MIT"
|
||||
|
||||
__all__ = [
|
||||
"HTML5TreeBuilder",
|
||||
]
|
||||
|
||||
from typing import (
|
||||
Any,
|
||||
cast,
|
||||
Dict,
|
||||
Iterable,
|
||||
Optional,
|
||||
Sequence,
|
||||
TYPE_CHECKING,
|
||||
Tuple,
|
||||
Union,
|
||||
)
|
||||
from typing_extensions import TypeAlias
|
||||
from bs4._typing import (
|
||||
_AttributeValue,
|
||||
_AttributeValues,
|
||||
_Encoding,
|
||||
_Encodings,
|
||||
_NamespaceURL,
|
||||
_RawMarkup,
|
||||
)
|
||||
|
||||
import warnings
|
||||
from bs4.builder import (
|
||||
DetectsXMLParsedAsHTML,
|
||||
PERMISSIVE,
|
||||
HTML,
|
||||
HTML_5,
|
||||
HTMLTreeBuilder,
|
||||
)
|
||||
from bs4.element import (
|
||||
NamespacedAttribute,
|
||||
PageElement,
|
||||
nonwhitespace_re,
|
||||
)
|
||||
import html5lib
|
||||
from html5lib.constants import (
|
||||
namespaces,
|
||||
)
|
||||
from bs4.element import (
|
||||
Comment,
|
||||
Doctype,
|
||||
NavigableString,
|
||||
Tag,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from html5lib.treebuilders import base as treebuilder_base
|
||||
|
||||
|
||||
class HTML5TreeBuilder(HTMLTreeBuilder):
|
||||
"""Use `html5lib <https://github.com/html5lib/html5lib-python>`_ to
|
||||
build a tree.
|
||||
|
||||
Note that `HTML5TreeBuilder` does not support some common HTML
|
||||
`TreeBuilder` features. Some of these features could theoretically
|
||||
be implemented, but at the very least it's quite difficult,
|
||||
because html5lib moves the parse tree around as it's being built.
|
||||
|
||||
Specifically:
|
||||
|
||||
* This `TreeBuilder` doesn't use different subclasses of
|
||||
`NavigableString` (e.g. `Script`) based on the name of the tag
|
||||
in which the string was found.
|
||||
* You can't use a `SoupStrainer` to parse only part of a document.
|
||||
"""
|
||||
|
||||
NAME: str = "html5lib"
|
||||
|
||||
features: Iterable[str] = [NAME, PERMISSIVE, HTML_5, HTML]
|
||||
|
||||
#: html5lib can tell us which line number and position in the
|
||||
#: original file is the source of an element.
|
||||
TRACKS_LINE_NUMBERS: bool = True
|
||||
|
||||
underlying_builder: "TreeBuilderForHtml5lib" #: :meta private:
|
||||
user_specified_encoding: Optional[_Encoding]
|
||||
|
||||
def prepare_markup(
|
||||
self,
|
||||
markup: _RawMarkup,
|
||||
user_specified_encoding: Optional[_Encoding] = None,
|
||||
document_declared_encoding: Optional[_Encoding] = None,
|
||||
exclude_encodings: Optional[_Encodings] = None,
|
||||
) -> Iterable[Tuple[_RawMarkup, Optional[_Encoding], Optional[_Encoding], bool]]:
|
||||
# Store the user-specified encoding for use later on.
|
||||
self.user_specified_encoding = user_specified_encoding
|
||||
|
||||
# document_declared_encoding and exclude_encodings aren't used
|
||||
# ATM because the html5lib TreeBuilder doesn't use
|
||||
# UnicodeDammit.
|
||||
for variable, name in (
|
||||
(document_declared_encoding, "document_declared_encoding"),
|
||||
(exclude_encodings, "exclude_encodings"),
|
||||
):
|
||||
if variable:
|
||||
warnings.warn(
|
||||
f"You provided a value for {name}, but the html5lib tree builder doesn't support {name}.",
|
||||
stacklevel=3,
|
||||
)
|
||||
|
||||
# html5lib only parses HTML, so if it's given XML that's worth
|
||||
# noting.
|
||||
DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup, stacklevel=3)
|
||||
|
||||
yield (markup, None, None, False)
|
||||
|
||||
# These methods are defined by Beautiful Soup.
|
||||
def feed(self, markup: _RawMarkup) -> None:
|
||||
"""Run some incoming markup through some parsing process,
|
||||
populating the `BeautifulSoup` object in `HTML5TreeBuilder.soup`.
|
||||
"""
|
||||
if self.soup is not None and self.soup.parse_only is not None:
|
||||
warnings.warn(
|
||||
"You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.",
|
||||
stacklevel=4,
|
||||
)
|
||||
|
||||
# self.underlying_builder is probably None now, but it'll be set
|
||||
# when html5lib calls self.create_treebuilder().
|
||||
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
|
||||
assert self.underlying_builder is not None
|
||||
self.underlying_builder.parser = parser
|
||||
extra_kwargs = dict()
|
||||
if not isinstance(markup, str):
|
||||
# kwargs, specifically override_encoding, will eventually
|
||||
# be passed in to html5lib's
|
||||
# HTMLBinaryInputStream.__init__.
|
||||
extra_kwargs["override_encoding"] = self.user_specified_encoding
|
||||
|
||||
doc = parser.parse(markup, **extra_kwargs) # type:ignore
|
||||
|
||||
# Set the character encoding detected by the tokenizer.
|
||||
if isinstance(markup, str):
|
||||
# We need to special-case this because html5lib sets
|
||||
# charEncoding to UTF-8 if it gets Unicode input.
|
||||
doc.original_encoding = None
|
||||
else:
|
||||
original_encoding = parser.tokenizer.stream.charEncoding[0] # type:ignore
|
||||
# The encoding is an html5lib Encoding object. We want to
|
||||
# use a string for compatibility with other tree builders.
|
||||
original_encoding = original_encoding.name
|
||||
doc.original_encoding = original_encoding
|
||||
self.underlying_builder.parser = None
|
||||
|
||||
def create_treebuilder(
|
||||
self, namespaceHTMLElements: bool
|
||||
) -> "TreeBuilderForHtml5lib":
|
||||
"""Called by html5lib to instantiate the kind of class it
|
||||
calls a 'TreeBuilder'.
|
||||
|
||||
:param namespaceHTMLElements: Whether or not to namespace HTML elements.
|
||||
|
||||
:meta private:
|
||||
"""
|
||||
self.underlying_builder = TreeBuilderForHtml5lib(
|
||||
namespaceHTMLElements, self.soup, store_line_numbers=self.store_line_numbers
|
||||
)
|
||||
return self.underlying_builder
|
||||
|
||||
def test_fragment_to_document(self, fragment: str) -> str:
|
||||
"""See `TreeBuilder`."""
|
||||
return "<html><head></head><body>%s</body></html>" % fragment
|
||||
|
||||
|
||||
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
|
||||
soup: "BeautifulSoup" #: :meta private:
|
||||
parser: Optional[html5lib.HTMLParser] #: :meta private:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
namespaceHTMLElements: bool,
|
||||
soup: Optional["BeautifulSoup"] = None,
|
||||
store_line_numbers: bool = True,
|
||||
**kwargs: Any,
|
||||
):
|
||||
if soup:
|
||||
self.soup = soup
|
||||
else:
|
||||
warnings.warn(
|
||||
"The optionality of the 'soup' argument to the TreeBuilderForHtml5lib constructor is deprecated as of Beautiful Soup 4.13.0: 'soup' is now required. If you can't pass in a BeautifulSoup object here, or you get this warning and it seems mysterious to you, please contact the Beautiful Soup developer team for possible un-deprecation.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# TODO: Why is the parser 'html.parser' here? Using
|
||||
# html5lib doesn't cause an infinite loop and is more
|
||||
# accurate. Best to get rid of this entire section, I think.
|
||||
self.soup = BeautifulSoup(
|
||||
"", "html.parser", store_line_numbers=store_line_numbers, **kwargs
|
||||
)
|
||||
# TODO: What are **kwargs exactly? Should they be passed in
|
||||
# here in addition to/instead of being passed to the BeautifulSoup
|
||||
# constructor?
|
||||
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
|
||||
|
||||
# This will be set later to a real html5lib HTMLParser object,
|
||||
# which we can use to track the current line number.
|
||||
self.parser = None
|
||||
self.store_line_numbers = store_line_numbers
|
||||
|
||||
def documentClass(self) -> "Element":
|
||||
self.soup.reset()
|
||||
return Element(self.soup, self.soup, None)
|
||||
|
||||
def insertDoctype(self, token: Dict[str, Any]) -> None:
|
||||
name: str = cast(str, token["name"])
|
||||
publicId: Optional[str] = cast(Optional[str], token["publicId"])
|
||||
systemId: Optional[str] = cast(Optional[str], token["systemId"])
|
||||
|
||||
doctype = Doctype.for_name_and_ids(name, publicId, systemId)
|
||||
self.soup.object_was_parsed(doctype)
|
||||
|
||||
def elementClass(self, name: str, namespace: str) -> "Element":
|
||||
sourceline: Optional[int] = None
|
||||
sourcepos: Optional[int] = None
|
||||
if self.parser is not None and self.store_line_numbers:
|
||||
# This represents the point immediately after the end of the
|
||||
# tag. We don't know when the tag started, but we do know
|
||||
# where it ended -- the character just before this one.
|
||||
sourceline, sourcepos = self.parser.tokenizer.stream.position() # type:ignore
|
||||
assert sourcepos is not None
|
||||
sourcepos = sourcepos - 1
|
||||
tag = self.soup.new_tag(
|
||||
name, namespace, sourceline=sourceline, sourcepos=sourcepos
|
||||
)
|
||||
|
||||
return Element(tag, self.soup, namespace)
|
||||
|
||||
def commentClass(self, data: str) -> "TextNode":
|
||||
return TextNode(Comment(data), self.soup)
|
||||
|
||||
def fragmentClass(self) -> "Element":
|
||||
"""This is only used by html5lib HTMLParser.parseFragment(),
|
||||
which is never used by Beautiful Soup, only by the html5lib
|
||||
unit tests. Since we don't currently hook into those tests,
|
||||
the implementation is left blank.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def getFragment(self) -> "Element":
|
||||
"""This is only used by the html5lib unit tests. Since we
|
||||
don't currently hook into those tests, the implementation is
|
||||
left blank.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def appendChild(self, node: "Element") -> None:
|
||||
# TODO: This code is not covered by the BS4 tests, and
|
||||
# apparently not triggered by the html5lib test suite either.
|
||||
# But it doesn't seem test-specific and there are calls to it
|
||||
# (or a method with the same name) all over html5lib, so I'm
|
||||
# leaving the implementation in place rather than replacing it
|
||||
# with NotImplementedError()
|
||||
self.soup.append(node.element)
|
||||
|
||||
def getDocument(self) -> "BeautifulSoup":
|
||||
return self.soup
|
||||
|
||||
def testSerializer(self, node: "Element") -> None:
|
||||
"""This is only used by the html5lib unit tests. Since we
|
||||
don't currently hook into those tests, the implementation is
|
||||
left blank.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
class AttrList(object):
|
||||
"""Represents a Tag's attributes in a way compatible with html5lib."""
|
||||
|
||||
element: Tag
|
||||
attrs: _AttributeValues
|
||||
|
||||
def __init__(self, element: Tag):
|
||||
self.element = element
|
||||
self.attrs = dict(self.element.attrs)
|
||||
|
||||
def __iter__(self) -> Iterable[Tuple[str, _AttributeValue]]:
|
||||
return list(self.attrs.items()).__iter__()
|
||||
|
||||
def __setitem__(self, name: str, value: _AttributeValue) -> None:
|
||||
# If this attribute is a multi-valued attribute for this element,
|
||||
# turn its value into a list.
|
||||
list_attr = self.element.cdata_list_attributes or {}
|
||||
if name in list_attr.get("*", []) or (
|
||||
self.element.name in list_attr
|
||||
and name in list_attr.get(self.element.name, [])
|
||||
):
|
||||
# A node that is being cloned may have already undergone
|
||||
# this procedure. Check for this and skip it.
|
||||
if not isinstance(value, list):
|
||||
assert isinstance(value, str)
|
||||
value = self.element.attribute_value_list_class(
|
||||
nonwhitespace_re.findall(value)
|
||||
)
|
||||
self.element[name] = value
|
||||
|
||||
def items(self) -> Iterable[Tuple[str, _AttributeValue]]:
|
||||
return list(self.attrs.items())
|
||||
|
||||
def keys(self) -> Iterable[str]:
|
||||
return list(self.attrs.keys())
|
||||
|
||||
def __len__(self) -> int:
|
||||
return len(self.attrs)
|
||||
|
||||
def __getitem__(self, name: str) -> _AttributeValue:
|
||||
return self.attrs[name]
|
||||
|
||||
def __contains__(self, name: str) -> bool:
|
||||
return name in list(self.attrs.keys())
|
||||
|
||||
|
||||
class BeautifulSoupNode(treebuilder_base.Node):
|
||||
# A node can correspond to _either_ a Tag _or_ a NavigableString.
|
||||
tag: Optional[Tag]
|
||||
string: Optional[NavigableString]
|
||||
soup: "BeautifulSoup"
|
||||
namespace: Optional[_NamespaceURL]
|
||||
|
||||
@property
|
||||
def element(self) -> PageElement:
|
||||
assert self.tag is not None or self.string is not None
|
||||
if self.tag is not None:
|
||||
return self.tag
|
||||
else:
|
||||
assert self.string is not None
|
||||
return self.string
|
||||
|
||||
@property
|
||||
def nodeType(self) -> int:
|
||||
"""Return the html5lib constant corresponding to the type of
|
||||
the underlying DOM object.
|
||||
|
||||
NOTE: This property is only accessed by the html5lib test
|
||||
suite, not by Beautiful Soup proper.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
# TODO-TYPING: typeshed stubs are incorrect about this;
|
||||
# cloneNode returns a new Node, not None.
|
||||
def cloneNode(self) -> treebuilder_base.Node: # type:ignore
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
class Element(BeautifulSoupNode):
|
||||
namespace: Optional[_NamespaceURL]
|
||||
|
||||
def __init__(
|
||||
self, element: Tag, soup: "BeautifulSoup", namespace: Optional[_NamespaceURL]
|
||||
):
|
||||
self.tag = element
|
||||
self.string = None
|
||||
self.soup = soup
|
||||
self.namespace = namespace
|
||||
treebuilder_base.Node.__init__(self, element.name)
|
||||
|
||||
def appendChild(self, node: "BeautifulSoupNode") -> None:
|
||||
string_child: Optional[NavigableString] = None
|
||||
child: PageElement
|
||||
if type(node.string) is NavigableString:
|
||||
# We check for NavigableString *only* because we want to avoid
|
||||
# joining PreformattedStrings, such as Comments, with nearby strings.
|
||||
string_child = child = node.string
|
||||
else:
|
||||
child = node.element
|
||||
node.parent = self
|
||||
|
||||
if (
|
||||
child is not None
|
||||
and child.parent is not None
|
||||
and not isinstance(child, str)
|
||||
):
|
||||
node.element.extract()
|
||||
|
||||
if (
|
||||
string_child is not None
|
||||
and self.tag is not None and self.tag.contents
|
||||
and type(self.tag.contents[-1]) is NavigableString
|
||||
):
|
||||
# We are appending a string onto another string.
|
||||
# TODO This has O(n^2) performance, for input like
|
||||
# "a</a>a</a>a</a>..."
|
||||
old_element = self.tag.contents[-1]
|
||||
new_element = self.soup.new_string(old_element + string_child)
|
||||
old_element.replace_with(new_element)
|
||||
self.soup._most_recent_element = new_element
|
||||
else:
|
||||
if isinstance(node, str):
|
||||
# Create a brand new NavigableString from this string.
|
||||
child = self.soup.new_string(node)
|
||||
|
||||
# Tell Beautiful Soup to act as if it parsed this element
|
||||
# immediately after the parent's last descendant. (Or
|
||||
# immediately after the parent, if it has no children.)
|
||||
if self.tag is not None and self.tag.contents:
|
||||
most_recent_element = self.tag._last_descendant(False)
|
||||
elif self.element.next_element is not None:
|
||||
# Something from further ahead in the parse tree is
|
||||
# being inserted into this earlier element. This is
|
||||
# very annoying because it means an expensive search
|
||||
# for the last element in the tree.
|
||||
most_recent_element = self.soup._last_descendant()
|
||||
else:
|
||||
most_recent_element = self.element
|
||||
|
||||
self.soup.object_was_parsed(
|
||||
child, parent=self.tag, most_recent_element=most_recent_element
|
||||
)
|
||||
|
||||
def getAttributes(self) -> AttrList:
|
||||
assert self.tag is not None
|
||||
return AttrList(self.tag)
|
||||
|
||||
# An HTML5lib attribute name may either be a single string,
|
||||
# or a tuple (namespace, name).
|
||||
_Html5libAttributeName: TypeAlias = Union[str, Tuple[str, str]]
|
||||
# Now we can define the type this method accepts as a dictionary
|
||||
# mapping those attribute names to single string values.
|
||||
_Html5libAttributes: TypeAlias = Dict[_Html5libAttributeName, str]
|
||||
|
||||
def setAttributes(self, attributes: Optional[_Html5libAttributes]) -> None:
|
||||
assert self.tag is not None
|
||||
if attributes is not None and len(attributes) > 0:
|
||||
# Replace any namespaced attributes with
|
||||
# NamespacedAttribute objects.
|
||||
for name, value in list(attributes.items()):
|
||||
if isinstance(name, tuple):
|
||||
new_name = NamespacedAttribute(*name)
|
||||
del attributes[name]
|
||||
attributes[new_name] = value
|
||||
|
||||
# We can now cast attributes to the type of Dict
|
||||
# used by Beautiful Soup.
|
||||
normalized_attributes = cast(_AttributeValues, attributes)
|
||||
|
||||
# Values for tags like 'class' came in as single strings;
|
||||
# replace them with lists of strings as appropriate.
|
||||
self.soup.builder._replace_cdata_list_attribute_values(
|
||||
self.name, normalized_attributes
|
||||
)
|
||||
|
||||
# Then set the attributes on the Tag associated with this
|
||||
# BeautifulSoupNode.
|
||||
for name, value_or_values in list(normalized_attributes.items()):
|
||||
self.tag[name] = value_or_values
|
||||
|
||||
# The attributes may contain variables that need substitution.
|
||||
# Call set_up_substitutions manually.
|
||||
#
|
||||
# The Tag constructor called this method when the Tag was created,
|
||||
# but we just set/changed the attributes, so call it again.
|
||||
self.soup.builder.set_up_substitutions(self.tag)
|
||||
|
||||
attributes = property(getAttributes, setAttributes)
|
||||
|
||||
def insertText(
|
||||
self, data: str, insertBefore: Optional["BeautifulSoupNode"] = None
|
||||
) -> None:
|
||||
text = TextNode(self.soup.new_string(data), self.soup)
|
||||
if insertBefore:
|
||||
self.insertBefore(text, insertBefore)
|
||||
else:
|
||||
self.appendChild(text)
|
||||
|
||||
def insertBefore(
|
||||
self, node: "BeautifulSoupNode", refNode: "BeautifulSoupNode"
|
||||
) -> None:
|
||||
assert self.tag is not None
|
||||
index = self.tag.index(refNode.element)
|
||||
if (
|
||||
type(node.element) is NavigableString
|
||||
and self.tag.contents
|
||||
and type(self.tag.contents[index - 1]) is NavigableString
|
||||
):
|
||||
# (See comments in appendChild)
|
||||
old_node = self.tag.contents[index - 1]
|
||||
assert type(old_node) is NavigableString
|
||||
new_str = self.soup.new_string(old_node + node.element)
|
||||
old_node.replace_with(new_str)
|
||||
else:
|
||||
self.tag.insert(index, node.element)
|
||||
node.parent = self
|
||||
|
||||
def removeChild(self, node: "Element") -> None:
|
||||
node.element.extract()
|
||||
|
||||
def reparentChildren(self, newParent: "Element") -> None:
|
||||
"""Move all of this tag's children into another tag."""
|
||||
# print("MOVE", self.element.contents)
|
||||
# print("FROM", self.element)
|
||||
# print("TO", new_parent.element)
|
||||
|
||||
element = self.tag
|
||||
assert element is not None
|
||||
new_parent_element = newParent.tag
|
||||
assert new_parent_element is not None
|
||||
# Determine what this tag's next_element will be once all the children
|
||||
# are removed.
|
||||
final_next_element = element.next_sibling
|
||||
|
||||
new_parents_last_descendant = new_parent_element._last_descendant(False, False)
|
||||
if len(new_parent_element.contents) > 0:
|
||||
# The new parent already contains children. We will be
|
||||
# appending this tag's children to the end.
|
||||
|
||||
# We can make this assertion since we know new_parent has
|
||||
# children.
|
||||
assert new_parents_last_descendant is not None
|
||||
new_parents_last_child = new_parent_element.contents[-1]
|
||||
new_parents_last_descendant_next_element = (
|
||||
new_parents_last_descendant.next_element
|
||||
)
|
||||
else:
|
||||
# The new parent contains no children.
|
||||
new_parents_last_child = None
|
||||
new_parents_last_descendant_next_element = new_parent_element.next_element
|
||||
|
||||
to_append = element.contents
|
||||
if len(to_append) > 0:
|
||||
# Set the first child's previous_element and previous_sibling
|
||||
# to elements within the new parent
|
||||
first_child = to_append[0]
|
||||
if new_parents_last_descendant is not None:
|
||||
first_child.previous_element = new_parents_last_descendant
|
||||
else:
|
||||
first_child.previous_element = new_parent_element
|
||||
first_child.previous_sibling = new_parents_last_child
|
||||
if new_parents_last_descendant is not None:
|
||||
new_parents_last_descendant.next_element = first_child
|
||||
else:
|
||||
new_parent_element.next_element = first_child
|
||||
if new_parents_last_child is not None:
|
||||
new_parents_last_child.next_sibling = first_child
|
||||
|
||||
# Find the very last element being moved. It is now the
|
||||
# parent's last descendant. It has no .next_sibling and
|
||||
# its .next_element is whatever the previous last
|
||||
# descendant had.
|
||||
last_childs_last_descendant = to_append[-1]._last_descendant(
|
||||
is_initialized=False, accept_self=True
|
||||
)
|
||||
|
||||
# Since we passed accept_self=True into _last_descendant,
|
||||
# there's no possibility that the result is None.
|
||||
assert last_childs_last_descendant is not None
|
||||
last_childs_last_descendant.next_element = (
|
||||
new_parents_last_descendant_next_element
|
||||
)
|
||||
if new_parents_last_descendant_next_element is not None:
|
||||
# TODO-COVERAGE: This code has no test coverage and
|
||||
# I'm not sure how to get html5lib to go through this
|
||||
# path, but it's just the other side of the previous
|
||||
# line.
|
||||
new_parents_last_descendant_next_element.previous_element = (
|
||||
last_childs_last_descendant
|
||||
)
|
||||
last_childs_last_descendant.next_sibling = None
|
||||
|
||||
for child in to_append:
|
||||
child.parent = new_parent_element
|
||||
new_parent_element.contents.append(child)
|
||||
|
||||
# Now that this element has no children, change its .next_element.
|
||||
element.contents = []
|
||||
element.next_element = final_next_element
|
||||
|
||||
# print("DONE WITH MOVE")
|
||||
# print("FROM", self.element)
|
||||
# print("TO", new_parent_element)
|
||||
|
||||
# TODO-TYPING: typeshed stubs are incorrect about this;
|
||||
# hasContent returns a boolean, not None.
|
||||
def hasContent(self) -> bool: # type:ignore
|
||||
return self.tag is None or len(self.tag.contents) > 0
|
||||
|
||||
# TODO-TYPING: typeshed stubs are incorrect about this;
|
||||
# cloneNode returns a new Node, not None.
|
||||
def cloneNode(self) -> treebuilder_base.Node: # type:ignore
|
||||
assert self.tag is not None
|
||||
tag = self.soup.new_tag(self.tag.name, self.namespace)
|
||||
node = Element(tag, self.soup, self.namespace)
|
||||
for key, value in self.attributes:
|
||||
node.attributes[key] = value
|
||||
return node
|
||||
|
||||
def getNameTuple(self) -> Tuple[Optional[_NamespaceURL], str]:
|
||||
if self.namespace is None:
|
||||
return namespaces["html"], self.name
|
||||
else:
|
||||
return self.namespace, self.name
|
||||
|
||||
nameTuple = property(getNameTuple)
|
||||
|
||||
|
||||
class TextNode(BeautifulSoupNode):
|
||||
|
||||
def __init__(self, element: NavigableString, soup: "BeautifulSoup"):
|
||||
treebuilder_base.Node.__init__(self, None)
|
||||
self.tag = None
|
||||
self.string = element
|
||||
self.soup = soup
|
||||
@@ -0,0 +1,462 @@
|
||||
# encoding: utf-8
|
||||
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
|
||||
from __future__ import annotations
|
||||
|
||||
# Use of this source code is governed by the MIT license.
|
||||
__license__ = "MIT"
|
||||
|
||||
__all__ = [
|
||||
"HTMLParserTreeBuilder",
|
||||
]
|
||||
|
||||
from html.parser import HTMLParser
|
||||
|
||||
from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
cast,
|
||||
Dict,
|
||||
Iterable,
|
||||
List,
|
||||
Optional,
|
||||
TYPE_CHECKING,
|
||||
Tuple,
|
||||
Type,
|
||||
Union,
|
||||
)
|
||||
|
||||
from bs4.element import (
|
||||
AttributeDict,
|
||||
CData,
|
||||
Comment,
|
||||
Declaration,
|
||||
Doctype,
|
||||
ProcessingInstruction,
|
||||
)
|
||||
from bs4.dammit import EntitySubstitution, UnicodeDammit
|
||||
|
||||
from bs4.builder import (
|
||||
DetectsXMLParsedAsHTML,
|
||||
HTML,
|
||||
HTMLTreeBuilder,
|
||||
STRICT,
|
||||
)
|
||||
|
||||
from bs4.exceptions import ParserRejectedMarkup
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.element import NavigableString
|
||||
from bs4._typing import (
|
||||
_Encoding,
|
||||
_Encodings,
|
||||
_RawMarkup,
|
||||
)
|
||||
|
||||
HTMLPARSER = "html.parser"
|
||||
|
||||
_DuplicateAttributeHandler = Callable[[Dict[str, str], str, str], None]
|
||||
|
||||
|
||||
class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
|
||||
#: Constant to handle duplicate attributes by ignoring later values
|
||||
#: and keeping the earlier ones.
|
||||
REPLACE: str = "replace"
|
||||
|
||||
#: Constant to handle duplicate attributes by replacing earlier values
|
||||
#: with later ones.
|
||||
IGNORE: str = "ignore"
|
||||
|
||||
"""A subclass of the Python standard library's HTMLParser class, which
|
||||
listens for HTMLParser events and translates them into calls
|
||||
to Beautiful Soup's tree construction API.
|
||||
|
||||
:param on_duplicate_attribute: A strategy for what to do if a
|
||||
tag includes the same attribute more than once. Accepted
|
||||
values are: REPLACE (replace earlier values with later
|
||||
ones, the default), IGNORE (keep the earliest value
|
||||
encountered), or a callable. A callable must take three
|
||||
arguments: the dictionary of attributes already processed,
|
||||
the name of the duplicate attribute, and the most recent value
|
||||
encountered.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
soup: BeautifulSoup,
|
||||
*args: Any,
|
||||
on_duplicate_attribute: Union[str, _DuplicateAttributeHandler] = REPLACE,
|
||||
**kwargs: Any,
|
||||
):
|
||||
self.soup = soup
|
||||
self.on_duplicate_attribute = on_duplicate_attribute
|
||||
self.attribute_dict_class = soup.builder.attribute_dict_class
|
||||
HTMLParser.__init__(self, *args, **kwargs)
|
||||
|
||||
# Keep a list of empty-element tags that were encountered
|
||||
# without an explicit closing tag. If we encounter a closing tag
|
||||
# of this type, we'll associate it with one of those entries.
|
||||
#
|
||||
# This isn't a stack because we don't care about the
|
||||
# order. It's a list of closing tags we've already handled and
|
||||
# will ignore, assuming they ever show up.
|
||||
self.already_closed_empty_element = []
|
||||
|
||||
self._initialize_xml_detector()
|
||||
|
||||
on_duplicate_attribute: Union[str, _DuplicateAttributeHandler]
|
||||
already_closed_empty_element: List[str]
|
||||
soup: BeautifulSoup
|
||||
|
||||
def error(self, message: str) -> None:
|
||||
# NOTE: This method is required so long as Python 3.9 is
|
||||
# supported. The corresponding code is removed from HTMLParser
|
||||
# in 3.5, but not removed from ParserBase until 3.10.
|
||||
# https://github.com/python/cpython/issues/76025
|
||||
#
|
||||
# The original implementation turned the error into a warning,
|
||||
# but in every case I discovered, this made HTMLParser
|
||||
# immediately crash with an error message that was less
|
||||
# helpful than the warning. The new implementation makes it
|
||||
# more clear that html.parser just can't parse this
|
||||
# markup. The 3.10 implementation does the same, though it
|
||||
# raises AssertionError rather than calling a method. (We
|
||||
# catch this error and wrap it in a ParserRejectedMarkup.)
|
||||
raise ParserRejectedMarkup(message)
|
||||
|
||||
def handle_startendtag(
|
||||
self, tag: str, attrs: List[Tuple[str, Optional[str]]]
|
||||
) -> None:
|
||||
"""Handle an incoming empty-element tag.
|
||||
|
||||
html.parser only calls this method when the markup looks like
|
||||
<tag/>.
|
||||
"""
|
||||
# `handle_empty_element` tells handle_starttag not to close the tag
|
||||
# just because its name matches a known empty-element tag. We
|
||||
# know that this is an empty-element tag, and we want to call
|
||||
# handle_endtag ourselves.
|
||||
self.handle_starttag(tag, attrs, handle_empty_element=False)
|
||||
self.handle_endtag(tag)
|
||||
|
||||
def handle_starttag(
|
||||
self,
|
||||
tag: str,
|
||||
attrs: List[Tuple[str, Optional[str]]],
|
||||
handle_empty_element: bool = True,
|
||||
) -> None:
|
||||
"""Handle an opening tag, e.g. '<tag>'
|
||||
|
||||
:param handle_empty_element: True if this tag is known to be
|
||||
an empty-element tag (i.e. there is not expected to be any
|
||||
closing tag).
|
||||
"""
|
||||
# TODO: handle namespaces here?
|
||||
attr_dict: AttributeDict = self.attribute_dict_class()
|
||||
for key, value in attrs:
|
||||
# Change None attribute values to the empty string
|
||||
# for consistency with the other tree builders.
|
||||
if value is None:
|
||||
value = ""
|
||||
if key in attr_dict:
|
||||
# A single attribute shows up multiple times in this
|
||||
# tag. How to handle it depends on the
|
||||
# on_duplicate_attribute setting.
|
||||
on_dupe = self.on_duplicate_attribute
|
||||
if on_dupe == self.IGNORE:
|
||||
pass
|
||||
elif on_dupe in (None, self.REPLACE):
|
||||
attr_dict[key] = value
|
||||
else:
|
||||
on_dupe = cast(_DuplicateAttributeHandler, on_dupe)
|
||||
on_dupe(attr_dict, key, value)
|
||||
else:
|
||||
attr_dict[key] = value
|
||||
# print("START", tag)
|
||||
sourceline: Optional[int]
|
||||
sourcepos: Optional[int]
|
||||
if self.soup.builder.store_line_numbers:
|
||||
sourceline, sourcepos = self.getpos()
|
||||
else:
|
||||
sourceline = sourcepos = None
|
||||
tagObj = self.soup.handle_starttag(
|
||||
tag, None, None, attr_dict, sourceline=sourceline, sourcepos=sourcepos
|
||||
)
|
||||
if tagObj is not None and tagObj.is_empty_element and handle_empty_element:
|
||||
# Unlike other parsers, html.parser doesn't send separate end tag
|
||||
# events for empty-element tags. (It's handled in
|
||||
# handle_startendtag, but only if the original markup looked like
|
||||
# <tag/>.)
|
||||
#
|
||||
# So we need to call handle_endtag() ourselves. Since we
|
||||
# know the start event is identical to the end event, we
|
||||
# don't want handle_endtag() to cross off any previous end
|
||||
# events for tags of this name.
|
||||
self.handle_endtag(tag, check_already_closed=False)
|
||||
|
||||
# But we might encounter an explicit closing tag for this tag
|
||||
# later on. If so, we want to ignore it.
|
||||
self.already_closed_empty_element.append(tag)
|
||||
|
||||
if self._root_tag_name is None:
|
||||
self._root_tag_encountered(tag)
|
||||
|
||||
def handle_endtag(self, tag: str, check_already_closed: bool = True) -> None:
|
||||
"""Handle a closing tag, e.g. '</tag>'
|
||||
|
||||
:param tag: A tag name.
|
||||
:param check_already_closed: True if this tag is expected to
|
||||
be the closing portion of an empty-element tag,
|
||||
e.g. '<tag></tag>'.
|
||||
"""
|
||||
# print("END", tag)
|
||||
if check_already_closed and tag in self.already_closed_empty_element:
|
||||
# This is a redundant end tag for an empty-element tag.
|
||||
# We've already called handle_endtag() for it, so just
|
||||
# check it off the list.
|
||||
# print("ALREADY CLOSED", tag)
|
||||
self.already_closed_empty_element.remove(tag)
|
||||
else:
|
||||
self.soup.handle_endtag(tag)
|
||||
|
||||
def handle_data(self, data: str) -> None:
|
||||
"""Handle some textual data that shows up between tags."""
|
||||
self.soup.handle_data(data)
|
||||
|
||||
def handle_charref(self, name: str) -> None:
|
||||
"""Handle a numeric character reference by converting it to the
|
||||
corresponding Unicode character and treating it as textual
|
||||
data.
|
||||
|
||||
:param name: Character number, possibly in hexadecimal.
|
||||
"""
|
||||
# TODO: This was originally a workaround for a bug in
|
||||
# HTMLParser. (http://bugs.python.org/issue13633) The bug has
|
||||
# been fixed, but removing this code still makes some
|
||||
# Beautiful Soup tests fail. This needs investigation.
|
||||
real_name:int
|
||||
if name.startswith("x"):
|
||||
real_name = int(name.lstrip("x"), 16)
|
||||
elif name.startswith("X"):
|
||||
real_name = int(name.lstrip("X"), 16)
|
||||
else:
|
||||
real_name = int(name)
|
||||
|
||||
data, replacement_added = UnicodeDammit.numeric_character_reference(real_name)
|
||||
if replacement_added:
|
||||
self.soup.contains_replacement_characters = True
|
||||
self.handle_data(data)
|
||||
|
||||
def handle_entityref(self, name: str) -> None:
|
||||
"""Handle a named entity reference by converting it to the
|
||||
corresponding Unicode character(s) and treating it as textual
|
||||
data.
|
||||
|
||||
:param name: Name of the entity reference.
|
||||
"""
|
||||
character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
|
||||
if character is not None:
|
||||
data = character
|
||||
else:
|
||||
# If this were XML, it would be ambiguous whether "&foo"
|
||||
# was an character entity reference with a missing
|
||||
# semicolon or the literal string "&foo". Since this is
|
||||
# HTML, we have a complete list of all character entity references,
|
||||
# and this one wasn't found, so assume it's the literal string "&foo".
|
||||
data = "&%s" % name
|
||||
self.handle_data(data)
|
||||
|
||||
def handle_comment(self, data: str) -> None:
|
||||
"""Handle an HTML comment.
|
||||
|
||||
:param data: The text of the comment.
|
||||
"""
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(Comment)
|
||||
|
||||
def handle_decl(self, decl: str) -> None:
|
||||
"""Handle a DOCTYPE declaration.
|
||||
|
||||
:param data: The text of the declaration.
|
||||
"""
|
||||
self.soup.endData()
|
||||
decl = decl[len("DOCTYPE ") :]
|
||||
self.soup.handle_data(decl)
|
||||
self.soup.endData(Doctype)
|
||||
|
||||
def unknown_decl(self, data: str) -> None:
|
||||
"""Handle a declaration of unknown type -- probably a CDATA block.
|
||||
|
||||
:param data: The text of the declaration.
|
||||
"""
|
||||
cls: Type[NavigableString]
|
||||
if data.upper().startswith("CDATA["):
|
||||
cls = CData
|
||||
data = data[len("CDATA[") :]
|
||||
else:
|
||||
cls = Declaration
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(cls)
|
||||
|
||||
def handle_pi(self, data: str) -> None:
|
||||
"""Handle a processing instruction.
|
||||
|
||||
:param data: The text of the instruction.
|
||||
"""
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(data)
|
||||
self._document_might_be_xml(data)
|
||||
self.soup.endData(ProcessingInstruction)
|
||||
|
||||
|
||||
class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||
"""A Beautiful soup `bs4.builder.TreeBuilder` that uses the
|
||||
:py:class:`html.parser.HTMLParser` parser, found in the Python
|
||||
standard library.
|
||||
|
||||
"""
|
||||
|
||||
is_xml: bool = False
|
||||
picklable: bool = True
|
||||
NAME: str = HTMLPARSER
|
||||
features: Iterable[str] = [NAME, HTML, STRICT]
|
||||
parser_args: Tuple[Iterable[Any], Dict[str, Any]]
|
||||
|
||||
#: The html.parser knows which line number and position in the
|
||||
#: original file is the source of an element.
|
||||
TRACKS_LINE_NUMBERS: bool = True
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
parser_args: Optional[Iterable[Any]] = None,
|
||||
parser_kwargs: Optional[Dict[str, Any]] = None,
|
||||
**kwargs: Any,
|
||||
):
|
||||
"""Constructor.
|
||||
|
||||
:param parser_args: Positional arguments to pass into
|
||||
the BeautifulSoupHTMLParser constructor, once it's
|
||||
invoked.
|
||||
:param parser_kwargs: Keyword arguments to pass into
|
||||
the BeautifulSoupHTMLParser constructor, once it's
|
||||
invoked.
|
||||
:param kwargs: Keyword arguments for the superclass constructor.
|
||||
"""
|
||||
# Some keyword arguments will be pulled out of kwargs and placed
|
||||
# into parser_kwargs.
|
||||
extra_parser_kwargs = dict()
|
||||
for arg in ("on_duplicate_attribute",):
|
||||
if arg in kwargs:
|
||||
value = kwargs.pop(arg)
|
||||
extra_parser_kwargs[arg] = value
|
||||
super(HTMLParserTreeBuilder, self).__init__(**kwargs)
|
||||
parser_args = parser_args or []
|
||||
parser_kwargs = parser_kwargs or {}
|
||||
parser_kwargs.update(extra_parser_kwargs)
|
||||
parser_kwargs["convert_charrefs"] = False
|
||||
self.parser_args = (parser_args, parser_kwargs)
|
||||
|
||||
def prepare_markup(
|
||||
self,
|
||||
markup: _RawMarkup,
|
||||
user_specified_encoding: Optional[_Encoding] = None,
|
||||
document_declared_encoding: Optional[_Encoding] = None,
|
||||
exclude_encodings: Optional[_Encodings] = None,
|
||||
) -> Iterable[Tuple[str, Optional[_Encoding], Optional[_Encoding], bool]]:
|
||||
"""Run any preliminary steps necessary to make incoming markup
|
||||
acceptable to the parser.
|
||||
|
||||
:param markup: Some markup -- probably a bytestring.
|
||||
:param user_specified_encoding: The user asked to try this encoding.
|
||||
:param document_declared_encoding: The markup itself claims to be
|
||||
in this encoding.
|
||||
:param exclude_encodings: The user asked _not_ to try any of
|
||||
these encodings.
|
||||
|
||||
:yield: A series of 4-tuples: (markup, encoding, declared encoding,
|
||||
has undergone character replacement)
|
||||
|
||||
Each 4-tuple represents a strategy for parsing the document.
|
||||
This TreeBuilder uses Unicode, Dammit to convert the markup
|
||||
into Unicode, so the ``markup`` element of the tuple will
|
||||
always be a string.
|
||||
"""
|
||||
if isinstance(markup, str):
|
||||
# Parse Unicode as-is.
|
||||
yield (markup, None, None, False)
|
||||
return
|
||||
|
||||
# Ask UnicodeDammit to sniff the most likely encoding.
|
||||
|
||||
known_definite_encodings: List[_Encoding] = []
|
||||
if user_specified_encoding:
|
||||
# This was provided by the end-user; treat it as a known
|
||||
# definite encoding per the algorithm laid out in the
|
||||
# HTML5 spec. (See the EncodingDetector class for
|
||||
# details.)
|
||||
known_definite_encodings.append(user_specified_encoding)
|
||||
|
||||
user_encodings: List[_Encoding] = []
|
||||
if document_declared_encoding:
|
||||
# This was found in the document; treat it as a slightly
|
||||
# lower-priority user encoding.
|
||||
user_encodings.append(document_declared_encoding)
|
||||
|
||||
dammit = UnicodeDammit(
|
||||
markup,
|
||||
known_definite_encodings=known_definite_encodings,
|
||||
user_encodings=user_encodings,
|
||||
is_html=True,
|
||||
exclude_encodings=exclude_encodings,
|
||||
)
|
||||
|
||||
if dammit.unicode_markup is None:
|
||||
# In every case I've seen, Unicode, Dammit is able to
|
||||
# convert the markup into Unicode, even if it needs to use
|
||||
# REPLACEMENT CHARACTER. But there is a code path that
|
||||
# could result in unicode_markup being None, and
|
||||
# HTMLParser can only parse Unicode, so here we handle
|
||||
# that code path.
|
||||
raise ParserRejectedMarkup(
|
||||
"Could not convert input to Unicode, and html.parser will not accept bytestrings."
|
||||
)
|
||||
else:
|
||||
yield (
|
||||
dammit.unicode_markup,
|
||||
dammit.original_encoding,
|
||||
dammit.declared_html_encoding,
|
||||
dammit.contains_replacement_characters,
|
||||
)
|
||||
|
||||
def feed(self, markup: _RawMarkup, _parser_class:type[BeautifulSoupHTMLParser] =BeautifulSoupHTMLParser) -> None:
|
||||
"""
|
||||
:param markup: The markup to feed into the parser.
|
||||
:param _parser_class: An HTMLParser subclass to use. This is only intended for use in unit tests.
|
||||
"""
|
||||
args, kwargs = self.parser_args
|
||||
|
||||
# HTMLParser.feed will only handle str, but
|
||||
# BeautifulSoup.markup is allowed to be _RawMarkup, because
|
||||
# it's set by the yield value of
|
||||
# TreeBuilder.prepare_markup. Fortunately,
|
||||
# HTMLParserTreeBuilder.prepare_markup always yields a str
|
||||
# (UnicodeDammit.unicode_markup).
|
||||
assert isinstance(markup, str)
|
||||
|
||||
# We know BeautifulSoup calls TreeBuilder.initialize_soup
|
||||
# before calling feed(), so we can assume self.soup
|
||||
# is set.
|
||||
assert self.soup is not None
|
||||
parser = _parser_class(self.soup, *args, **kwargs)
|
||||
|
||||
try:
|
||||
parser.feed(markup)
|
||||
parser.close()
|
||||
except AssertionError as e:
|
||||
# html.parser raises AssertionError in rare cases to
|
||||
# indicate a fatal problem with the markup, especially
|
||||
# when there's an error in the doctype declaration.
|
||||
raise ParserRejectedMarkup(e)
|
||||
parser.already_closed_empty_element = []
|
||||
@@ -0,0 +1,501 @@
|
||||
# encoding: utf-8
|
||||
from __future__ import annotations
|
||||
|
||||
# Use of this source code is governed by the MIT license.
|
||||
__license__ = "MIT"
|
||||
|
||||
__all__ = [
|
||||
"LXMLTreeBuilderForXML",
|
||||
"LXMLTreeBuilder",
|
||||
]
|
||||
|
||||
|
||||
from typing import (
|
||||
Any,
|
||||
Dict,
|
||||
Iterable,
|
||||
List,
|
||||
Optional,
|
||||
Set,
|
||||
Tuple,
|
||||
Type,
|
||||
TYPE_CHECKING,
|
||||
Union,
|
||||
)
|
||||
|
||||
from io import BytesIO
|
||||
from io import StringIO
|
||||
|
||||
from typing_extensions import TypeAlias
|
||||
|
||||
from lxml import etree # type:ignore
|
||||
from bs4.element import (
|
||||
AttributeDict,
|
||||
XMLAttributeDict,
|
||||
Comment,
|
||||
Doctype,
|
||||
NamespacedAttribute,
|
||||
ProcessingInstruction,
|
||||
XMLProcessingInstruction,
|
||||
)
|
||||
from bs4.builder import (
|
||||
DetectsXMLParsedAsHTML,
|
||||
FAST,
|
||||
HTML,
|
||||
HTMLTreeBuilder,
|
||||
PERMISSIVE,
|
||||
TreeBuilder,
|
||||
XML,
|
||||
)
|
||||
from bs4.dammit import EncodingDetector
|
||||
from bs4.exceptions import ParserRejectedMarkup
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from bs4._typing import (
|
||||
_Encoding,
|
||||
_Encodings,
|
||||
_NamespacePrefix,
|
||||
_NamespaceURL,
|
||||
_NamespaceMapping,
|
||||
_InvertedNamespaceMapping,
|
||||
_RawMarkup,
|
||||
)
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
LXML: str = "lxml"
|
||||
|
||||
|
||||
def _invert(d: dict[Any, Any]) -> dict[Any, Any]:
|
||||
"Invert a dictionary."
|
||||
return dict((v, k) for k, v in list(d.items()))
|
||||
|
||||
|
||||
_LXMLParser: TypeAlias = Union[etree.XMLParser, etree.HTMLParser]
|
||||
_ParserOrParserClass: TypeAlias = Union[
|
||||
_LXMLParser, Type[etree.XMLParser], Type[etree.HTMLParser]
|
||||
]
|
||||
|
||||
|
||||
class LXMLTreeBuilderForXML(TreeBuilder):
|
||||
DEFAULT_PARSER_CLASS: Type[etree.XMLParser] = etree.XMLParser
|
||||
|
||||
is_xml: bool = True
|
||||
|
||||
#: Set this to true (probably by passing huge_tree=True into the :
|
||||
#: BeautifulSoup constructor) to enable the lxml feature "disable security
|
||||
#: restrictions and support very deep trees and very long text
|
||||
#: content".
|
||||
huge_tree: bool
|
||||
|
||||
processing_instruction_class: Type[ProcessingInstruction]
|
||||
|
||||
NAME: str = "lxml-xml"
|
||||
ALTERNATE_NAMES: Iterable[str] = ["xml"]
|
||||
|
||||
# Well, it's permissive by XML parser standards.
|
||||
features: Iterable[str] = [NAME, LXML, XML, FAST, PERMISSIVE]
|
||||
|
||||
CHUNK_SIZE: int = 512
|
||||
|
||||
# This namespace mapping is specified in the XML Namespace
|
||||
# standard.
|
||||
DEFAULT_NSMAPS: _NamespaceMapping = dict(xml="http://www.w3.org/XML/1998/namespace")
|
||||
|
||||
DEFAULT_NSMAPS_INVERTED: _InvertedNamespaceMapping = _invert(DEFAULT_NSMAPS)
|
||||
|
||||
nsmaps: List[Optional[_InvertedNamespaceMapping]]
|
||||
empty_element_tags: Optional[Set[str]]
|
||||
parser: Any
|
||||
_default_parser: Optional[etree.XMLParser]
|
||||
|
||||
# NOTE: If we parsed Element objects and looked at .sourceline,
|
||||
# we'd be able to see the line numbers from the original document.
|
||||
# But instead we build an XMLParser or HTMLParser object to serve
|
||||
# as the target of parse messages, and those messages don't include
|
||||
# line numbers.
|
||||
# See: https://bugs.launchpad.net/lxml/+bug/1846906
|
||||
|
||||
def initialize_soup(self, soup: BeautifulSoup) -> None:
|
||||
"""Let the BeautifulSoup object know about the standard namespace
|
||||
mapping.
|
||||
|
||||
:param soup: A `BeautifulSoup`.
|
||||
"""
|
||||
# Beyond this point, self.soup is set, so we can assume (and
|
||||
# assert) it's not None whenever necessary.
|
||||
super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
|
||||
self._register_namespaces(self.DEFAULT_NSMAPS)
|
||||
|
||||
def _register_namespaces(self, mapping: Dict[str, str]) -> None:
|
||||
"""Let the BeautifulSoup object know about namespaces encountered
|
||||
while parsing the document.
|
||||
|
||||
This might be useful later on when creating CSS selectors.
|
||||
|
||||
This will track (almost) all namespaces, even ones that were
|
||||
only in scope for part of the document. If two namespaces have
|
||||
the same prefix, only the first one encountered will be
|
||||
tracked. Un-prefixed namespaces are not tracked.
|
||||
|
||||
:param mapping: A dictionary mapping namespace prefixes to URIs.
|
||||
"""
|
||||
assert self.soup is not None
|
||||
for key, value in list(mapping.items()):
|
||||
# This is 'if key' and not 'if key is not None' because we
|
||||
# don't track un-prefixed namespaces. Soupselect will
|
||||
# treat an un-prefixed namespace as the default, which
|
||||
# causes confusion in some cases.
|
||||
if key and key not in self.soup._namespaces:
|
||||
# Let the BeautifulSoup object know about a new namespace.
|
||||
# If there are multiple namespaces defined with the same
|
||||
# prefix, the first one in the document takes precedence.
|
||||
self.soup._namespaces[key] = value
|
||||
|
||||
def default_parser(self, encoding: Optional[_Encoding]) -> _ParserOrParserClass:
|
||||
"""Find the default parser for the given encoding.
|
||||
|
||||
:return: Either a parser object or a class, which
|
||||
will be instantiated with default arguments.
|
||||
"""
|
||||
if self._default_parser is not None:
|
||||
return self._default_parser
|
||||
return self.DEFAULT_PARSER_CLASS(target=self, recover=True, huge_tree=self.huge_tree, encoding=encoding)
|
||||
|
||||
def parser_for(self, encoding: Optional[_Encoding]) -> _LXMLParser:
|
||||
"""Instantiate an appropriate parser for the given encoding.
|
||||
|
||||
:param encoding: A string.
|
||||
:return: A parser object such as an `etree.XMLParser`.
|
||||
"""
|
||||
# Use the default parser.
|
||||
parser = self.default_parser(encoding)
|
||||
|
||||
if callable(parser):
|
||||
# Instantiate the parser with default arguments
|
||||
parser = parser(target=self, recover=True, huge_tree=self.huge_tree, encoding=encoding)
|
||||
return parser
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
parser: Optional[etree.XMLParser] = None,
|
||||
empty_element_tags: Optional[Set[str]] = None,
|
||||
huge_tree: bool = False,
|
||||
**kwargs: Any,
|
||||
):
|
||||
# TODO: Issue a warning if parser is present but not a
|
||||
# callable, since that means there's no way to create new
|
||||
# parsers for different encodings.
|
||||
self._default_parser = parser
|
||||
self.soup = None
|
||||
self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
|
||||
self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
|
||||
if self.is_xml:
|
||||
self.processing_instruction_class = XMLProcessingInstruction
|
||||
else:
|
||||
self.processing_instruction_class = ProcessingInstruction
|
||||
|
||||
if "attribute_dict_class" not in kwargs:
|
||||
kwargs["attribute_dict_class"] = XMLAttributeDict
|
||||
self.huge_tree = huge_tree
|
||||
|
||||
super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
|
||||
|
||||
def _getNsTag(self, tag: str) -> Tuple[Optional[str], str]:
|
||||
# Split the namespace URL out of a fully-qualified lxml tag
|
||||
# name. Copied from lxml's src/lxml/sax.py.
|
||||
if tag[0] == "{" and "}" in tag:
|
||||
namespace, name = tag[1:].split("}", 1)
|
||||
return (namespace, name)
|
||||
return (None, tag)
|
||||
|
||||
def prepare_markup(
|
||||
self,
|
||||
markup: _RawMarkup,
|
||||
user_specified_encoding: Optional[_Encoding] = None,
|
||||
document_declared_encoding: Optional[_Encoding] = None,
|
||||
exclude_encodings: Optional[_Encodings] = None,
|
||||
) -> Iterable[
|
||||
Tuple[Union[str, bytes], Optional[_Encoding], Optional[_Encoding], bool]
|
||||
]:
|
||||
"""Run any preliminary steps necessary to make incoming markup
|
||||
acceptable to the parser.
|
||||
|
||||
lxml really wants to get a bytestring and convert it to
|
||||
Unicode itself. So instead of using UnicodeDammit to convert
|
||||
the bytestring to Unicode using different encodings, this
|
||||
implementation uses EncodingDetector to iterate over the
|
||||
encodings, and tell lxml to try to parse the document as each
|
||||
one in turn.
|
||||
|
||||
:param markup: Some markup -- hopefully a bytestring.
|
||||
:param user_specified_encoding: The user asked to try this encoding.
|
||||
:param document_declared_encoding: The markup itself claims to be
|
||||
in this encoding.
|
||||
:param exclude_encodings: The user asked _not_ to try any of
|
||||
these encodings.
|
||||
|
||||
:yield: A series of 4-tuples: (markup, encoding, declared encoding,
|
||||
has undergone character replacement)
|
||||
|
||||
Each 4-tuple represents a strategy for converting the
|
||||
document to Unicode and parsing it. Each strategy will be tried
|
||||
in turn.
|
||||
"""
|
||||
if not self.is_xml:
|
||||
# We're in HTML mode, so if we're given XML, that's worth
|
||||
# noting.
|
||||
DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup, stacklevel=3)
|
||||
|
||||
if isinstance(markup, str):
|
||||
# We were given Unicode. Maybe lxml can parse Unicode on
|
||||
# this system?
|
||||
|
||||
# TODO: This is a workaround for
|
||||
# https://bugs.launchpad.net/lxml/+bug/1948551.
|
||||
# We can remove it once the upstream issue is fixed.
|
||||
if len(markup) > 0 and markup[0] == "\N{BYTE ORDER MARK}":
|
||||
markup = markup[1:]
|
||||
yield markup, None, document_declared_encoding, False
|
||||
|
||||
if isinstance(markup, str):
|
||||
# No, apparently not. Convert the Unicode to UTF-8 and
|
||||
# tell lxml to parse it as UTF-8.
|
||||
yield (markup.encode("utf8"), "utf8", document_declared_encoding, False)
|
||||
|
||||
# Since the document was Unicode in the first place, there
|
||||
# is no need to try any more strategies; we know this will
|
||||
# work.
|
||||
return
|
||||
|
||||
known_definite_encodings: List[_Encoding] = []
|
||||
if user_specified_encoding:
|
||||
# This was provided by the end-user; treat it as a known
|
||||
# definite encoding per the algorithm laid out in the
|
||||
# HTML5 spec. (See the EncodingDetector class for
|
||||
# details.)
|
||||
known_definite_encodings.append(user_specified_encoding)
|
||||
|
||||
user_encodings: List[_Encoding] = []
|
||||
if document_declared_encoding:
|
||||
# This was found in the document; treat it as a slightly
|
||||
# lower-priority user encoding.
|
||||
user_encodings.append(document_declared_encoding)
|
||||
|
||||
detector = EncodingDetector(
|
||||
markup,
|
||||
known_definite_encodings=known_definite_encodings,
|
||||
user_encodings=user_encodings,
|
||||
is_html=not self.is_xml,
|
||||
exclude_encodings=exclude_encodings,
|
||||
)
|
||||
for encoding in detector.encodings:
|
||||
yield (detector.markup, encoding, document_declared_encoding, False)
|
||||
|
||||
def feed(self, markup: _RawMarkup) -> None:
|
||||
io: Union[BytesIO, StringIO]
|
||||
if isinstance(markup, bytes):
|
||||
io = BytesIO(markup)
|
||||
elif isinstance(markup, str):
|
||||
io = StringIO(markup)
|
||||
|
||||
# initialize_soup is called before feed, so we know this
|
||||
# is not None.
|
||||
assert self.soup is not None
|
||||
|
||||
# Call feed() at least once, even if the markup is empty,
|
||||
# or the parser won't be initialized.
|
||||
data = io.read(self.CHUNK_SIZE)
|
||||
try:
|
||||
self.parser = self.parser_for(self.soup.original_encoding)
|
||||
self.parser.feed(data)
|
||||
while len(data) != 0:
|
||||
# Now call feed() on the rest of the data, chunk by chunk.
|
||||
data = io.read(self.CHUNK_SIZE)
|
||||
if len(data) != 0:
|
||||
self.parser.feed(data)
|
||||
self.parser.close()
|
||||
except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
|
||||
raise ParserRejectedMarkup(e)
|
||||
|
||||
def close(self) -> None:
|
||||
self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
|
||||
|
||||
def start(
|
||||
self,
|
||||
tag: str | bytes,
|
||||
attrib: Dict[str | bytes, str | bytes],
|
||||
nsmap: _NamespaceMapping = {},
|
||||
) -> None:
|
||||
# This is called by lxml code as a result of calling
|
||||
# BeautifulSoup.feed(), and we know self.soup is set by the time feed()
|
||||
# is called.
|
||||
assert self.soup is not None
|
||||
assert isinstance(tag, str)
|
||||
|
||||
# We need to recreate the attribute dict for three
|
||||
# reasons. First, for type checking, so we can assert there
|
||||
# are no bytestrings in the keys or values. Second, because we
|
||||
# need a mutable dict--lxml might send us an immutable
|
||||
# dictproxy. Third, so we can handle namespaced attribute
|
||||
# names by converting the keys to NamespacedAttributes.
|
||||
new_attrib: Dict[Union[str, NamespacedAttribute], str] = (
|
||||
self.attribute_dict_class()
|
||||
)
|
||||
for k, v in attrib.items():
|
||||
assert isinstance(k, str)
|
||||
assert isinstance(v, str)
|
||||
new_attrib[k] = v
|
||||
|
||||
nsprefix: Optional[_NamespacePrefix] = None
|
||||
namespace: Optional[_NamespaceURL] = None
|
||||
# Invert each namespace map as it comes in.
|
||||
if len(nsmap) == 0 and len(self.nsmaps) > 1:
|
||||
# There are no new namespaces for this tag, but
|
||||
# non-default namespaces are in play, so we need a
|
||||
# separate tag stack to know when they end.
|
||||
self.nsmaps.append(None)
|
||||
elif len(nsmap) > 0:
|
||||
# A new namespace mapping has come into play.
|
||||
|
||||
# First, Let the BeautifulSoup object know about it.
|
||||
self._register_namespaces(nsmap)
|
||||
|
||||
# Then, add it to our running list of inverted namespace
|
||||
# mappings.
|
||||
self.nsmaps.append(_invert(nsmap))
|
||||
|
||||
# The currently active namespace prefixes have
|
||||
# changed. Calculate the new mapping so it can be stored
|
||||
# with all Tag objects created while these prefixes are in
|
||||
# scope.
|
||||
current_mapping = dict(self.active_namespace_prefixes[-1])
|
||||
current_mapping.update(nsmap)
|
||||
|
||||
# We should not track un-prefixed namespaces as we can only hold one
|
||||
# and it will be recognized as the default namespace by soupsieve,
|
||||
# which may be confusing in some situations.
|
||||
if "" in current_mapping:
|
||||
del current_mapping[""]
|
||||
self.active_namespace_prefixes.append(current_mapping)
|
||||
|
||||
# Also treat the namespace mapping as a set of attributes on the
|
||||
# tag, so we can recreate it later.
|
||||
for prefix, namespace in list(nsmap.items()):
|
||||
attribute = NamespacedAttribute(
|
||||
"xmlns", prefix, "http://www.w3.org/2000/xmlns/"
|
||||
)
|
||||
new_attrib[attribute] = namespace
|
||||
|
||||
# Namespaces are in play. Find any attributes that came in
|
||||
# from lxml with namespaces attached to their names, and
|
||||
# turn then into NamespacedAttribute objects.
|
||||
final_attrib: AttributeDict = self.attribute_dict_class()
|
||||
for attr, value in list(new_attrib.items()):
|
||||
namespace, attr = self._getNsTag(attr)
|
||||
if namespace is None:
|
||||
final_attrib[attr] = value
|
||||
else:
|
||||
nsprefix = self._prefix_for_namespace(namespace)
|
||||
attr = NamespacedAttribute(nsprefix, attr, namespace)
|
||||
final_attrib[attr] = value
|
||||
|
||||
namespace, tag = self._getNsTag(tag)
|
||||
nsprefix = self._prefix_for_namespace(namespace)
|
||||
self.soup.handle_starttag(
|
||||
tag,
|
||||
namespace,
|
||||
nsprefix,
|
||||
final_attrib,
|
||||
namespaces=self.active_namespace_prefixes[-1],
|
||||
)
|
||||
|
||||
def _prefix_for_namespace(
|
||||
self, namespace: Optional[_NamespaceURL]
|
||||
) -> Optional[_NamespacePrefix]:
|
||||
"""Find the currently active prefix for the given namespace."""
|
||||
if namespace is None:
|
||||
return None
|
||||
for inverted_nsmap in reversed(self.nsmaps):
|
||||
if inverted_nsmap is not None and namespace in inverted_nsmap:
|
||||
return inverted_nsmap[namespace]
|
||||
return None
|
||||
|
||||
def end(self, tag: str | bytes) -> None:
|
||||
assert self.soup is not None
|
||||
assert isinstance(tag, str)
|
||||
self.soup.endData()
|
||||
namespace, tag = self._getNsTag(tag)
|
||||
nsprefix = None
|
||||
if namespace is not None:
|
||||
for inverted_nsmap in reversed(self.nsmaps):
|
||||
if inverted_nsmap is not None and namespace in inverted_nsmap:
|
||||
nsprefix = inverted_nsmap[namespace]
|
||||
break
|
||||
self.soup.handle_endtag(tag, nsprefix)
|
||||
if len(self.nsmaps) > 1:
|
||||
# This tag, or one of its parents, introduced a namespace
|
||||
# mapping, so pop it off the stack.
|
||||
out_of_scope_nsmap = self.nsmaps.pop()
|
||||
|
||||
if out_of_scope_nsmap is not None:
|
||||
# This tag introduced a namespace mapping which is no
|
||||
# longer in scope. Recalculate the currently active
|
||||
# namespace prefixes.
|
||||
self.active_namespace_prefixes.pop()
|
||||
|
||||
def pi(self, target: str, data: str) -> None:
|
||||
assert self.soup is not None
|
||||
self.soup.endData()
|
||||
data = target + " " + data
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(self.processing_instruction_class)
|
||||
|
||||
def data(self, data: str | bytes) -> None:
|
||||
assert self.soup is not None
|
||||
assert isinstance(data, str)
|
||||
self.soup.handle_data(data)
|
||||
|
||||
def doctype(self, name: str, pubid: str, system: str) -> None:
|
||||
assert self.soup is not None
|
||||
self.soup.endData()
|
||||
doctype_string = Doctype._string_for_name_and_ids(name, pubid, system)
|
||||
self.soup.handle_data(doctype_string)
|
||||
self.soup.endData(containerClass=Doctype)
|
||||
|
||||
def comment(self, text: str | bytes) -> None:
|
||||
"Handle comments as Comment objects."
|
||||
assert self.soup is not None
|
||||
assert isinstance(text, str)
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(text)
|
||||
self.soup.endData(Comment)
|
||||
|
||||
def test_fragment_to_document(self, fragment: str) -> str:
|
||||
"""See `TreeBuilder`."""
|
||||
return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
|
||||
|
||||
|
||||
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
|
||||
NAME: str = LXML
|
||||
ALTERNATE_NAMES: Iterable[str] = ["lxml-html"]
|
||||
|
||||
features: Iterable[str] = list(ALTERNATE_NAMES) + [NAME, HTML, FAST, PERMISSIVE]
|
||||
is_xml: bool = False
|
||||
|
||||
def default_parser(self, encoding: Optional[_Encoding]) -> _ParserOrParserClass:
|
||||
return etree.HTMLParser
|
||||
|
||||
def feed(self, markup: _RawMarkup) -> None:
|
||||
# We know self.soup is set by the time feed() is called.
|
||||
assert self.soup is not None
|
||||
encoding = self.soup.original_encoding
|
||||
try:
|
||||
self.parser = self.parser_for(encoding)
|
||||
self.parser.feed(markup)
|
||||
self.parser.close()
|
||||
except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
|
||||
raise ParserRejectedMarkup(e)
|
||||
|
||||
def test_fragment_to_document(self, fragment: str) -> str:
|
||||
"""See `TreeBuilder`."""
|
||||
return "<html><body>%s</body></html>" % fragment
|
||||
@@ -0,0 +1,339 @@
|
||||
"""Integration code for CSS selectors using `Soup Sieve <https://facelessuser.github.io/soupsieve/>`_ (pypi: ``soupsieve``).
|
||||
|
||||
Acquire a `CSS` object through the `element.Tag.css` attribute of
|
||||
the starting point of your CSS selector, or (if you want to run a
|
||||
selector against the entire document) of the `BeautifulSoup` object
|
||||
itself.
|
||||
|
||||
The main advantage of doing this instead of using ``soupsieve``
|
||||
functions is that you don't need to keep passing the `element.Tag` to be
|
||||
selected against, since the `CSS` object is permanently scoped to that
|
||||
`element.Tag`.
|
||||
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from types import ModuleType
|
||||
from typing import (
|
||||
Any,
|
||||
cast,
|
||||
Iterable,
|
||||
Iterator,
|
||||
MutableSequence,
|
||||
Optional,
|
||||
TYPE_CHECKING,
|
||||
)
|
||||
import warnings
|
||||
from bs4._typing import _NamespaceMapping
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from soupsieve import SoupSieve
|
||||
from bs4 import element
|
||||
from bs4.element import ResultSet, Tag
|
||||
|
||||
soupsieve: Optional[ModuleType]
|
||||
try:
|
||||
import soupsieve
|
||||
except ImportError:
|
||||
soupsieve = None
|
||||
warnings.warn(
|
||||
"The soupsieve package is not installed. CSS selectors cannot be used."
|
||||
)
|
||||
|
||||
|
||||
class CSS(object):
|
||||
"""A proxy object against the ``soupsieve`` library, to simplify its
|
||||
CSS selector API.
|
||||
|
||||
You don't need to instantiate this class yourself; instead, use
|
||||
`element.Tag.css`.
|
||||
|
||||
:param tag: All CSS selectors run by this object will use this as
|
||||
their starting point.
|
||||
|
||||
:param api: An optional drop-in replacement for the ``soupsieve`` module,
|
||||
intended for use in unit tests.
|
||||
"""
|
||||
|
||||
def __init__(self, tag: element.Tag, api: Optional[ModuleType] = None):
|
||||
if api is None:
|
||||
api = soupsieve
|
||||
if api is None:
|
||||
raise NotImplementedError(
|
||||
"Cannot execute CSS selectors because the soupsieve package is not installed."
|
||||
)
|
||||
self.api = api
|
||||
self.tag = tag
|
||||
|
||||
def escape(self, ident: str) -> str:
|
||||
"""Escape a CSS identifier.
|
||||
|
||||
This is a simple wrapper around `soupsieve.escape() <https://facelessuser.github.io/soupsieve/api/#soupsieveescape>`_. See the
|
||||
documentation for that function for more information.
|
||||
"""
|
||||
if soupsieve is None:
|
||||
raise NotImplementedError(
|
||||
"Cannot escape CSS identifiers because the soupsieve package is not installed."
|
||||
)
|
||||
return cast(str, self.api.escape(ident))
|
||||
|
||||
def _ns(
|
||||
self, ns: Optional[_NamespaceMapping], select: str
|
||||
) -> Optional[_NamespaceMapping]:
|
||||
"""Normalize a dictionary of namespaces."""
|
||||
if not isinstance(select, self.api.SoupSieve) and ns is None:
|
||||
# If the selector is a precompiled pattern, it already has
|
||||
# a namespace context compiled in, which cannot be
|
||||
# replaced.
|
||||
ns = self.tag._namespaces
|
||||
return ns
|
||||
|
||||
def _rs(self, results: MutableSequence[Tag]) -> ResultSet[Tag]:
|
||||
"""Normalize a list of results to a py:class:`ResultSet`.
|
||||
|
||||
A py:class:`ResultSet` is more consistent with the rest of
|
||||
Beautiful Soup's API, and :py:meth:`ResultSet.__getattr__` has
|
||||
a helpful error message if you try to treat a list of results
|
||||
as a single result (a common mistake).
|
||||
"""
|
||||
# Import here to avoid circular import
|
||||
from bs4 import ResultSet
|
||||
|
||||
return ResultSet(None, results)
|
||||
|
||||
def compile(
|
||||
self,
|
||||
select: str,
|
||||
namespaces: Optional[_NamespaceMapping] = None,
|
||||
flags: int = 0,
|
||||
**kwargs: Any,
|
||||
) -> SoupSieve:
|
||||
"""Pre-compile a selector and return the compiled object.
|
||||
|
||||
:param selector: A CSS selector.
|
||||
|
||||
:param namespaces: A dictionary mapping namespace prefixes
|
||||
used in the CSS selector to namespace URIs. By default,
|
||||
Beautiful Soup will use the prefixes it encountered while
|
||||
parsing the document.
|
||||
|
||||
:param flags: Flags to be passed into Soup Sieve's
|
||||
`soupsieve.compile() <https://facelessuser.github.io/soupsieve/api/#soupsievecompile>`_ method.
|
||||
|
||||
:param kwargs: Keyword arguments to be passed into Soup Sieve's
|
||||
`soupsieve.compile() <https://facelessuser.github.io/soupsieve/api/#soupsievecompile>`_ method.
|
||||
|
||||
:return: A precompiled selector object.
|
||||
:rtype: soupsieve.SoupSieve
|
||||
"""
|
||||
return self.api.compile(select, self._ns(namespaces, select), flags, **kwargs)
|
||||
|
||||
def select_one(
|
||||
self,
|
||||
select: str,
|
||||
namespaces: Optional[_NamespaceMapping] = None,
|
||||
flags: int = 0,
|
||||
**kwargs: Any,
|
||||
) -> element.Tag | None:
|
||||
"""Perform a CSS selection operation on the current Tag and return the
|
||||
first result, if any.
|
||||
|
||||
This uses the Soup Sieve library. For more information, see
|
||||
that library's documentation for the `soupsieve.select_one() <https://facelessuser.github.io/soupsieve/api/#soupsieveselect_one>`_ method.
|
||||
|
||||
:param selector: A CSS selector.
|
||||
|
||||
:param namespaces: A dictionary mapping namespace prefixes
|
||||
used in the CSS selector to namespace URIs. By default,
|
||||
Beautiful Soup will use the prefixes it encountered while
|
||||
parsing the document.
|
||||
|
||||
:param flags: Flags to be passed into Soup Sieve's
|
||||
`soupsieve.select_one() <https://facelessuser.github.io/soupsieve/api/#soupsieveselect_one>`_ method.
|
||||
|
||||
:param kwargs: Keyword arguments to be passed into Soup Sieve's
|
||||
`soupsieve.select_one() <https://facelessuser.github.io/soupsieve/api/#soupsieveselect_one>`_ method.
|
||||
"""
|
||||
return self.api.select_one(
|
||||
select, self.tag, self._ns(namespaces, select), flags, **kwargs
|
||||
)
|
||||
|
||||
def select(
|
||||
self,
|
||||
select: str,
|
||||
namespaces: Optional[_NamespaceMapping] = None,
|
||||
limit: int = 0,
|
||||
flags: int = 0,
|
||||
**kwargs: Any,
|
||||
) -> ResultSet[element.Tag]:
|
||||
"""Perform a CSS selection operation on the current `element.Tag`.
|
||||
|
||||
This uses the Soup Sieve library. For more information, see
|
||||
that library's documentation for the `soupsieve.select() <https://facelessuser.github.io/soupsieve/api/#soupsieveselect>`_ method.
|
||||
|
||||
:param selector: A CSS selector.
|
||||
|
||||
:param namespaces: A dictionary mapping namespace prefixes
|
||||
used in the CSS selector to namespace URIs. By default,
|
||||
Beautiful Soup will pass in the prefixes it encountered while
|
||||
parsing the document.
|
||||
|
||||
:param limit: After finding this number of results, stop looking.
|
||||
|
||||
:param flags: Flags to be passed into Soup Sieve's
|
||||
`soupsieve.select() <https://facelessuser.github.io/soupsieve/api/#soupsieveselect>`_ method.
|
||||
|
||||
:param kwargs: Keyword arguments to be passed into Soup Sieve's
|
||||
`soupsieve.select() <https://facelessuser.github.io/soupsieve/api/#soupsieveselect>`_ method.
|
||||
"""
|
||||
if limit is None:
|
||||
limit = 0
|
||||
|
||||
return self._rs(
|
||||
self.api.select(
|
||||
select, self.tag, self._ns(namespaces, select), limit, flags, **kwargs
|
||||
)
|
||||
)
|
||||
|
||||
def iselect(
|
||||
self,
|
||||
select: str,
|
||||
namespaces: Optional[_NamespaceMapping] = None,
|
||||
limit: int = 0,
|
||||
flags: int = 0,
|
||||
**kwargs: Any,
|
||||
) -> Iterator[element.Tag]:
|
||||
"""Perform a CSS selection operation on the current `element.Tag`.
|
||||
|
||||
This uses the Soup Sieve library. For more information, see
|
||||
that library's documentation for the `soupsieve.iselect()
|
||||
<https://facelessuser.github.io/soupsieve/api/#soupsieveiselect>`_
|
||||
method. It is the same as select(), but it returns a generator
|
||||
instead of a list.
|
||||
|
||||
:param selector: A string containing a CSS selector.
|
||||
|
||||
:param namespaces: A dictionary mapping namespace prefixes
|
||||
used in the CSS selector to namespace URIs. By default,
|
||||
Beautiful Soup will pass in the prefixes it encountered while
|
||||
parsing the document.
|
||||
|
||||
:param limit: After finding this number of results, stop looking.
|
||||
|
||||
:param flags: Flags to be passed into Soup Sieve's
|
||||
`soupsieve.iselect() <https://facelessuser.github.io/soupsieve/api/#soupsieveiselect>`_ method.
|
||||
|
||||
:param kwargs: Keyword arguments to be passed into Soup Sieve's
|
||||
`soupsieve.iselect() <https://facelessuser.github.io/soupsieve/api/#soupsieveiselect>`_ method.
|
||||
"""
|
||||
return self.api.iselect(
|
||||
select, self.tag, self._ns(namespaces, select), limit, flags, **kwargs
|
||||
)
|
||||
|
||||
def closest(
|
||||
self,
|
||||
select: str,
|
||||
namespaces: Optional[_NamespaceMapping] = None,
|
||||
flags: int = 0,
|
||||
**kwargs: Any,
|
||||
) -> Optional[element.Tag]:
|
||||
"""Find the `element.Tag` closest to this one that matches the given selector.
|
||||
|
||||
This uses the Soup Sieve library. For more information, see
|
||||
that library's documentation for the `soupsieve.closest()
|
||||
<https://facelessuser.github.io/soupsieve/api/#soupsieveclosest>`_
|
||||
method.
|
||||
|
||||
:param selector: A string containing a CSS selector.
|
||||
|
||||
:param namespaces: A dictionary mapping namespace prefixes
|
||||
used in the CSS selector to namespace URIs. By default,
|
||||
Beautiful Soup will pass in the prefixes it encountered while
|
||||
parsing the document.
|
||||
|
||||
:param flags: Flags to be passed into Soup Sieve's
|
||||
`soupsieve.closest() <https://facelessuser.github.io/soupsieve/api/#soupsieveclosest>`_ method.
|
||||
|
||||
:param kwargs: Keyword arguments to be passed into Soup Sieve's
|
||||
`soupsieve.closest() <https://facelessuser.github.io/soupsieve/api/#soupsieveclosest>`_ method.
|
||||
|
||||
"""
|
||||
return self.api.closest(
|
||||
select, self.tag, self._ns(namespaces, select), flags, **kwargs
|
||||
)
|
||||
|
||||
def match(
|
||||
self,
|
||||
select: str,
|
||||
namespaces: Optional[_NamespaceMapping] = None,
|
||||
flags: int = 0,
|
||||
**kwargs: Any,
|
||||
) -> bool:
|
||||
"""Check whether or not this `element.Tag` matches the given CSS selector.
|
||||
|
||||
This uses the Soup Sieve library. For more information, see
|
||||
that library's documentation for the `soupsieve.match()
|
||||
<https://facelessuser.github.io/soupsieve/api/#soupsievematch>`_
|
||||
method.
|
||||
|
||||
:param: a CSS selector.
|
||||
|
||||
:param namespaces: A dictionary mapping namespace prefixes
|
||||
used in the CSS selector to namespace URIs. By default,
|
||||
Beautiful Soup will pass in the prefixes it encountered while
|
||||
parsing the document.
|
||||
|
||||
:param flags: Flags to be passed into Soup Sieve's
|
||||
`soupsieve.match()
|
||||
<https://facelessuser.github.io/soupsieve/api/#soupsievematch>`_
|
||||
method.
|
||||
|
||||
:param kwargs: Keyword arguments to be passed into SoupSieve's
|
||||
`soupsieve.match()
|
||||
<https://facelessuser.github.io/soupsieve/api/#soupsievematch>`_
|
||||
method.
|
||||
"""
|
||||
return cast(
|
||||
bool,
|
||||
self.api.match(
|
||||
select, self.tag, self._ns(namespaces, select), flags, **kwargs
|
||||
),
|
||||
)
|
||||
|
||||
def filter(
|
||||
self,
|
||||
select: str,
|
||||
namespaces: Optional[_NamespaceMapping] = None,
|
||||
flags: int = 0,
|
||||
**kwargs: Any,
|
||||
) -> ResultSet[element.Tag]:
|
||||
"""Filter this `element.Tag`'s direct children based on the given CSS selector.
|
||||
|
||||
This uses the Soup Sieve library. It works the same way as
|
||||
passing a `element.Tag` into that library's `soupsieve.filter()
|
||||
<https://facelessuser.github.io/soupsieve/api/#soupsievefilter>`_
|
||||
method. For more information, see the documentation for
|
||||
`soupsieve.filter()
|
||||
<https://facelessuser.github.io/soupsieve/api/#soupsievefilter>`_.
|
||||
|
||||
:param namespaces: A dictionary mapping namespace prefixes
|
||||
used in the CSS selector to namespace URIs. By default,
|
||||
Beautiful Soup will pass in the prefixes it encountered while
|
||||
parsing the document.
|
||||
|
||||
:param flags: Flags to be passed into Soup Sieve's
|
||||
`soupsieve.filter()
|
||||
<https://facelessuser.github.io/soupsieve/api/#soupsievefilter>`_
|
||||
method.
|
||||
|
||||
:param kwargs: Keyword arguments to be passed into SoupSieve's
|
||||
`soupsieve.filter()
|
||||
<https://facelessuser.github.io/soupsieve/api/#soupsievefilter>`_
|
||||
method.
|
||||
"""
|
||||
return self._rs(
|
||||
self.api.filter(
|
||||
select, self.tag, self._ns(namespaces, select), flags, **kwargs
|
||||
)
|
||||
)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,268 @@
|
||||
"""Diagnostic functions, mainly for use when doing tech support."""
|
||||
|
||||
# Use of this source code is governed by the MIT license.
|
||||
__license__ = "MIT"
|
||||
|
||||
import cProfile
|
||||
from io import BytesIO
|
||||
from html.parser import HTMLParser
|
||||
import bs4
|
||||
from bs4 import BeautifulSoup, __version__
|
||||
from bs4.builder import builder_registry
|
||||
from typing import (
|
||||
Any,
|
||||
IO,
|
||||
List,
|
||||
Optional,
|
||||
Tuple,
|
||||
TYPE_CHECKING,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from bs4._typing import _IncomingMarkup
|
||||
|
||||
import pstats
|
||||
import random
|
||||
import tempfile
|
||||
import time
|
||||
import traceback
|
||||
import sys
|
||||
|
||||
|
||||
def diagnose(data: "_IncomingMarkup") -> None:
|
||||
"""Diagnostic suite for isolating common problems.
|
||||
|
||||
:param data: Some markup that needs to be explained.
|
||||
:return: None; diagnostics are printed to standard output.
|
||||
"""
|
||||
print(("Diagnostic running on Beautiful Soup %s" % __version__))
|
||||
print(("Python version %s" % sys.version))
|
||||
|
||||
basic_parsers = ["html.parser", "html5lib", "lxml"]
|
||||
for name in basic_parsers:
|
||||
for builder in builder_registry.builders:
|
||||
if name in builder.features:
|
||||
break
|
||||
else:
|
||||
basic_parsers.remove(name)
|
||||
print(
|
||||
("I noticed that %s is not installed. Installing it may help." % name)
|
||||
)
|
||||
|
||||
if "lxml" in basic_parsers:
|
||||
basic_parsers.append("lxml-xml")
|
||||
try:
|
||||
from lxml import etree # type:ignore
|
||||
|
||||
print(("Found lxml version %s" % ".".join(map(str, etree.LXML_VERSION))))
|
||||
except ImportError:
|
||||
print("lxml is not installed or couldn't be imported.")
|
||||
|
||||
if "html5lib" in basic_parsers:
|
||||
try:
|
||||
import html5lib
|
||||
|
||||
print(("Found html5lib version %s" % html5lib.__version__))
|
||||
except ImportError:
|
||||
print("html5lib is not installed or couldn't be imported.")
|
||||
|
||||
if hasattr(data, "read"):
|
||||
data = data.read()
|
||||
|
||||
for parser in basic_parsers:
|
||||
print(("Trying to parse your markup with %s" % parser))
|
||||
success = False
|
||||
try:
|
||||
soup = BeautifulSoup(data, features=parser)
|
||||
success = True
|
||||
except Exception:
|
||||
print(("%s could not parse the markup." % parser))
|
||||
traceback.print_exc()
|
||||
if success:
|
||||
print(("Here's what %s did with the markup:" % parser))
|
||||
print((soup.prettify()))
|
||||
|
||||
print(("-" * 80))
|
||||
|
||||
|
||||
def lxml_trace(data: "_IncomingMarkup", html: bool = True, **kwargs: Any) -> None:
|
||||
"""Print out the lxml events that occur during parsing.
|
||||
|
||||
This lets you see how lxml parses a document when no Beautiful
|
||||
Soup code is running. You can use this to determine whether
|
||||
an lxml-specific problem is in Beautiful Soup's lxml tree builders
|
||||
or in lxml itself.
|
||||
|
||||
:param data: Some markup.
|
||||
:param html: If True, markup will be parsed with lxml's HTML parser.
|
||||
if False, lxml's XML parser will be used.
|
||||
"""
|
||||
from lxml import etree
|
||||
|
||||
recover = kwargs.pop("recover", True)
|
||||
if isinstance(data, str):
|
||||
data = data.encode("utf8")
|
||||
if not isinstance(data, IO):
|
||||
reader = BytesIO(data)
|
||||
for event, element in etree.iterparse(reader, html=html, recover=recover, **kwargs):
|
||||
print(("%s, %4s, %s" % (event, element.tag, element.text)))
|
||||
|
||||
|
||||
class AnnouncingParser(HTMLParser):
|
||||
"""Subclass of HTMLParser that announces parse events, without doing
|
||||
anything else.
|
||||
|
||||
You can use this to get a picture of how html.parser sees a given
|
||||
document. The easiest way to do this is to call `htmlparser_trace`.
|
||||
"""
|
||||
|
||||
def _p(self, s: str) -> None:
|
||||
print(s)
|
||||
|
||||
def handle_starttag(
|
||||
self,
|
||||
name: str,
|
||||
attrs: List[Tuple[str, Optional[str]]],
|
||||
handle_empty_element: bool = True,
|
||||
) -> None:
|
||||
self._p(f"{name} {attrs} START")
|
||||
|
||||
def handle_endtag(self, name: str, check_already_closed: bool = True) -> None:
|
||||
self._p("%s END" % name)
|
||||
|
||||
def handle_data(self, data: str) -> None:
|
||||
self._p("%s DATA" % data)
|
||||
|
||||
def handle_charref(self, name: str) -> None:
|
||||
self._p("%s CHARREF" % name)
|
||||
|
||||
def handle_entityref(self, name: str) -> None:
|
||||
self._p("%s ENTITYREF" % name)
|
||||
|
||||
def handle_comment(self, data: str) -> None:
|
||||
self._p("%s COMMENT" % data)
|
||||
|
||||
def handle_decl(self, data: str) -> None:
|
||||
self._p("%s DECL" % data)
|
||||
|
||||
def unknown_decl(self, data: str) -> None:
|
||||
self._p("%s UNKNOWN-DECL" % data)
|
||||
|
||||
def handle_pi(self, data: str) -> None:
|
||||
self._p("%s PI" % data)
|
||||
|
||||
|
||||
def htmlparser_trace(data: str) -> None:
|
||||
"""Print out the HTMLParser events that occur during parsing.
|
||||
|
||||
This lets you see how HTMLParser parses a document when no
|
||||
Beautiful Soup code is running.
|
||||
|
||||
:param data: Some markup.
|
||||
"""
|
||||
parser = AnnouncingParser()
|
||||
parser.feed(data)
|
||||
|
||||
|
||||
_vowels: str = "aeiou"
|
||||
_consonants: str = "bcdfghjklmnpqrstvwxyz"
|
||||
|
||||
|
||||
def rword(length: int = 5) -> str:
|
||||
"""Generate a random word-like string.
|
||||
|
||||
:meta private:
|
||||
"""
|
||||
s = ""
|
||||
for i in range(length):
|
||||
if i % 2 == 0:
|
||||
t = _consonants
|
||||
else:
|
||||
t = _vowels
|
||||
s += random.choice(t)
|
||||
return s
|
||||
|
||||
|
||||
def rsentence(length: int = 4) -> str:
|
||||
"""Generate a random sentence-like string.
|
||||
|
||||
:meta private:
|
||||
"""
|
||||
return " ".join(rword(random.randint(4, 9)) for i in range(length))
|
||||
|
||||
|
||||
def rdoc(num_elements: int = 1000) -> str:
|
||||
"""Randomly generate an invalid HTML document.
|
||||
|
||||
:meta private:
|
||||
"""
|
||||
tag_names = ["p", "div", "span", "i", "b", "script", "table"]
|
||||
elements = []
|
||||
for i in range(num_elements):
|
||||
choice = random.randint(0, 3)
|
||||
if choice == 0:
|
||||
# New tag.
|
||||
tag_name = random.choice(tag_names)
|
||||
elements.append("<%s>" % tag_name)
|
||||
elif choice == 1:
|
||||
elements.append(rsentence(random.randint(1, 4)))
|
||||
elif choice == 2:
|
||||
# Close a tag.
|
||||
tag_name = random.choice(tag_names)
|
||||
elements.append("</%s>" % tag_name)
|
||||
return "<html>" + "\n".join(elements) + "</html>"
|
||||
|
||||
|
||||
def benchmark_parsers(num_elements: int = 100000) -> None:
|
||||
"""Very basic head-to-head performance benchmark."""
|
||||
print(("Comparative parser benchmark on Beautiful Soup %s" % __version__))
|
||||
data = rdoc(num_elements)
|
||||
print(("Generated a large invalid HTML document (%d bytes)." % len(data)))
|
||||
|
||||
for parser_name in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
|
||||
success = False
|
||||
try:
|
||||
a = time.time()
|
||||
BeautifulSoup(data, parser_name)
|
||||
b = time.time()
|
||||
success = True
|
||||
except Exception:
|
||||
print(("%s could not parse the markup." % parser_name))
|
||||
traceback.print_exc()
|
||||
if success:
|
||||
print(("BS4+%s parsed the markup in %.2fs." % (parser_name, b - a)))
|
||||
|
||||
from lxml import etree
|
||||
|
||||
a = time.time()
|
||||
etree.HTML(data)
|
||||
b = time.time()
|
||||
print(("Raw lxml parsed the markup in %.2fs." % (b - a)))
|
||||
|
||||
import html5lib
|
||||
|
||||
parser = html5lib.HTMLParser()
|
||||
a = time.time()
|
||||
parser.parse(data)
|
||||
b = time.time()
|
||||
print(("Raw html5lib parsed the markup in %.2fs." % (b - a)))
|
||||
|
||||
|
||||
def profile(num_elements: int = 100000, parser: str = "lxml") -> None:
|
||||
"""Use Python's profiler on a randomly generated document."""
|
||||
filehandle = tempfile.NamedTemporaryFile()
|
||||
filename = filehandle.name
|
||||
|
||||
data = rdoc(num_elements)
|
||||
vars = dict(bs4=bs4, data=data, parser=parser)
|
||||
cProfile.runctx("bs4.BeautifulSoup(data, parser)", vars, vars, filename)
|
||||
|
||||
stats = pstats.Stats(filename)
|
||||
# stats.strip_dirs()
|
||||
stats.sort_stats("cumulative")
|
||||
stats.print_stats("_html5lib|bs4", 50)
|
||||
|
||||
|
||||
# If this file is run as a script, standard input is diagnosed.
|
||||
if __name__ == "__main__":
|
||||
diagnose(sys.stdin.read())
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,28 @@
|
||||
"""Exceptions defined by Beautiful Soup itself."""
|
||||
|
||||
from typing import Union
|
||||
|
||||
|
||||
class StopParsing(Exception):
|
||||
"""Exception raised by a TreeBuilder if it's unable to continue parsing."""
|
||||
|
||||
|
||||
class FeatureNotFound(ValueError):
|
||||
"""Exception raised by the BeautifulSoup constructor if no parser with the
|
||||
requested features is found.
|
||||
"""
|
||||
|
||||
|
||||
class ParserRejectedMarkup(Exception):
|
||||
"""An Exception to be raised when the underlying parser simply
|
||||
refuses to parse the given markup.
|
||||
"""
|
||||
|
||||
def __init__(self, message_or_exception: Union[str, Exception]):
|
||||
"""Explain why the parser rejected the given markup, either
|
||||
with a textual explanation or another exception.
|
||||
"""
|
||||
if isinstance(message_or_exception, Exception):
|
||||
e = message_or_exception
|
||||
message_or_exception = "%s: %s" % (e.__class__.__name__, str(e))
|
||||
super(ParserRejectedMarkup, self).__init__(message_or_exception)
|
||||
@@ -0,0 +1,764 @@
|
||||
from __future__ import annotations
|
||||
from collections import defaultdict
|
||||
import re
|
||||
from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
cast,
|
||||
Dict,
|
||||
Iterator,
|
||||
Iterable,
|
||||
List,
|
||||
Optional,
|
||||
Sequence,
|
||||
Type,
|
||||
Union,
|
||||
)
|
||||
import warnings
|
||||
|
||||
from bs4._deprecation import _deprecated
|
||||
from bs4.element import (
|
||||
AttributeDict,
|
||||
NavigableString,
|
||||
PageElement,
|
||||
ResultSet,
|
||||
Tag,
|
||||
)
|
||||
from bs4._typing import (
|
||||
_AtMostOneElement,
|
||||
_AttributeValue,
|
||||
_NullableStringMatchFunction,
|
||||
_OneElement,
|
||||
_PageElementMatchFunction,
|
||||
_QueryResults,
|
||||
_RawAttributeValues,
|
||||
_RegularExpressionProtocol,
|
||||
_StrainableAttribute,
|
||||
_StrainableElement,
|
||||
_StrainableString,
|
||||
_StringMatchFunction,
|
||||
_TagMatchFunction,
|
||||
)
|
||||
|
||||
|
||||
class ElementFilter(object):
|
||||
"""`ElementFilter` encapsulates the logic necessary to decide:
|
||||
|
||||
1. whether a `PageElement` (a `Tag` or a `NavigableString`) matches a
|
||||
user-specified query.
|
||||
|
||||
2. whether a given sequence of markup found during initial parsing
|
||||
should be turned into a `PageElement` at all, or simply discarded.
|
||||
|
||||
The base class is the simplest `ElementFilter`. By default, it
|
||||
matches everything and allows all markup to become `PageElement`
|
||||
objects. You can make it more selective by passing in a
|
||||
user-defined match function, or defining a subclass.
|
||||
|
||||
Most users of Beautiful Soup will never need to use
|
||||
`ElementFilter`, or its more capable subclass
|
||||
`SoupStrainer`. Instead, they will use methods like
|
||||
:py:meth:`Tag.find`, which will convert their arguments into
|
||||
`SoupStrainer` objects and run them against the tree.
|
||||
|
||||
However, if you find yourself wanting to treat the arguments to
|
||||
Beautiful Soup's find_*() methods as first-class objects, those
|
||||
objects will be `SoupStrainer` objects. You can create them
|
||||
yourself and then make use of functions like
|
||||
`ElementFilter.filter()`.
|
||||
"""
|
||||
|
||||
match_function: Optional[_PageElementMatchFunction]
|
||||
|
||||
def __init__(self, match_function: Optional[_PageElementMatchFunction] = None):
|
||||
"""Pass in a match function to easily customize the behavior of
|
||||
`ElementFilter.match` without needing to subclass.
|
||||
|
||||
:param match_function: A function that takes a `PageElement`
|
||||
and returns `True` if that `PageElement` matches some criteria.
|
||||
"""
|
||||
self.match_function = match_function
|
||||
|
||||
@property
|
||||
def includes_everything(self) -> bool:
|
||||
"""Does this `ElementFilter` obviously include everything? If so,
|
||||
the filter process can be made much faster.
|
||||
|
||||
The `ElementFilter` might turn out to include everything even
|
||||
if this returns `False`, but it won't include everything in an
|
||||
obvious way.
|
||||
|
||||
The base `ElementFilter` implementation includes things based on
|
||||
the match function, so includes_everything is only true if
|
||||
there is no match function.
|
||||
"""
|
||||
return not self.match_function
|
||||
|
||||
@property
|
||||
def excludes_everything(self) -> bool:
|
||||
"""Does this `ElementFilter` obviously exclude everything? If
|
||||
so, Beautiful Soup will issue a warning if you try to use it
|
||||
when parsing a document.
|
||||
|
||||
The `ElementFilter` might turn out to exclude everything even
|
||||
if this returns `False`, but it won't exclude everything in an
|
||||
obvious way.
|
||||
|
||||
The base `ElementFilter` implementation excludes things based
|
||||
on a match function we can't inspect, so excludes_everything
|
||||
is always false.
|
||||
"""
|
||||
return False
|
||||
|
||||
def match(self, element: PageElement, _known_rules:bool=False) -> bool:
|
||||
"""Does the given PageElement match the rules set down by this
|
||||
ElementFilter?
|
||||
|
||||
The base implementation delegates to the function passed in to
|
||||
the constructor.
|
||||
|
||||
:param _known_rules: Defined for compatibility with
|
||||
SoupStrainer._match(). Used more for consistency than because
|
||||
we need the performance optimization.
|
||||
"""
|
||||
if not _known_rules and self.includes_everything:
|
||||
return True
|
||||
if not self.match_function:
|
||||
return True
|
||||
return self.match_function(element)
|
||||
|
||||
def filter(self, generator: Iterator[PageElement]) -> Iterator[_OneElement]:
|
||||
"""The most generic search method offered by Beautiful Soup.
|
||||
|
||||
Acts like Python's built-in `filter`, using
|
||||
`ElementFilter.match` as the filtering function.
|
||||
"""
|
||||
# If there are no rules at all, don't bother filtering. Let
|
||||
# anything through.
|
||||
if self.includes_everything:
|
||||
yield from generator
|
||||
while True:
|
||||
try:
|
||||
i = next(generator)
|
||||
except StopIteration:
|
||||
break
|
||||
if i:
|
||||
if self.match(i, _known_rules=True):
|
||||
yield cast("_OneElement", i)
|
||||
|
||||
def find(self, generator: Iterator[PageElement]) -> _AtMostOneElement:
|
||||
"""A lower-level equivalent of :py:meth:`Tag.find`.
|
||||
|
||||
You can pass in your own generator for iterating over
|
||||
`PageElement` objects. The first one that matches this
|
||||
`ElementFilter` will be returned.
|
||||
|
||||
:param generator: A way of iterating over `PageElement`
|
||||
objects.
|
||||
"""
|
||||
for match in self.filter(generator):
|
||||
return match
|
||||
return None
|
||||
|
||||
def find_all(
|
||||
self, generator: Iterator[PageElement], limit: Optional[int] = None
|
||||
) -> _QueryResults:
|
||||
"""A lower-level equivalent of :py:meth:`Tag.find_all`.
|
||||
|
||||
You can pass in your own generator for iterating over
|
||||
`PageElement` objects. Only elements that match this
|
||||
`ElementFilter` will be returned in the :py:class:`ResultSet`.
|
||||
|
||||
:param generator: A way of iterating over `PageElement`
|
||||
objects.
|
||||
|
||||
:param limit: Stop looking after finding this many results.
|
||||
"""
|
||||
results = []
|
||||
for match in self.filter(generator):
|
||||
results.append(match)
|
||||
if limit is not None and len(results) >= limit:
|
||||
break
|
||||
return ResultSet(self, results)
|
||||
|
||||
def allow_tag_creation(
|
||||
self, nsprefix: Optional[str], name: str, attrs: Optional[_RawAttributeValues]
|
||||
) -> bool:
|
||||
"""Based on the name and attributes of a tag, see whether this
|
||||
`ElementFilter` will allow a `Tag` object to even be created.
|
||||
|
||||
By default, all tags are parsed. To change this, subclass
|
||||
`ElementFilter`.
|
||||
|
||||
:param name: The name of the prospective tag.
|
||||
:param attrs: The attributes of the prospective tag.
|
||||
"""
|
||||
return True
|
||||
|
||||
def allow_string_creation(self, string: str) -> bool:
|
||||
"""Based on the content of a string, see whether this
|
||||
`ElementFilter` will allow a `NavigableString` object based on
|
||||
this string to be added to the parse tree.
|
||||
|
||||
By default, all strings are processed into `NavigableString`
|
||||
objects. To change this, subclass `ElementFilter`.
|
||||
|
||||
:param str: The string under consideration.
|
||||
"""
|
||||
return True
|
||||
|
||||
|
||||
class MatchRule(object):
|
||||
"""Each MatchRule encapsulates the logic behind a single argument
|
||||
passed in to one of the Beautiful Soup find* methods.
|
||||
"""
|
||||
|
||||
string: Optional[str]
|
||||
pattern: Optional[_RegularExpressionProtocol]
|
||||
present: Optional[bool]
|
||||
exclude_everything: Optional[bool]
|
||||
# TODO-TYPING: All MatchRule objects also have an attribute
|
||||
# ``function``, but the type of the function depends on the
|
||||
# subclass.
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
string: Optional[Union[str, bytes]] = None,
|
||||
pattern: Optional[_RegularExpressionProtocol] = None,
|
||||
function: Optional[Callable] = None,
|
||||
present: Optional[bool] = None,
|
||||
exclude_everything: Optional[bool] = None
|
||||
):
|
||||
if isinstance(string, bytes):
|
||||
string = string.decode("utf8")
|
||||
self.string = string
|
||||
if isinstance(pattern, bytes):
|
||||
self.pattern = re.compile(pattern.decode("utf8"))
|
||||
elif isinstance(pattern, str):
|
||||
self.pattern = re.compile(pattern)
|
||||
else:
|
||||
self.pattern = pattern
|
||||
self.function = function
|
||||
self.present = present
|
||||
self.exclude_everything = exclude_everything
|
||||
|
||||
values = [
|
||||
x
|
||||
for x in (self.string, self.pattern, self.function, self.present, self.exclude_everything)
|
||||
if x is not None
|
||||
]
|
||||
if len(values) == 0:
|
||||
raise ValueError(
|
||||
"Either string, pattern, function, present, or exclude_everything must be provided."
|
||||
)
|
||||
if len(values) > 1:
|
||||
raise ValueError(
|
||||
"At most one of string, pattern, function, present, and exclude_everything must be provided."
|
||||
)
|
||||
|
||||
def _base_match(self, string: Optional[str]) -> Optional[bool]:
|
||||
"""Run the 'cheap' portion of a match, trying to get an answer without
|
||||
calling a potentially expensive custom function.
|
||||
|
||||
:return: True or False if we have a (positive or negative)
|
||||
match; None if we need to keep trying.
|
||||
"""
|
||||
# self.exclude_everything matches nothing.
|
||||
if self.exclude_everything:
|
||||
return False
|
||||
|
||||
# self.present==True matches everything except None.
|
||||
if self.present is True:
|
||||
return string is not None
|
||||
|
||||
# self.present==False matches _only_ None.
|
||||
if self.present is False:
|
||||
return string is None
|
||||
|
||||
# self.string does an exact string match.
|
||||
if self.string is not None:
|
||||
# print(f"{self.string} ?= {string}")
|
||||
return self.string == string
|
||||
|
||||
# self.pattern does a regular expression search.
|
||||
if self.pattern is not None:
|
||||
# print(f"{self.pattern} ?~ {string}")
|
||||
if string is None:
|
||||
return False
|
||||
return self.pattern.search(string) is not None
|
||||
|
||||
return None
|
||||
|
||||
def matches_string(self, string: Optional[str]) -> bool:
|
||||
_base_result = self._base_match(string)
|
||||
if _base_result is not None:
|
||||
# No need to invoke the test function.
|
||||
return _base_result
|
||||
if self.function is not None and not self.function(string):
|
||||
# print(f"{self.function}({string}) == False")
|
||||
return False
|
||||
return True
|
||||
|
||||
def __repr__(self) -> str:
|
||||
cls = type(self).__name__
|
||||
return f"<{cls} string={self.string} pattern={self.pattern} function={self.function} present={self.present}>"
|
||||
|
||||
def __eq__(self, other: Any) -> bool:
|
||||
return (
|
||||
isinstance(other, MatchRule)
|
||||
and self.string == other.string
|
||||
and self.pattern == other.pattern
|
||||
and self.function == other.function
|
||||
and self.present == other.present
|
||||
)
|
||||
|
||||
|
||||
class TagNameMatchRule(MatchRule):
|
||||
"""A MatchRule implementing the rules for matches against tag name."""
|
||||
|
||||
function: Optional[_TagMatchFunction]
|
||||
|
||||
def matches_tag(self, tag: Tag) -> bool:
|
||||
base_value = self._base_match(tag.name)
|
||||
if base_value is not None:
|
||||
return base_value
|
||||
|
||||
# The only remaining possibility is that the match is determined
|
||||
# by a function call. Call the function.
|
||||
function = cast(_TagMatchFunction, self.function)
|
||||
if function(tag):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
class AttributeValueMatchRule(MatchRule):
|
||||
"""A MatchRule implementing the rules for matches against attribute value."""
|
||||
|
||||
function: Optional[_NullableStringMatchFunction]
|
||||
|
||||
|
||||
class StringMatchRule(MatchRule):
|
||||
"""A MatchRule implementing the rules for matches against a NavigableString."""
|
||||
|
||||
function: Optional[_StringMatchFunction]
|
||||
|
||||
|
||||
class SoupStrainer(ElementFilter):
|
||||
"""The `ElementFilter` subclass used internally by Beautiful Soup.
|
||||
|
||||
A `SoupStrainer` encapsulates the logic necessary to perform the
|
||||
kind of matches supported by methods such as
|
||||
:py:meth:`Tag.find`. `SoupStrainer` objects are primarily created
|
||||
internally, but you can create one yourself and pass it in as
|
||||
``parse_only`` to the `BeautifulSoup` constructor, to parse a
|
||||
subset of a large document.
|
||||
|
||||
Internally, `SoupStrainer` objects work by converting the
|
||||
constructor arguments into `MatchRule` objects. Incoming
|
||||
tags/markup are matched against those rules.
|
||||
|
||||
:param name: One or more restrictions on the tags found in a document.
|
||||
|
||||
:param attrs: A dictionary that maps attribute names to
|
||||
restrictions on tags that use those attributes.
|
||||
|
||||
:param string: One or more restrictions on the strings found in a
|
||||
document.
|
||||
|
||||
:param kwargs: A dictionary that maps attribute names to restrictions
|
||||
on tags that use those attributes. These restrictions are additive to
|
||||
any specified in ``attrs``.
|
||||
|
||||
"""
|
||||
|
||||
name_rules: List[TagNameMatchRule]
|
||||
attribute_rules: Dict[str, List[AttributeValueMatchRule]]
|
||||
string_rules: List[StringMatchRule]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
name: Optional[_StrainableElement] = None,
|
||||
attrs: Optional[Dict[str, _StrainableAttribute]] = None,
|
||||
string: Optional[_StrainableString] = None,
|
||||
**kwargs: _StrainableAttribute,
|
||||
):
|
||||
if string is None and "text" in kwargs:
|
||||
string = cast(Optional[_StrainableString], kwargs.pop("text"))
|
||||
warnings.warn(
|
||||
"As of version 4.11.0, the 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
if name is None and not attrs and not string and not kwargs:
|
||||
# Special case for backwards compatibility. Instantiating
|
||||
# a SoupStrainer with no arguments whatsoever gets you one
|
||||
# that matches all Tags, and only Tags.
|
||||
self.name_rules = [TagNameMatchRule(present=True)]
|
||||
else:
|
||||
self.name_rules = cast(
|
||||
List[TagNameMatchRule], list(self._make_match_rules(name, TagNameMatchRule))
|
||||
)
|
||||
self.attribute_rules = defaultdict(list)
|
||||
|
||||
if attrs is None:
|
||||
attrs = {}
|
||||
if not isinstance(attrs, dict):
|
||||
# Passing something other than a dictionary as attrs is
|
||||
# sugar for matching that thing against the 'class'
|
||||
# attribute.
|
||||
attrs = {"class": attrs}
|
||||
|
||||
for attrdict in attrs, kwargs:
|
||||
for attr, value in attrdict.items():
|
||||
if attr == "class_" and attrdict is kwargs:
|
||||
# If you pass in 'class_' as part of kwargs, it's
|
||||
# because class is a Python reserved word. If you
|
||||
# pass it in as part of the attrs dict, it's
|
||||
# because you really are looking for an attribute
|
||||
# called 'class_'.
|
||||
attr = "class"
|
||||
|
||||
if value is None:
|
||||
value = False
|
||||
for rule_obj in self._make_match_rules(value, AttributeValueMatchRule):
|
||||
self.attribute_rules[attr].append(
|
||||
cast(AttributeValueMatchRule, rule_obj)
|
||||
)
|
||||
|
||||
self.string_rules = cast(
|
||||
List[StringMatchRule], list(self._make_match_rules(string, StringMatchRule))
|
||||
)
|
||||
|
||||
#: DEPRECATED 4.13.0: You shouldn't need to check this under
|
||||
#: any name (.string or .text), and if you do, you're probably
|
||||
#: not taking into account all of the types of values this
|
||||
#: variable might have. Look at the .string_rules list instead.
|
||||
self.__string = string
|
||||
|
||||
@property
|
||||
def includes_everything(self) -> bool:
|
||||
"""Check whether the provided rules will obviously include
|
||||
everything. (They might include everything even if this returns `False`,
|
||||
but not in an obvious way.)
|
||||
"""
|
||||
return not self.name_rules and not self.string_rules and not self.attribute_rules
|
||||
|
||||
@property
|
||||
def excludes_everything(self) -> bool:
|
||||
"""Check whether the provided rules will obviously exclude
|
||||
everything. (They might exclude everything even if this returns `False`,
|
||||
but not in an obvious way.)
|
||||
"""
|
||||
if (self.string_rules and (self.name_rules or self.attribute_rules)):
|
||||
# This is self-contradictory, so the rules exclude everything.
|
||||
return True
|
||||
|
||||
# If there's a rule that ended up treated as an "exclude everything"
|
||||
# rule due to creating a logical inconsistency, then the rules
|
||||
# exclude everything.
|
||||
if any(x.exclude_everything for x in self.string_rules):
|
||||
return True
|
||||
if any(x.exclude_everything for x in self.name_rules):
|
||||
return True
|
||||
for ruleset in self.attribute_rules.values():
|
||||
if any(x.exclude_everything for x in ruleset):
|
||||
return True
|
||||
return False
|
||||
|
||||
@property
|
||||
def string(self) -> Optional[_StrainableString]:
|
||||
":meta private:"
|
||||
warnings.warn(
|
||||
"Access to deprecated property string. (Look at .string_rules instead) -- Deprecated since version 4.13.0.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
return self.__string
|
||||
|
||||
@property
|
||||
def text(self) -> Optional[_StrainableString]:
|
||||
":meta private:"
|
||||
warnings.warn(
|
||||
"Access to deprecated property text. (Look at .string_rules instead) -- Deprecated since version 4.13.0.",
|
||||
DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
return self.__string
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"<{self.__class__.__name__} name={self.name_rules} attrs={self.attribute_rules} string={self.string_rules}>"
|
||||
|
||||
@classmethod
|
||||
def _make_match_rules(
|
||||
cls,
|
||||
obj: Optional[Union[_StrainableElement, _StrainableAttribute]],
|
||||
rule_class: Type[MatchRule],
|
||||
) -> Iterator[MatchRule]:
|
||||
"""Convert a vaguely-specific 'object' into one or more well-defined
|
||||
`MatchRule` objects.
|
||||
|
||||
:param obj: Some kind of object that corresponds to one or more
|
||||
matching rules.
|
||||
:param rule_class: Create instances of this `MatchRule` subclass.
|
||||
"""
|
||||
if obj is None:
|
||||
return
|
||||
if isinstance(obj, (str, bytes)):
|
||||
yield rule_class(string=obj)
|
||||
elif isinstance(obj, bool):
|
||||
yield rule_class(present=obj)
|
||||
elif callable(obj):
|
||||
yield rule_class(function=obj)
|
||||
elif isinstance(obj, _RegularExpressionProtocol):
|
||||
yield rule_class(pattern=obj)
|
||||
elif hasattr(obj, "__iter__"):
|
||||
if not obj:
|
||||
# The attribute is being matched against the null set,
|
||||
# which means it should exclude everything.
|
||||
yield rule_class(exclude_everything=True)
|
||||
for o in obj:
|
||||
if not isinstance(o, (bytes, str)) and hasattr(o, "__iter__"):
|
||||
# This is almost certainly the user's
|
||||
# mistake. This list contains another list, which
|
||||
# opens up the possibility of infinite
|
||||
# self-reference. In the interests of avoiding
|
||||
# infinite recursion, we'll treat this as an
|
||||
# impossible match and issue a rule that excludes
|
||||
# everything, rather than looking inside.
|
||||
warnings.warn(
|
||||
f"Ignoring nested list {o} to avoid the possibility of infinite recursion.",
|
||||
stacklevel=5,
|
||||
)
|
||||
yield rule_class(exclude_everything=True)
|
||||
continue
|
||||
for x in cls._make_match_rules(o, rule_class):
|
||||
yield x
|
||||
else:
|
||||
yield rule_class(string=str(obj))
|
||||
|
||||
def matches_tag(self, tag: Tag) -> bool:
|
||||
"""Do the rules of this `SoupStrainer` trigger a match against the
|
||||
given `Tag`?
|
||||
|
||||
If the `SoupStrainer` has any `TagNameMatchRule`, at least one
|
||||
must match the `Tag` or its `Tag.name`.
|
||||
|
||||
If there are any `AttributeValueMatchRule` for a given
|
||||
attribute, at least one of them must match the attribute
|
||||
value.
|
||||
|
||||
If there are any `StringMatchRule`, at least one must match,
|
||||
but a `SoupStrainer` that *only* contains `StringMatchRule`
|
||||
cannot match a `Tag`, only a `NavigableString`.
|
||||
"""
|
||||
# If there are no rules at all, let anything through.
|
||||
#if self.includes_everything:
|
||||
# return True
|
||||
|
||||
# String rules cannot not match a Tag on their own.
|
||||
if not self.name_rules and not self.attribute_rules:
|
||||
return False
|
||||
|
||||
# Optimization for a very common case where the user is
|
||||
# searching for a tag with one specific name, and we're
|
||||
# looking at a tag with a different name.
|
||||
if (
|
||||
not tag.prefix
|
||||
and len(self.name_rules) == 1
|
||||
and self.name_rules[0].string is not None
|
||||
and tag.name != self.name_rules[0].string
|
||||
):
|
||||
return False
|
||||
|
||||
# If there are name rules, at least one must match. It can
|
||||
# match either the Tag object itself or the prefixed name of
|
||||
# the tag.
|
||||
prefixed_name = None
|
||||
if tag.prefix:
|
||||
prefixed_name = f"{tag.prefix}:{tag.name}"
|
||||
if self.name_rules:
|
||||
name_matches = False
|
||||
for rule in self.name_rules:
|
||||
# attrs = " ".join(
|
||||
# [f"{k}={v}" for k, v in sorted(tag.attrs.items())]
|
||||
# )
|
||||
# print(f"Testing <{tag.name} {attrs}>{tag.string}</{tag.name}> against {rule}")
|
||||
|
||||
# If the rule contains a function, the function will be called
|
||||
# with `tag`. It will not be called a second time with
|
||||
# `prefixed_name`.
|
||||
if rule.matches_tag(tag) or (
|
||||
not rule.function and prefixed_name is not None and rule.matches_string(prefixed_name)
|
||||
):
|
||||
name_matches = True
|
||||
break
|
||||
|
||||
if not name_matches:
|
||||
return False
|
||||
|
||||
# If there are attribute rules for a given attribute, at least
|
||||
# one of them must match. If there are rules for multiple
|
||||
# attributes, each attribute must have at least one match.
|
||||
for attr, rules in self.attribute_rules.items():
|
||||
attr_value = tag.get(attr, None)
|
||||
this_attr_match = self._attribute_match(attr_value, rules)
|
||||
if not this_attr_match:
|
||||
return False
|
||||
|
||||
# If there are string rules, at least one must match.
|
||||
if self.string_rules:
|
||||
_str = tag.string
|
||||
if _str is None:
|
||||
return False
|
||||
if not self.matches_any_string_rule(_str):
|
||||
return False
|
||||
return True
|
||||
|
||||
def _attribute_match(
|
||||
self,
|
||||
attr_value: Optional[_AttributeValue],
|
||||
rules: Iterable[AttributeValueMatchRule],
|
||||
) -> bool:
|
||||
attr_values: Sequence[Optional[str]]
|
||||
if isinstance(attr_value, list):
|
||||
attr_values = attr_value
|
||||
else:
|
||||
attr_values = [cast(str, attr_value)]
|
||||
|
||||
def _match_attribute_value_helper(attr_values: Sequence[Optional[str]]) -> bool:
|
||||
for rule in rules:
|
||||
for attr_value in attr_values:
|
||||
if rule.matches_string(attr_value):
|
||||
return True
|
||||
return False
|
||||
|
||||
this_attr_match = _match_attribute_value_helper(attr_values)
|
||||
if not this_attr_match and len(attr_values) != 1:
|
||||
# Try again but treat the attribute value as a single
|
||||
# string instead of a list. The result can only be
|
||||
# different if the list of values contains more or less
|
||||
# than one item.
|
||||
|
||||
# This cast converts Optional[str] to plain str.
|
||||
#
|
||||
# We know there can't be any None in the list. Beautiful
|
||||
# Soup never uses None as a value of a multi-valued
|
||||
# attribute, and if None is passed in as attr_value, it's
|
||||
# turned into a list with 1 element, which was excluded by
|
||||
# the if statement above.
|
||||
attr_values = cast(Sequence[str], attr_values)
|
||||
|
||||
joined_attr_value = " ".join(attr_values)
|
||||
this_attr_match = _match_attribute_value_helper([joined_attr_value])
|
||||
return this_attr_match
|
||||
|
||||
def allow_tag_creation(
|
||||
self, nsprefix: Optional[str], name: str, attrs: Optional[_RawAttributeValues]
|
||||
) -> bool:
|
||||
"""Based on the name and attributes of a tag, see whether this
|
||||
`SoupStrainer` will allow a `Tag` object to even be created.
|
||||
|
||||
:param name: The name of the prospective tag.
|
||||
:param attrs: The attributes of the prospective tag.
|
||||
"""
|
||||
if self.string_rules:
|
||||
# A SoupStrainer that has string rules can't be used to
|
||||
# manage tag creation, because the string rule can't be
|
||||
# evaluated until after the tag and all of its contents
|
||||
# have been parsed.
|
||||
return False
|
||||
prefixed_name = None
|
||||
if nsprefix:
|
||||
prefixed_name = f"{nsprefix}:{name}"
|
||||
if self.name_rules:
|
||||
# At least one name rule must match.
|
||||
name_match = False
|
||||
for rule in self.name_rules:
|
||||
for x in name, prefixed_name:
|
||||
if x is not None:
|
||||
if rule.matches_string(x):
|
||||
name_match = True
|
||||
break
|
||||
if not name_match:
|
||||
return False
|
||||
|
||||
# For each attribute that has rules, at least one rule must
|
||||
# match.
|
||||
if attrs is None:
|
||||
attrs = AttributeDict()
|
||||
for attr, rules in self.attribute_rules.items():
|
||||
attr_value = attrs.get(attr)
|
||||
if not self._attribute_match(attr_value, rules):
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
def allow_string_creation(self, string: str) -> bool:
|
||||
"""Based on the content of a markup string, see whether this
|
||||
`SoupStrainer` will allow it to be instantiated as a
|
||||
`NavigableString` object, or whether it should be ignored.
|
||||
"""
|
||||
if self.name_rules or self.attribute_rules:
|
||||
# A SoupStrainer that has name or attribute rules won't
|
||||
# match any strings; it's designed to match tags with
|
||||
# certain properties.
|
||||
return False
|
||||
if not self.string_rules:
|
||||
# A SoupStrainer with no string rules will match
|
||||
# all strings.
|
||||
return True
|
||||
if not self.matches_any_string_rule(string):
|
||||
return False
|
||||
return True
|
||||
|
||||
def matches_any_string_rule(self, string: str) -> bool:
|
||||
"""See whether the content of a string matches any of
|
||||
this `SoupStrainer`'s string rules.
|
||||
"""
|
||||
if not self.string_rules:
|
||||
return True
|
||||
for string_rule in self.string_rules:
|
||||
if string_rule.matches_string(string):
|
||||
return True
|
||||
return False
|
||||
|
||||
def match(self, element: PageElement, _known_rules: bool=False) -> bool:
|
||||
"""Does the given `PageElement` match the rules set down by this
|
||||
`SoupStrainer`?
|
||||
|
||||
The find_* methods rely heavily on this method to find matches.
|
||||
|
||||
:param element: A `PageElement`.
|
||||
:param _known_rules: Set to true in the common case where
|
||||
we already checked and found at least one rule in this SoupStrainer
|
||||
that might exclude a PageElement. Without this, we need
|
||||
to check .includes_everything every time, just to be safe.
|
||||
:return: `True` if the element matches this `SoupStrainer`'s rules; `False` otherwise.
|
||||
"""
|
||||
# If there are no rules at all, let anything through.
|
||||
if not _known_rules and self.includes_everything:
|
||||
return True
|
||||
if isinstance(element, Tag):
|
||||
return self.matches_tag(element)
|
||||
assert isinstance(element, NavigableString)
|
||||
if not (self.name_rules or self.attribute_rules):
|
||||
# A NavigableString can only match a SoupStrainer that
|
||||
# does not define any name or attribute rules.
|
||||
# Then it comes down to the string rules.
|
||||
return self.matches_any_string_rule(element)
|
||||
return False
|
||||
|
||||
@_deprecated("allow_tag_creation", "4.13.0")
|
||||
def search_tag(self, name: str, attrs: Optional[_RawAttributeValues]) -> bool:
|
||||
"""A less elegant version of `allow_tag_creation`. Deprecated as of 4.13.0"""
|
||||
":meta private:"
|
||||
return self.allow_tag_creation(None, name, attrs)
|
||||
|
||||
@_deprecated("match", "4.13.0")
|
||||
def search(self, element: PageElement) -> Optional[PageElement]:
|
||||
"""A less elegant version of match(). Deprecated as of 4.13.0.
|
||||
|
||||
:meta private:
|
||||
"""
|
||||
return element if self.match(element) else None
|
||||
@@ -0,0 +1,276 @@
|
||||
from __future__ import annotations
|
||||
from typing import Callable, Dict, Iterable, Optional, Set, Tuple, TYPE_CHECKING, Union
|
||||
from typing_extensions import TypeAlias
|
||||
from bs4.dammit import EntitySubstitution
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from bs4._typing import _AttributeValue
|
||||
|
||||
|
||||
class Formatter(EntitySubstitution):
|
||||
"""Describes a strategy to use when outputting a parse tree to a string.
|
||||
|
||||
Some parts of this strategy come from the distinction between
|
||||
HTML4, HTML5, and XML. Others are configurable by the user.
|
||||
|
||||
Formatters are passed in as the `formatter` argument to methods
|
||||
like `bs4.element.Tag.encode`. Most people won't need to
|
||||
think about formatters, and most people who need to think about
|
||||
them can pass in one of these predefined strings as `formatter`
|
||||
rather than making a new Formatter object:
|
||||
|
||||
For HTML documents:
|
||||
* 'html' - HTML entity substitution for generic HTML documents. (default)
|
||||
* 'html5' - HTML entity substitution for HTML5 documents, as
|
||||
well as some optimizations in the way tags are rendered.
|
||||
* 'html5-4.12.0' - The version of the 'html5' formatter used prior to
|
||||
Beautiful Soup 4.13.0.
|
||||
* 'minimal' - Only make the substitutions necessary to guarantee
|
||||
valid HTML.
|
||||
* None - Do not perform any substitution. This will be faster
|
||||
but may result in invalid markup.
|
||||
|
||||
For XML documents:
|
||||
* 'html' - Entity substitution for XHTML documents.
|
||||
* 'minimal' - Only make the substitutions necessary to guarantee
|
||||
valid XML. (default)
|
||||
* None - Do not perform any substitution. This will be faster
|
||||
but may result in invalid markup.
|
||||
|
||||
"""
|
||||
|
||||
#: Constant name denoting HTML markup
|
||||
HTML: str = "html"
|
||||
|
||||
#: Constant name denoting XML markup
|
||||
XML: str = "xml"
|
||||
|
||||
#: Default values for the various constructor options when the
|
||||
#: markup language is HTML.
|
||||
HTML_DEFAULTS: Dict[str, Set[str]] = dict(
|
||||
cdata_containing_tags=set(["script", "style"]),
|
||||
)
|
||||
|
||||
language: Optional[str] #: :meta private:
|
||||
entity_substitution: Optional[_EntitySubstitutionFunction] #: :meta private:
|
||||
void_element_close_prefix: str #: :meta private:
|
||||
cdata_containing_tags: Set[str] #: :meta private:
|
||||
indent: str #: :meta private:
|
||||
|
||||
#: If this is set to true by the constructor, then attributes whose
|
||||
#: values are sent to the empty string will be treated as HTML
|
||||
#: boolean attributes. (Attributes whose value is None are always
|
||||
#: rendered this way.)
|
||||
empty_attributes_are_booleans: bool
|
||||
|
||||
def _default(
|
||||
self, language: str, value: Optional[Set[str]], kwarg: str
|
||||
) -> Set[str]:
|
||||
if value is not None:
|
||||
return value
|
||||
if language == self.XML:
|
||||
# When XML is the markup language in use, all of the
|
||||
# defaults are the empty list.
|
||||
return set()
|
||||
|
||||
# Otherwise, it depends on what's in HTML_DEFAULTS.
|
||||
return self.HTML_DEFAULTS[kwarg]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
language: Optional[str] = None,
|
||||
entity_substitution: Optional[_EntitySubstitutionFunction] = None,
|
||||
void_element_close_prefix: str = "/",
|
||||
cdata_containing_tags: Optional[Set[str]] = None,
|
||||
empty_attributes_are_booleans: bool = False,
|
||||
indent: Union[int,str] = 1,
|
||||
):
|
||||
r"""Constructor.
|
||||
|
||||
:param language: This should be `Formatter.XML` if you are formatting
|
||||
XML markup and `Formatter.HTML` if you are formatting HTML markup.
|
||||
|
||||
:param entity_substitution: A function to call to replace special
|
||||
characters with XML/HTML entities. For examples, see
|
||||
bs4.dammit.EntitySubstitution.substitute_html and substitute_xml.
|
||||
:param void_element_close_prefix: By default, void elements
|
||||
are represented as <tag/> (XML rules) rather than <tag>
|
||||
(HTML rules). To get <tag>, pass in the empty string.
|
||||
:param cdata_containing_tags: The set of tags that are defined
|
||||
as containing CDATA in this dialect. For example, in HTML,
|
||||
<script> and <style> tags are defined as containing CDATA,
|
||||
and their contents should not be formatted.
|
||||
:param empty_attributes_are_booleans: If this is set to true,
|
||||
then attributes whose values are sent to the empty string
|
||||
will be treated as `HTML boolean
|
||||
attributes<https://dev.w3.org/html5/spec-LC/common-microsyntaxes.html#boolean-attributes>`_. (Attributes
|
||||
whose value is None are always rendered this way.)
|
||||
:param indent: If indent is a non-negative integer or string,
|
||||
then the contents of elements will be indented
|
||||
appropriately when pretty-printing. An indent level of 0,
|
||||
negative, or "" will only insert newlines. Using a
|
||||
positive integer indent indents that many spaces per
|
||||
level. If indent is a string (such as "\t"), that string
|
||||
is used to indent each level. The default behavior is to
|
||||
indent one space per level.
|
||||
|
||||
"""
|
||||
self.language = language or self.HTML
|
||||
self.entity_substitution = entity_substitution
|
||||
self.void_element_close_prefix = void_element_close_prefix
|
||||
self.cdata_containing_tags = self._default(
|
||||
self.language, cdata_containing_tags, "cdata_containing_tags"
|
||||
)
|
||||
self.empty_attributes_are_booleans = empty_attributes_are_booleans
|
||||
if indent is None:
|
||||
indent = 0
|
||||
indent_str: str
|
||||
if isinstance(indent, int):
|
||||
if indent < 0:
|
||||
indent = 0
|
||||
indent_str = " " * indent
|
||||
elif isinstance(indent, str):
|
||||
indent_str = indent
|
||||
else:
|
||||
indent_str = " "
|
||||
self.indent = indent_str
|
||||
|
||||
def substitute(self, ns: str) -> str:
|
||||
"""Process a string that needs to undergo entity substitution.
|
||||
This may be a string encountered in an attribute value or as
|
||||
text.
|
||||
|
||||
:param ns: A string.
|
||||
:return: The same string but with certain characters replaced by named
|
||||
or numeric entities.
|
||||
"""
|
||||
if not self.entity_substitution:
|
||||
return ns
|
||||
from .element import NavigableString
|
||||
|
||||
if (
|
||||
isinstance(ns, NavigableString)
|
||||
and ns.parent is not None
|
||||
and ns.parent.name in self.cdata_containing_tags
|
||||
):
|
||||
# Do nothing.
|
||||
return ns
|
||||
# Substitute.
|
||||
return self.entity_substitution(ns)
|
||||
|
||||
def attribute_value(self, value: str) -> str:
|
||||
"""Process the value of an attribute.
|
||||
|
||||
:param ns: A string.
|
||||
:return: A string with certain characters replaced by named
|
||||
or numeric entities.
|
||||
"""
|
||||
return self.substitute(value)
|
||||
|
||||
def attributes(
|
||||
self, tag: bs4.element.Tag # type:ignore
|
||||
) -> Iterable[Tuple[str, Optional[_AttributeValue]]]:
|
||||
"""Reorder a tag's attributes however you want.
|
||||
|
||||
By default, attributes are sorted alphabetically. This makes
|
||||
behavior consistent between Python 2 and Python 3, and preserves
|
||||
backwards compatibility with older versions of Beautiful Soup.
|
||||
|
||||
If `empty_attributes_are_booleans` is True, then
|
||||
attributes whose values are set to the empty string will be
|
||||
treated as boolean attributes.
|
||||
"""
|
||||
if tag.attrs is None:
|
||||
return []
|
||||
|
||||
items: Iterable[Tuple[str, _AttributeValue]] = list(tag.attrs.items())
|
||||
return sorted(
|
||||
(k, (None if self.empty_attributes_are_booleans and v == "" else v))
|
||||
for k, v in items
|
||||
)
|
||||
|
||||
|
||||
class HTMLFormatter(Formatter):
|
||||
"""A generic Formatter for HTML."""
|
||||
|
||||
REGISTRY: Dict[Optional[str], HTMLFormatter] = {}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
entity_substitution: Optional[_EntitySubstitutionFunction] = None,
|
||||
void_element_close_prefix: str = "/",
|
||||
cdata_containing_tags: Optional[Set[str]] = None,
|
||||
empty_attributes_are_booleans: bool = False,
|
||||
indent: Union[int,str] = 1,
|
||||
):
|
||||
super(HTMLFormatter, self).__init__(
|
||||
self.HTML,
|
||||
entity_substitution,
|
||||
void_element_close_prefix,
|
||||
cdata_containing_tags,
|
||||
empty_attributes_are_booleans,
|
||||
indent=indent
|
||||
)
|
||||
|
||||
|
||||
class XMLFormatter(Formatter):
|
||||
"""A generic Formatter for XML."""
|
||||
|
||||
REGISTRY: Dict[Optional[str], XMLFormatter] = {}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
entity_substitution: Optional[_EntitySubstitutionFunction] = None,
|
||||
void_element_close_prefix: str = "/",
|
||||
cdata_containing_tags: Optional[Set[str]] = None,
|
||||
empty_attributes_are_booleans: bool = False,
|
||||
indent: Union[int,str] = 1,
|
||||
):
|
||||
super(XMLFormatter, self).__init__(
|
||||
self.XML,
|
||||
entity_substitution,
|
||||
void_element_close_prefix,
|
||||
cdata_containing_tags,
|
||||
empty_attributes_are_booleans,
|
||||
indent=indent,
|
||||
)
|
||||
|
||||
|
||||
# Set up aliases for the default formatters.
|
||||
HTMLFormatter.REGISTRY["html"] = HTMLFormatter(
|
||||
entity_substitution=EntitySubstitution.substitute_html
|
||||
)
|
||||
|
||||
HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
|
||||
entity_substitution=EntitySubstitution.substitute_html5,
|
||||
void_element_close_prefix="",
|
||||
empty_attributes_are_booleans=True,
|
||||
)
|
||||
HTMLFormatter.REGISTRY["html5-4.12"] = HTMLFormatter(
|
||||
entity_substitution=EntitySubstitution.substitute_html,
|
||||
void_element_close_prefix="",
|
||||
empty_attributes_are_booleans=True,
|
||||
)
|
||||
HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
|
||||
entity_substitution=EntitySubstitution.substitute_xml
|
||||
)
|
||||
HTMLFormatter.REGISTRY[None] = HTMLFormatter(entity_substitution=None)
|
||||
XMLFormatter.REGISTRY["html"] = XMLFormatter(
|
||||
entity_substitution=EntitySubstitution.substitute_html
|
||||
)
|
||||
XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
|
||||
entity_substitution=EntitySubstitution.substitute_xml
|
||||
)
|
||||
|
||||
XMLFormatter.REGISTRY[None] = XMLFormatter(entity_substitution=None)
|
||||
|
||||
# Define type aliases to improve readability.
|
||||
#
|
||||
|
||||
#: A function to call to replace special characters with XML or HTML
|
||||
#: entities.
|
||||
_EntitySubstitutionFunction: TypeAlias = Callable[[str], str]
|
||||
|
||||
# Many of the output-centered methods take an argument that can either
|
||||
# be a Formatter object or the name of a Formatter to be looked up.
|
||||
_FormatterOrName = Union[Formatter, str]
|
||||
+1
@@ -0,0 +1 @@
|
||||
pip
|
||||
+78
@@ -0,0 +1,78 @@
|
||||
Metadata-Version: 2.4
|
||||
Name: certifi
|
||||
Version: 2026.1.4
|
||||
Summary: Python package for providing Mozilla's CA Bundle.
|
||||
Home-page: https://github.com/certifi/python-certifi
|
||||
Author: Kenneth Reitz
|
||||
Author-email: me@kennethreitz.com
|
||||
License: MPL-2.0
|
||||
Project-URL: Source, https://github.com/certifi/python-certifi
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)
|
||||
Classifier: Natural Language :: English
|
||||
Classifier: Programming Language :: Python
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3 :: Only
|
||||
Classifier: Programming Language :: Python :: 3.7
|
||||
Classifier: Programming Language :: Python :: 3.8
|
||||
Classifier: Programming Language :: Python :: 3.9
|
||||
Classifier: Programming Language :: Python :: 3.10
|
||||
Classifier: Programming Language :: Python :: 3.11
|
||||
Classifier: Programming Language :: Python :: 3.12
|
||||
Classifier: Programming Language :: Python :: 3.13
|
||||
Classifier: Programming Language :: Python :: 3.14
|
||||
Requires-Python: >=3.7
|
||||
License-File: LICENSE
|
||||
Dynamic: author
|
||||
Dynamic: author-email
|
||||
Dynamic: classifier
|
||||
Dynamic: description
|
||||
Dynamic: home-page
|
||||
Dynamic: license
|
||||
Dynamic: license-file
|
||||
Dynamic: project-url
|
||||
Dynamic: requires-python
|
||||
Dynamic: summary
|
||||
|
||||
Certifi: Python SSL Certificates
|
||||
================================
|
||||
|
||||
Certifi provides Mozilla's carefully curated collection of Root Certificates for
|
||||
validating the trustworthiness of SSL certificates while verifying the identity
|
||||
of TLS hosts. It has been extracted from the `Requests`_ project.
|
||||
|
||||
Installation
|
||||
------------
|
||||
|
||||
``certifi`` is available on PyPI. Simply install it with ``pip``::
|
||||
|
||||
$ pip install certifi
|
||||
|
||||
Usage
|
||||
-----
|
||||
|
||||
To reference the installed certificate authority (CA) bundle, you can use the
|
||||
built-in function::
|
||||
|
||||
>>> import certifi
|
||||
|
||||
>>> certifi.where()
|
||||
'/usr/local/lib/python3.7/site-packages/certifi/cacert.pem'
|
||||
|
||||
Or from the command line::
|
||||
|
||||
$ python -m certifi
|
||||
/usr/local/lib/python3.7/site-packages/certifi/cacert.pem
|
||||
|
||||
Enjoy!
|
||||
|
||||
.. _`Requests`: https://requests.readthedocs.io/en/master/
|
||||
|
||||
Addition/Removal of Certificates
|
||||
--------------------------------
|
||||
|
||||
Certifi does not support any addition/removal or other modification of the
|
||||
CA trust store content. This project is intended to provide a reliable and
|
||||
highly portable root of trust to python deployments. Look to upstream projects
|
||||
for methods to use alternate trust.
|
||||
+14
@@ -0,0 +1,14 @@
|
||||
certifi-2026.1.4.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||
certifi-2026.1.4.dist-info/METADATA,sha256=FSfJEfKuMo6bJlofUrtRpn4PFTYtbYyXpHN_A3ZFpIY,2473
|
||||
certifi-2026.1.4.dist-info/RECORD,,
|
||||
certifi-2026.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
||||
certifi-2026.1.4.dist-info/licenses/LICENSE,sha256=6TcW2mucDVpKHfYP5pWzcPBpVgPSH2-D8FPkLPwQyvc,989
|
||||
certifi-2026.1.4.dist-info/top_level.txt,sha256=KMu4vUCfsjLrkPbSNdgdekS-pVJzBAJFO__nI8NF6-U,8
|
||||
certifi/__init__.py,sha256=969deMMS7Uchipr0oO4dbRBUvRi0uNYCn07VmG1aTrg,94
|
||||
certifi/__main__.py,sha256=xBBoj905TUWBLRGANOcf7oi6e-3dMP4cEoG9OyMs11g,243
|
||||
certifi/__pycache__/__init__.cpython-311.pyc,,
|
||||
certifi/__pycache__/__main__.cpython-311.pyc,,
|
||||
certifi/__pycache__/core.cpython-311.pyc,,
|
||||
certifi/cacert.pem,sha256=Tzl1_zCrvzVEO0hgZK6Ly0Hf9wf_31dsdtKS-0WKoKk,270954
|
||||
certifi/core.py,sha256=XFXycndG5pf37ayeF8N32HUuDafsyhkVMbO4BAPWHa0,3394
|
||||
certifi/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
+5
@@ -0,0 +1,5 @@
|
||||
Wheel-Version: 1.0
|
||||
Generator: setuptools (80.9.0)
|
||||
Root-Is-Purelib: true
|
||||
Tag: py3-none-any
|
||||
|
||||
+20
@@ -0,0 +1,20 @@
|
||||
This package contains a modified version of ca-bundle.crt:
|
||||
|
||||
ca-bundle.crt -- Bundle of CA Root Certificates
|
||||
|
||||
This is a bundle of X.509 certificates of public Certificate Authorities
|
||||
(CA). These were automatically extracted from Mozilla's root certificates
|
||||
file (certdata.txt). This file can be found in the mozilla source tree:
|
||||
https://hg.mozilla.org/mozilla-central/file/tip/security/nss/lib/ckfw/builtins/certdata.txt
|
||||
It contains the certificates in PEM format and therefore
|
||||
can be directly used with curl / libcurl / php_curl, or with
|
||||
an Apache+mod_ssl webserver for SSL client authentication.
|
||||
Just configure this file as the SSLCACertificateFile.#
|
||||
|
||||
***** BEGIN LICENSE BLOCK *****
|
||||
This Source Code Form is subject to the terms of the Mozilla Public License,
|
||||
v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain
|
||||
one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
***** END LICENSE BLOCK *****
|
||||
@(#) $RCSfile: certdata.txt,v $ $Revision: 1.80 $ $Date: 2011/11/03 15:11:58 $
|
||||
+1
@@ -0,0 +1 @@
|
||||
certifi
|
||||
@@ -0,0 +1,4 @@
|
||||
from .core import contents, where
|
||||
|
||||
__all__ = ["contents", "where"]
|
||||
__version__ = "2026.01.04"
|
||||
@@ -0,0 +1,12 @@
|
||||
import argparse
|
||||
|
||||
from certifi import contents, where
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("-c", "--contents", action="store_true")
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.contents:
|
||||
print(contents())
|
||||
else:
|
||||
print(where())
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,83 @@
|
||||
"""
|
||||
certifi.py
|
||||
~~~~~~~~~~
|
||||
|
||||
This module returns the installation location of cacert.pem or its contents.
|
||||
"""
|
||||
import sys
|
||||
import atexit
|
||||
|
||||
def exit_cacert_ctx() -> None:
|
||||
_CACERT_CTX.__exit__(None, None, None) # type: ignore[union-attr]
|
||||
|
||||
|
||||
if sys.version_info >= (3, 11):
|
||||
|
||||
from importlib.resources import as_file, files
|
||||
|
||||
_CACERT_CTX = None
|
||||
_CACERT_PATH = None
|
||||
|
||||
def where() -> str:
|
||||
# This is slightly terrible, but we want to delay extracting the file
|
||||
# in cases where we're inside of a zipimport situation until someone
|
||||
# actually calls where(), but we don't want to re-extract the file
|
||||
# on every call of where(), so we'll do it once then store it in a
|
||||
# global variable.
|
||||
global _CACERT_CTX
|
||||
global _CACERT_PATH
|
||||
if _CACERT_PATH is None:
|
||||
# This is slightly janky, the importlib.resources API wants you to
|
||||
# manage the cleanup of this file, so it doesn't actually return a
|
||||
# path, it returns a context manager that will give you the path
|
||||
# when you enter it and will do any cleanup when you leave it. In
|
||||
# the common case of not needing a temporary file, it will just
|
||||
# return the file system location and the __exit__() is a no-op.
|
||||
#
|
||||
# We also have to hold onto the actual context manager, because
|
||||
# it will do the cleanup whenever it gets garbage collected, so
|
||||
# we will also store that at the global level as well.
|
||||
_CACERT_CTX = as_file(files("certifi").joinpath("cacert.pem"))
|
||||
_CACERT_PATH = str(_CACERT_CTX.__enter__())
|
||||
atexit.register(exit_cacert_ctx)
|
||||
|
||||
return _CACERT_PATH
|
||||
|
||||
def contents() -> str:
|
||||
return files("certifi").joinpath("cacert.pem").read_text(encoding="ascii")
|
||||
|
||||
else:
|
||||
|
||||
from importlib.resources import path as get_path, read_text
|
||||
|
||||
_CACERT_CTX = None
|
||||
_CACERT_PATH = None
|
||||
|
||||
def where() -> str:
|
||||
# This is slightly terrible, but we want to delay extracting the
|
||||
# file in cases where we're inside of a zipimport situation until
|
||||
# someone actually calls where(), but we don't want to re-extract
|
||||
# the file on every call of where(), so we'll do it once then store
|
||||
# it in a global variable.
|
||||
global _CACERT_CTX
|
||||
global _CACERT_PATH
|
||||
if _CACERT_PATH is None:
|
||||
# This is slightly janky, the importlib.resources API wants you
|
||||
# to manage the cleanup of this file, so it doesn't actually
|
||||
# return a path, it returns a context manager that will give
|
||||
# you the path when you enter it and will do any cleanup when
|
||||
# you leave it. In the common case of not needing a temporary
|
||||
# file, it will just return the file system location and the
|
||||
# __exit__() is a no-op.
|
||||
#
|
||||
# We also have to hold onto the actual context manager, because
|
||||
# it will do the cleanup whenever it gets garbage collected, so
|
||||
# we will also store that at the global level as well.
|
||||
_CACERT_CTX = get_path("certifi", "cacert.pem")
|
||||
_CACERT_PATH = str(_CACERT_CTX.__enter__())
|
||||
atexit.register(exit_cacert_ctx)
|
||||
|
||||
return _CACERT_PATH
|
||||
|
||||
def contents() -> str:
|
||||
return read_text("certifi", "cacert.pem", encoding="ascii")
|
||||
@@ -0,0 +1 @@
|
||||
pip
|
||||
@@ -0,0 +1,68 @@
|
||||
Metadata-Version: 2.4
|
||||
Name: cffi
|
||||
Version: 2.0.0
|
||||
Summary: Foreign Function Interface for Python calling C code.
|
||||
Author: Armin Rigo, Maciej Fijalkowski
|
||||
Maintainer: Matt Davis, Matt Clay, Matti Picus
|
||||
License-Expression: MIT
|
||||
Project-URL: Documentation, https://cffi.readthedocs.io/
|
||||
Project-URL: Changelog, https://cffi.readthedocs.io/en/latest/whatsnew.html
|
||||
Project-URL: Downloads, https://github.com/python-cffi/cffi/releases
|
||||
Project-URL: Contact, https://groups.google.com/forum/#!forum/python-cffi
|
||||
Project-URL: Source Code, https://github.com/python-cffi/cffi
|
||||
Project-URL: Issue Tracker, https://github.com/python-cffi/cffi/issues
|
||||
Classifier: Programming Language :: Python
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3.9
|
||||
Classifier: Programming Language :: Python :: 3.10
|
||||
Classifier: Programming Language :: Python :: 3.11
|
||||
Classifier: Programming Language :: Python :: 3.12
|
||||
Classifier: Programming Language :: Python :: 3.13
|
||||
Classifier: Programming Language :: Python :: 3.14
|
||||
Classifier: Programming Language :: Python :: Free Threading :: 2 - Beta
|
||||
Classifier: Programming Language :: Python :: Implementation :: CPython
|
||||
Requires-Python: >=3.9
|
||||
Description-Content-Type: text/markdown
|
||||
License-File: LICENSE
|
||||
License-File: AUTHORS
|
||||
Requires-Dist: pycparser; implementation_name != "PyPy"
|
||||
Dynamic: license-file
|
||||
|
||||
[](https://github.com/python-cffi/cffi/actions/workflows/ci.yaml?query=branch%3Amain++)
|
||||
[](https://pypi.org/project/cffi)
|
||||
[][Documentation]
|
||||
|
||||
|
||||
CFFI
|
||||
====
|
||||
|
||||
Foreign Function Interface for Python calling C code.
|
||||
|
||||
Please see the [Documentation] or uncompiled in the `doc/` subdirectory.
|
||||
|
||||
Download
|
||||
--------
|
||||
|
||||
[Download page](https://github.com/python-cffi/cffi/releases)
|
||||
|
||||
Source Code
|
||||
-----------
|
||||
|
||||
Source code is publicly available on
|
||||
[GitHub](https://github.com/python-cffi/cffi).
|
||||
|
||||
Contact
|
||||
-------
|
||||
|
||||
[Mailing list](https://groups.google.com/forum/#!forum/python-cffi)
|
||||
|
||||
Testing/development tips
|
||||
------------------------
|
||||
|
||||
After `git clone` or `wget && tar`, we will get a directory called `cffi` or `cffi-x.x.x`. we call it `repo-directory`. To run tests under CPython, run the following in the `repo-directory`:
|
||||
|
||||
pip install pytest
|
||||
pip install -e . # editable install of CFFI for local development
|
||||
pytest src/c/ testing/
|
||||
|
||||
[Documentation]: http://cffi.readthedocs.org/
|
||||
@@ -0,0 +1,49 @@
|
||||
_cffi_backend.cpython-311-x86_64-linux-gnu.so,sha256=gSEHv07vIWL26K3_tiw2e_rJzTGs31NW2-53K0J2X2c,332440
|
||||
cffi-2.0.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||
cffi-2.0.0.dist-info/METADATA,sha256=uYzn40F68Im8EtXHNBLZs7FoPM-OxzyYbDWsjJvhujk,2559
|
||||
cffi-2.0.0.dist-info/RECORD,,
|
||||
cffi-2.0.0.dist-info/WHEEL,sha256=_CFvICYDmZlAYHt8L7Zn3n-BGLj8dkZLQPp22Piy5JE,151
|
||||
cffi-2.0.0.dist-info/entry_points.txt,sha256=y6jTxnyeuLnL-XJcDv8uML3n6wyYiGRg8MTp_QGJ9Ho,75
|
||||
cffi-2.0.0.dist-info/licenses/AUTHORS,sha256=KmemC7-zN1nWfWRf8TG45ta8TK_CMtdR_Kw-2k0xTMg,208
|
||||
cffi-2.0.0.dist-info/licenses/LICENSE,sha256=W6JN3FcGf5JJrdZEw6_EGl1tw34jQz73Wdld83Cwr2M,1123
|
||||
cffi-2.0.0.dist-info/top_level.txt,sha256=rE7WR3rZfNKxWI9-jn6hsHCAl7MDkB-FmuQbxWjFehQ,19
|
||||
cffi/__init__.py,sha256=-ksBQ7MfDzVvbBlV_ftYBWAmEqfA86ljIzMxzaZeAlI,511
|
||||
cffi/__pycache__/__init__.cpython-311.pyc,,
|
||||
cffi/__pycache__/_imp_emulation.cpython-311.pyc,,
|
||||
cffi/__pycache__/_shimmed_dist_utils.cpython-311.pyc,,
|
||||
cffi/__pycache__/api.cpython-311.pyc,,
|
||||
cffi/__pycache__/backend_ctypes.cpython-311.pyc,,
|
||||
cffi/__pycache__/cffi_opcode.cpython-311.pyc,,
|
||||
cffi/__pycache__/commontypes.cpython-311.pyc,,
|
||||
cffi/__pycache__/cparser.cpython-311.pyc,,
|
||||
cffi/__pycache__/error.cpython-311.pyc,,
|
||||
cffi/__pycache__/ffiplatform.cpython-311.pyc,,
|
||||
cffi/__pycache__/lock.cpython-311.pyc,,
|
||||
cffi/__pycache__/model.cpython-311.pyc,,
|
||||
cffi/__pycache__/pkgconfig.cpython-311.pyc,,
|
||||
cffi/__pycache__/recompiler.cpython-311.pyc,,
|
||||
cffi/__pycache__/setuptools_ext.cpython-311.pyc,,
|
||||
cffi/__pycache__/vengine_cpy.cpython-311.pyc,,
|
||||
cffi/__pycache__/vengine_gen.cpython-311.pyc,,
|
||||
cffi/__pycache__/verifier.cpython-311.pyc,,
|
||||
cffi/_cffi_errors.h,sha256=zQXt7uR_m8gUW-fI2hJg0KoSkJFwXv8RGUkEDZ177dQ,3908
|
||||
cffi/_cffi_include.h,sha256=Exhmgm9qzHWzWivjfTe0D7Xp4rPUkVxdNuwGhMTMzbw,15055
|
||||
cffi/_embedding.h,sha256=Ai33FHblE7XSpHOCp8kPcWwN5_9BV14OvN0JVa6ITpw,18786
|
||||
cffi/_imp_emulation.py,sha256=RxREG8zAbI2RPGBww90u_5fi8sWdahpdipOoPzkp7C0,2960
|
||||
cffi/_shimmed_dist_utils.py,sha256=Bjj2wm8yZbvFvWEx5AEfmqaqZyZFhYfoyLLQHkXZuao,2230
|
||||
cffi/api.py,sha256=alBv6hZQkjpmZplBphdaRn2lPO9-CORs_M7ixabvZWI,42169
|
||||
cffi/backend_ctypes.py,sha256=h5ZIzLc6BFVXnGyc9xPqZWUS7qGy7yFSDqXe68Sa8z4,42454
|
||||
cffi/cffi_opcode.py,sha256=JDV5l0R0_OadBX_uE7xPPTYtMdmpp8I9UYd6av7aiDU,5731
|
||||
cffi/commontypes.py,sha256=7N6zPtCFlvxXMWhHV08psUjdYIK2XgsN3yo5dgua_v4,2805
|
||||
cffi/cparser.py,sha256=QUTfmlL-aO-MYR8bFGlvAUHc36OQr7XYLe0WLkGFjRo,44790
|
||||
cffi/error.py,sha256=v6xTiS4U0kvDcy4h_BDRo5v39ZQuj-IMRYLv5ETddZs,877
|
||||
cffi/ffiplatform.py,sha256=avxFjdikYGJoEtmJO7ewVmwG_VEVl6EZ_WaNhZYCqv4,3584
|
||||
cffi/lock.py,sha256=l9TTdwMIMpi6jDkJGnQgE9cvTIR7CAntIJr8EGHt3pY,747
|
||||
cffi/model.py,sha256=W30UFQZE73jL5Mx5N81YT77us2W2iJjTm0XYfnwz1cg,21797
|
||||
cffi/parse_c_type.h,sha256=OdwQfwM9ktq6vlCB43exFQmxDBtj2MBNdK8LYl15tjw,5976
|
||||
cffi/pkgconfig.py,sha256=LP1w7vmWvmKwyqLaU1Z243FOWGNQMrgMUZrvgFuOlco,4374
|
||||
cffi/recompiler.py,sha256=78J6lMEEOygXNmjN9-fOFFO3j7eW-iFxSrxfvQb54bY,65509
|
||||
cffi/setuptools_ext.py,sha256=0rCwBJ1W7FHWtiMKfNXsSST88V8UXrui5oeXFlDNLG8,9411
|
||||
cffi/vengine_cpy.py,sha256=oyQKD23kpE0aChUKA8Jg0e723foPiYzLYEdb-J0MiNs,43881
|
||||
cffi/vengine_gen.py,sha256=DUlEIrDiVin1Pnhn1sfoamnS5NLqfJcOdhRoeSNeJRg,26939
|
||||
cffi/verifier.py,sha256=oX8jpaohg2Qm3aHcznidAdvrVm5N4sQYG0a3Eo5mIl4,11182
|
||||
@@ -0,0 +1,6 @@
|
||||
Wheel-Version: 1.0
|
||||
Generator: setuptools (80.9.0)
|
||||
Root-Is-Purelib: false
|
||||
Tag: cp311-cp311-manylinux_2_17_x86_64
|
||||
Tag: cp311-cp311-manylinux2014_x86_64
|
||||
|
||||
+2
@@ -0,0 +1,2 @@
|
||||
[distutils.setup_keywords]
|
||||
cffi_modules = cffi.setuptools_ext:cffi_modules
|
||||
+8
@@ -0,0 +1,8 @@
|
||||
This package has been mostly done by Armin Rigo with help from
|
||||
Maciej Fijałkowski. The idea is heavily based (although not directly
|
||||
copied) from LuaJIT ffi by Mike Pall.
|
||||
|
||||
|
||||
Other contributors:
|
||||
|
||||
Google Inc.
|
||||
+23
@@ -0,0 +1,23 @@
|
||||
|
||||
Except when otherwise stated (look for LICENSE files in directories or
|
||||
information at the beginning of each file) all software and
|
||||
documentation is licensed as follows:
|
||||
|
||||
MIT No Attribution
|
||||
|
||||
Permission is hereby granted, free of charge, to any person
|
||||
obtaining a copy of this software and associated documentation
|
||||
files (the "Software"), to deal in the Software without
|
||||
restriction, including without limitation the rights to use,
|
||||
copy, modify, merge, publish, distribute, sublicense, and/or
|
||||
sell copies of the Software, and to permit persons to whom the
|
||||
Software is furnished to do so.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
||||
OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
DEALINGS IN THE SOFTWARE.
|
||||
|
||||
+2
@@ -0,0 +1,2 @@
|
||||
_cffi_backend
|
||||
cffi
|
||||
@@ -0,0 +1,14 @@
|
||||
__all__ = ['FFI', 'VerificationError', 'VerificationMissing', 'CDefError',
|
||||
'FFIError']
|
||||
|
||||
from .api import FFI
|
||||
from .error import CDefError, FFIError, VerificationError, VerificationMissing
|
||||
from .error import PkgConfigError
|
||||
|
||||
__version__ = "2.0.0"
|
||||
__version_info__ = (2, 0, 0)
|
||||
|
||||
# The verifier module file names are based on the CRC32 of a string that
|
||||
# contains the following version number. It may be older than __version__
|
||||
# if nothing is clearly incompatible.
|
||||
__version_verifier_modules__ = "0.8.6"
|
||||
@@ -0,0 +1,149 @@
|
||||
#ifndef CFFI_MESSAGEBOX
|
||||
# ifdef _MSC_VER
|
||||
# define CFFI_MESSAGEBOX 1
|
||||
# else
|
||||
# define CFFI_MESSAGEBOX 0
|
||||
# endif
|
||||
#endif
|
||||
|
||||
|
||||
#if CFFI_MESSAGEBOX
|
||||
/* Windows only: logic to take the Python-CFFI embedding logic
|
||||
initialization errors and display them in a background thread
|
||||
with MessageBox. The idea is that if the whole program closes
|
||||
as a result of this problem, then likely it is already a console
|
||||
program and you can read the stderr output in the console too.
|
||||
If it is not a console program, then it will likely show its own
|
||||
dialog to complain, or generally not abruptly close, and for this
|
||||
case the background thread should stay alive.
|
||||
*/
|
||||
static void *volatile _cffi_bootstrap_text;
|
||||
|
||||
static PyObject *_cffi_start_error_capture(void)
|
||||
{
|
||||
PyObject *result = NULL;
|
||||
PyObject *x, *m, *bi;
|
||||
|
||||
if (InterlockedCompareExchangePointer(&_cffi_bootstrap_text,
|
||||
(void *)1, NULL) != NULL)
|
||||
return (PyObject *)1;
|
||||
|
||||
m = PyImport_AddModule("_cffi_error_capture");
|
||||
if (m == NULL)
|
||||
goto error;
|
||||
|
||||
result = PyModule_GetDict(m);
|
||||
if (result == NULL)
|
||||
goto error;
|
||||
|
||||
#if PY_MAJOR_VERSION >= 3
|
||||
bi = PyImport_ImportModule("builtins");
|
||||
#else
|
||||
bi = PyImport_ImportModule("__builtin__");
|
||||
#endif
|
||||
if (bi == NULL)
|
||||
goto error;
|
||||
PyDict_SetItemString(result, "__builtins__", bi);
|
||||
Py_DECREF(bi);
|
||||
|
||||
x = PyRun_String(
|
||||
"import sys\n"
|
||||
"class FileLike:\n"
|
||||
" def write(self, x):\n"
|
||||
" try:\n"
|
||||
" of.write(x)\n"
|
||||
" except: pass\n"
|
||||
" self.buf += x\n"
|
||||
" def flush(self):\n"
|
||||
" pass\n"
|
||||
"fl = FileLike()\n"
|
||||
"fl.buf = ''\n"
|
||||
"of = sys.stderr\n"
|
||||
"sys.stderr = fl\n"
|
||||
"def done():\n"
|
||||
" sys.stderr = of\n"
|
||||
" return fl.buf\n", /* make sure the returned value stays alive */
|
||||
Py_file_input,
|
||||
result, result);
|
||||
Py_XDECREF(x);
|
||||
|
||||
error:
|
||||
if (PyErr_Occurred())
|
||||
{
|
||||
PyErr_WriteUnraisable(Py_None);
|
||||
PyErr_Clear();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
#pragma comment(lib, "user32.lib")
|
||||
|
||||
static DWORD WINAPI _cffi_bootstrap_dialog(LPVOID ignored)
|
||||
{
|
||||
Sleep(666); /* may be interrupted if the whole process is closing */
|
||||
#if PY_MAJOR_VERSION >= 3
|
||||
MessageBoxW(NULL, (wchar_t *)_cffi_bootstrap_text,
|
||||
L"Python-CFFI error",
|
||||
MB_OK | MB_ICONERROR);
|
||||
#else
|
||||
MessageBoxA(NULL, (char *)_cffi_bootstrap_text,
|
||||
"Python-CFFI error",
|
||||
MB_OK | MB_ICONERROR);
|
||||
#endif
|
||||
_cffi_bootstrap_text = NULL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void _cffi_stop_error_capture(PyObject *ecap)
|
||||
{
|
||||
PyObject *s;
|
||||
void *text;
|
||||
|
||||
if (ecap == (PyObject *)1)
|
||||
return;
|
||||
|
||||
if (ecap == NULL)
|
||||
goto error;
|
||||
|
||||
s = PyRun_String("done()", Py_eval_input, ecap, ecap);
|
||||
if (s == NULL)
|
||||
goto error;
|
||||
|
||||
/* Show a dialog box, but in a background thread, and
|
||||
never show multiple dialog boxes at once. */
|
||||
#if PY_MAJOR_VERSION >= 3
|
||||
text = PyUnicode_AsWideCharString(s, NULL);
|
||||
#else
|
||||
text = PyString_AsString(s);
|
||||
#endif
|
||||
|
||||
_cffi_bootstrap_text = text;
|
||||
|
||||
if (text != NULL)
|
||||
{
|
||||
HANDLE h;
|
||||
h = CreateThread(NULL, 0, _cffi_bootstrap_dialog,
|
||||
NULL, 0, NULL);
|
||||
if (h != NULL)
|
||||
CloseHandle(h);
|
||||
}
|
||||
/* decref the string, but it should stay alive as 'fl.buf'
|
||||
in the small module above. It will really be freed only if
|
||||
we later get another similar error. So it's a leak of at
|
||||
most one copy of the small module. That's fine for this
|
||||
situation which is usually a "fatal error" anyway. */
|
||||
Py_DECREF(s);
|
||||
PyErr_Clear();
|
||||
return;
|
||||
|
||||
error:
|
||||
_cffi_bootstrap_text = NULL;
|
||||
PyErr_Clear();
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static PyObject *_cffi_start_error_capture(void) { return NULL; }
|
||||
static void _cffi_stop_error_capture(PyObject *ecap) { }
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,389 @@
|
||||
#define _CFFI_
|
||||
|
||||
/* We try to define Py_LIMITED_API before including Python.h.
|
||||
|
||||
Mess: we can only define it if Py_DEBUG, Py_TRACE_REFS and
|
||||
Py_REF_DEBUG are not defined. This is a best-effort approximation:
|
||||
we can learn about Py_DEBUG from pyconfig.h, but it is unclear if
|
||||
the same works for the other two macros. Py_DEBUG implies them,
|
||||
but not the other way around.
|
||||
|
||||
The implementation is messy (issue #350): on Windows, with _MSC_VER,
|
||||
we have to define Py_LIMITED_API even before including pyconfig.h.
|
||||
In that case, we guess what pyconfig.h will do to the macros above,
|
||||
and check our guess after the #include.
|
||||
|
||||
Note that on Windows, with CPython 3.x, you need >= 3.5 and virtualenv
|
||||
version >= 16.0.0. With older versions of either, you don't get a
|
||||
copy of PYTHON3.DLL in the virtualenv. We can't check the version of
|
||||
CPython *before* we even include pyconfig.h. ffi.set_source() puts
|
||||
a ``#define _CFFI_NO_LIMITED_API'' at the start of this file if it is
|
||||
running on Windows < 3.5, as an attempt at fixing it, but that's
|
||||
arguably wrong because it may not be the target version of Python.
|
||||
Still better than nothing I guess. As another workaround, you can
|
||||
remove the definition of Py_LIMITED_API here.
|
||||
|
||||
See also 'py_limited_api' in cffi/setuptools_ext.py.
|
||||
*/
|
||||
#if !defined(_CFFI_USE_EMBEDDING) && !defined(Py_LIMITED_API)
|
||||
# ifdef _MSC_VER
|
||||
# if !defined(_DEBUG) && !defined(Py_DEBUG) && !defined(Py_TRACE_REFS) && !defined(Py_REF_DEBUG) && !defined(_CFFI_NO_LIMITED_API)
|
||||
# define Py_LIMITED_API
|
||||
# endif
|
||||
# include <pyconfig.h>
|
||||
/* sanity-check: Py_LIMITED_API will cause crashes if any of these
|
||||
are also defined. Normally, the Python file PC/pyconfig.h does not
|
||||
cause any of these to be defined, with the exception that _DEBUG
|
||||
causes Py_DEBUG. Double-check that. */
|
||||
# ifdef Py_LIMITED_API
|
||||
# if defined(Py_DEBUG)
|
||||
# error "pyconfig.h unexpectedly defines Py_DEBUG, but Py_LIMITED_API is set"
|
||||
# endif
|
||||
# if defined(Py_TRACE_REFS)
|
||||
# error "pyconfig.h unexpectedly defines Py_TRACE_REFS, but Py_LIMITED_API is set"
|
||||
# endif
|
||||
# if defined(Py_REF_DEBUG)
|
||||
# error "pyconfig.h unexpectedly defines Py_REF_DEBUG, but Py_LIMITED_API is set"
|
||||
# endif
|
||||
# endif
|
||||
# else
|
||||
# include <pyconfig.h>
|
||||
# if !defined(Py_DEBUG) && !defined(Py_TRACE_REFS) && !defined(Py_REF_DEBUG) && !defined(_CFFI_NO_LIMITED_API)
|
||||
# define Py_LIMITED_API
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#include <Python.h>
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
#include <stddef.h>
|
||||
#include "parse_c_type.h"
|
||||
|
||||
/* this block of #ifs should be kept exactly identical between
|
||||
c/_cffi_backend.c, cffi/vengine_cpy.py, cffi/vengine_gen.py
|
||||
and cffi/_cffi_include.h */
|
||||
#if defined(_MSC_VER)
|
||||
# include <malloc.h> /* for alloca() */
|
||||
# if _MSC_VER < 1600 /* MSVC < 2010 */
|
||||
typedef __int8 int8_t;
|
||||
typedef __int16 int16_t;
|
||||
typedef __int32 int32_t;
|
||||
typedef __int64 int64_t;
|
||||
typedef unsigned __int8 uint8_t;
|
||||
typedef unsigned __int16 uint16_t;
|
||||
typedef unsigned __int32 uint32_t;
|
||||
typedef unsigned __int64 uint64_t;
|
||||
typedef __int8 int_least8_t;
|
||||
typedef __int16 int_least16_t;
|
||||
typedef __int32 int_least32_t;
|
||||
typedef __int64 int_least64_t;
|
||||
typedef unsigned __int8 uint_least8_t;
|
||||
typedef unsigned __int16 uint_least16_t;
|
||||
typedef unsigned __int32 uint_least32_t;
|
||||
typedef unsigned __int64 uint_least64_t;
|
||||
typedef __int8 int_fast8_t;
|
||||
typedef __int16 int_fast16_t;
|
||||
typedef __int32 int_fast32_t;
|
||||
typedef __int64 int_fast64_t;
|
||||
typedef unsigned __int8 uint_fast8_t;
|
||||
typedef unsigned __int16 uint_fast16_t;
|
||||
typedef unsigned __int32 uint_fast32_t;
|
||||
typedef unsigned __int64 uint_fast64_t;
|
||||
typedef __int64 intmax_t;
|
||||
typedef unsigned __int64 uintmax_t;
|
||||
# else
|
||||
# include <stdint.h>
|
||||
# endif
|
||||
# if _MSC_VER < 1800 /* MSVC < 2013 */
|
||||
# ifndef __cplusplus
|
||||
typedef unsigned char _Bool;
|
||||
# endif
|
||||
# endif
|
||||
# define _cffi_float_complex_t _Fcomplex /* include <complex.h> for it */
|
||||
# define _cffi_double_complex_t _Dcomplex /* include <complex.h> for it */
|
||||
#else
|
||||
# include <stdint.h>
|
||||
# if (defined (__SVR4) && defined (__sun)) || defined(_AIX) || defined(__hpux)
|
||||
# include <alloca.h>
|
||||
# endif
|
||||
# define _cffi_float_complex_t float _Complex
|
||||
# define _cffi_double_complex_t double _Complex
|
||||
#endif
|
||||
|
||||
#ifdef __GNUC__
|
||||
# define _CFFI_UNUSED_FN __attribute__((unused))
|
||||
#else
|
||||
# define _CFFI_UNUSED_FN /* nothing */
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
# ifndef _Bool
|
||||
typedef bool _Bool; /* semi-hackish: C++ has no _Bool; bool is builtin */
|
||||
# endif
|
||||
#endif
|
||||
|
||||
/********** CPython-specific section **********/
|
||||
#ifndef PYPY_VERSION
|
||||
|
||||
|
||||
#if PY_MAJOR_VERSION >= 3
|
||||
# define PyInt_FromLong PyLong_FromLong
|
||||
#endif
|
||||
|
||||
#define _cffi_from_c_double PyFloat_FromDouble
|
||||
#define _cffi_from_c_float PyFloat_FromDouble
|
||||
#define _cffi_from_c_long PyInt_FromLong
|
||||
#define _cffi_from_c_ulong PyLong_FromUnsignedLong
|
||||
#define _cffi_from_c_longlong PyLong_FromLongLong
|
||||
#define _cffi_from_c_ulonglong PyLong_FromUnsignedLongLong
|
||||
#define _cffi_from_c__Bool PyBool_FromLong
|
||||
|
||||
#define _cffi_to_c_double PyFloat_AsDouble
|
||||
#define _cffi_to_c_float PyFloat_AsDouble
|
||||
|
||||
#define _cffi_from_c_int(x, type) \
|
||||
(((type)-1) > 0 ? /* unsigned */ \
|
||||
(sizeof(type) < sizeof(long) ? \
|
||||
PyInt_FromLong((long)x) : \
|
||||
sizeof(type) == sizeof(long) ? \
|
||||
PyLong_FromUnsignedLong((unsigned long)x) : \
|
||||
PyLong_FromUnsignedLongLong((unsigned long long)x)) : \
|
||||
(sizeof(type) <= sizeof(long) ? \
|
||||
PyInt_FromLong((long)x) : \
|
||||
PyLong_FromLongLong((long long)x)))
|
||||
|
||||
#define _cffi_to_c_int(o, type) \
|
||||
((type)( \
|
||||
sizeof(type) == 1 ? (((type)-1) > 0 ? (type)_cffi_to_c_u8(o) \
|
||||
: (type)_cffi_to_c_i8(o)) : \
|
||||
sizeof(type) == 2 ? (((type)-1) > 0 ? (type)_cffi_to_c_u16(o) \
|
||||
: (type)_cffi_to_c_i16(o)) : \
|
||||
sizeof(type) == 4 ? (((type)-1) > 0 ? (type)_cffi_to_c_u32(o) \
|
||||
: (type)_cffi_to_c_i32(o)) : \
|
||||
sizeof(type) == 8 ? (((type)-1) > 0 ? (type)_cffi_to_c_u64(o) \
|
||||
: (type)_cffi_to_c_i64(o)) : \
|
||||
(Py_FatalError("unsupported size for type " #type), (type)0)))
|
||||
|
||||
#define _cffi_to_c_i8 \
|
||||
((int(*)(PyObject *))_cffi_exports[1])
|
||||
#define _cffi_to_c_u8 \
|
||||
((int(*)(PyObject *))_cffi_exports[2])
|
||||
#define _cffi_to_c_i16 \
|
||||
((int(*)(PyObject *))_cffi_exports[3])
|
||||
#define _cffi_to_c_u16 \
|
||||
((int(*)(PyObject *))_cffi_exports[4])
|
||||
#define _cffi_to_c_i32 \
|
||||
((int(*)(PyObject *))_cffi_exports[5])
|
||||
#define _cffi_to_c_u32 \
|
||||
((unsigned int(*)(PyObject *))_cffi_exports[6])
|
||||
#define _cffi_to_c_i64 \
|
||||
((long long(*)(PyObject *))_cffi_exports[7])
|
||||
#define _cffi_to_c_u64 \
|
||||
((unsigned long long(*)(PyObject *))_cffi_exports[8])
|
||||
#define _cffi_to_c_char \
|
||||
((int(*)(PyObject *))_cffi_exports[9])
|
||||
#define _cffi_from_c_pointer \
|
||||
((PyObject *(*)(char *, struct _cffi_ctypedescr *))_cffi_exports[10])
|
||||
#define _cffi_to_c_pointer \
|
||||
((char *(*)(PyObject *, struct _cffi_ctypedescr *))_cffi_exports[11])
|
||||
#define _cffi_get_struct_layout \
|
||||
not used any more
|
||||
#define _cffi_restore_errno \
|
||||
((void(*)(void))_cffi_exports[13])
|
||||
#define _cffi_save_errno \
|
||||
((void(*)(void))_cffi_exports[14])
|
||||
#define _cffi_from_c_char \
|
||||
((PyObject *(*)(char))_cffi_exports[15])
|
||||
#define _cffi_from_c_deref \
|
||||
((PyObject *(*)(char *, struct _cffi_ctypedescr *))_cffi_exports[16])
|
||||
#define _cffi_to_c \
|
||||
((int(*)(char *, struct _cffi_ctypedescr *, PyObject *))_cffi_exports[17])
|
||||
#define _cffi_from_c_struct \
|
||||
((PyObject *(*)(char *, struct _cffi_ctypedescr *))_cffi_exports[18])
|
||||
#define _cffi_to_c_wchar_t \
|
||||
((_cffi_wchar_t(*)(PyObject *))_cffi_exports[19])
|
||||
#define _cffi_from_c_wchar_t \
|
||||
((PyObject *(*)(_cffi_wchar_t))_cffi_exports[20])
|
||||
#define _cffi_to_c_long_double \
|
||||
((long double(*)(PyObject *))_cffi_exports[21])
|
||||
#define _cffi_to_c__Bool \
|
||||
((_Bool(*)(PyObject *))_cffi_exports[22])
|
||||
#define _cffi_prepare_pointer_call_argument \
|
||||
((Py_ssize_t(*)(struct _cffi_ctypedescr *, \
|
||||
PyObject *, char **))_cffi_exports[23])
|
||||
#define _cffi_convert_array_from_object \
|
||||
((int(*)(char *, struct _cffi_ctypedescr *, PyObject *))_cffi_exports[24])
|
||||
#define _CFFI_CPIDX 25
|
||||
#define _cffi_call_python \
|
||||
((void(*)(struct _cffi_externpy_s *, char *))_cffi_exports[_CFFI_CPIDX])
|
||||
#define _cffi_to_c_wchar3216_t \
|
||||
((int(*)(PyObject *))_cffi_exports[26])
|
||||
#define _cffi_from_c_wchar3216_t \
|
||||
((PyObject *(*)(int))_cffi_exports[27])
|
||||
#define _CFFI_NUM_EXPORTS 28
|
||||
|
||||
struct _cffi_ctypedescr;
|
||||
|
||||
static void *_cffi_exports[_CFFI_NUM_EXPORTS];
|
||||
|
||||
#define _cffi_type(index) ( \
|
||||
assert((((uintptr_t)_cffi_types[index]) & 1) == 0), \
|
||||
(struct _cffi_ctypedescr *)_cffi_types[index])
|
||||
|
||||
static PyObject *_cffi_init(const char *module_name, Py_ssize_t version,
|
||||
const struct _cffi_type_context_s *ctx)
|
||||
{
|
||||
PyObject *module, *o_arg, *new_module;
|
||||
void *raw[] = {
|
||||
(void *)module_name,
|
||||
(void *)version,
|
||||
(void *)_cffi_exports,
|
||||
(void *)ctx,
|
||||
};
|
||||
|
||||
module = PyImport_ImportModule("_cffi_backend");
|
||||
if (module == NULL)
|
||||
goto failure;
|
||||
|
||||
o_arg = PyLong_FromVoidPtr((void *)raw);
|
||||
if (o_arg == NULL)
|
||||
goto failure;
|
||||
|
||||
new_module = PyObject_CallMethod(
|
||||
module, (char *)"_init_cffi_1_0_external_module", (char *)"O", o_arg);
|
||||
|
||||
Py_DECREF(o_arg);
|
||||
Py_DECREF(module);
|
||||
return new_module;
|
||||
|
||||
failure:
|
||||
Py_XDECREF(module);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
#ifdef HAVE_WCHAR_H
|
||||
typedef wchar_t _cffi_wchar_t;
|
||||
#else
|
||||
typedef uint16_t _cffi_wchar_t; /* same random pick as _cffi_backend.c */
|
||||
#endif
|
||||
|
||||
_CFFI_UNUSED_FN static uint16_t _cffi_to_c_char16_t(PyObject *o)
|
||||
{
|
||||
if (sizeof(_cffi_wchar_t) == 2)
|
||||
return (uint16_t)_cffi_to_c_wchar_t(o);
|
||||
else
|
||||
return (uint16_t)_cffi_to_c_wchar3216_t(o);
|
||||
}
|
||||
|
||||
_CFFI_UNUSED_FN static PyObject *_cffi_from_c_char16_t(uint16_t x)
|
||||
{
|
||||
if (sizeof(_cffi_wchar_t) == 2)
|
||||
return _cffi_from_c_wchar_t((_cffi_wchar_t)x);
|
||||
else
|
||||
return _cffi_from_c_wchar3216_t((int)x);
|
||||
}
|
||||
|
||||
_CFFI_UNUSED_FN static int _cffi_to_c_char32_t(PyObject *o)
|
||||
{
|
||||
if (sizeof(_cffi_wchar_t) == 4)
|
||||
return (int)_cffi_to_c_wchar_t(o);
|
||||
else
|
||||
return (int)_cffi_to_c_wchar3216_t(o);
|
||||
}
|
||||
|
||||
_CFFI_UNUSED_FN static PyObject *_cffi_from_c_char32_t(unsigned int x)
|
||||
{
|
||||
if (sizeof(_cffi_wchar_t) == 4)
|
||||
return _cffi_from_c_wchar_t((_cffi_wchar_t)x);
|
||||
else
|
||||
return _cffi_from_c_wchar3216_t((int)x);
|
||||
}
|
||||
|
||||
union _cffi_union_alignment_u {
|
||||
unsigned char m_char;
|
||||
unsigned short m_short;
|
||||
unsigned int m_int;
|
||||
unsigned long m_long;
|
||||
unsigned long long m_longlong;
|
||||
float m_float;
|
||||
double m_double;
|
||||
long double m_longdouble;
|
||||
};
|
||||
|
||||
struct _cffi_freeme_s {
|
||||
struct _cffi_freeme_s *next;
|
||||
union _cffi_union_alignment_u alignment;
|
||||
};
|
||||
|
||||
_CFFI_UNUSED_FN static int
|
||||
_cffi_convert_array_argument(struct _cffi_ctypedescr *ctptr, PyObject *arg,
|
||||
char **output_data, Py_ssize_t datasize,
|
||||
struct _cffi_freeme_s **freeme)
|
||||
{
|
||||
char *p;
|
||||
if (datasize < 0)
|
||||
return -1;
|
||||
|
||||
p = *output_data;
|
||||
if (p == NULL) {
|
||||
struct _cffi_freeme_s *fp = (struct _cffi_freeme_s *)PyObject_Malloc(
|
||||
offsetof(struct _cffi_freeme_s, alignment) + (size_t)datasize);
|
||||
if (fp == NULL)
|
||||
return -1;
|
||||
fp->next = *freeme;
|
||||
*freeme = fp;
|
||||
p = *output_data = (char *)&fp->alignment;
|
||||
}
|
||||
memset((void *)p, 0, (size_t)datasize);
|
||||
return _cffi_convert_array_from_object(p, ctptr, arg);
|
||||
}
|
||||
|
||||
_CFFI_UNUSED_FN static void
|
||||
_cffi_free_array_arguments(struct _cffi_freeme_s *freeme)
|
||||
{
|
||||
do {
|
||||
void *p = (void *)freeme;
|
||||
freeme = freeme->next;
|
||||
PyObject_Free(p);
|
||||
} while (freeme != NULL);
|
||||
}
|
||||
|
||||
/********** end CPython-specific section **********/
|
||||
#else
|
||||
_CFFI_UNUSED_FN
|
||||
static void (*_cffi_call_python_org)(struct _cffi_externpy_s *, char *);
|
||||
# define _cffi_call_python _cffi_call_python_org
|
||||
#endif
|
||||
|
||||
|
||||
#define _cffi_array_len(array) (sizeof(array) / sizeof((array)[0]))
|
||||
|
||||
#define _cffi_prim_int(size, sign) \
|
||||
((size) == 1 ? ((sign) ? _CFFI_PRIM_INT8 : _CFFI_PRIM_UINT8) : \
|
||||
(size) == 2 ? ((sign) ? _CFFI_PRIM_INT16 : _CFFI_PRIM_UINT16) : \
|
||||
(size) == 4 ? ((sign) ? _CFFI_PRIM_INT32 : _CFFI_PRIM_UINT32) : \
|
||||
(size) == 8 ? ((sign) ? _CFFI_PRIM_INT64 : _CFFI_PRIM_UINT64) : \
|
||||
_CFFI__UNKNOWN_PRIM)
|
||||
|
||||
#define _cffi_prim_float(size) \
|
||||
((size) == sizeof(float) ? _CFFI_PRIM_FLOAT : \
|
||||
(size) == sizeof(double) ? _CFFI_PRIM_DOUBLE : \
|
||||
(size) == sizeof(long double) ? _CFFI__UNKNOWN_LONG_DOUBLE : \
|
||||
_CFFI__UNKNOWN_FLOAT_PRIM)
|
||||
|
||||
#define _cffi_check_int(got, got_nonpos, expected) \
|
||||
((got_nonpos) == (expected <= 0) && \
|
||||
(got) == (unsigned long long)expected)
|
||||
|
||||
#ifdef MS_WIN32
|
||||
# define _cffi_stdcall __stdcall
|
||||
#else
|
||||
# define _cffi_stdcall /* nothing */
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -0,0 +1,550 @@
|
||||
|
||||
/***** Support code for embedding *****/
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(_WIN32)
|
||||
# define CFFI_DLLEXPORT __declspec(dllexport)
|
||||
#elif defined(__GNUC__)
|
||||
# define CFFI_DLLEXPORT __attribute__((visibility("default")))
|
||||
#else
|
||||
# define CFFI_DLLEXPORT /* nothing */
|
||||
#endif
|
||||
|
||||
|
||||
/* There are two global variables of type _cffi_call_python_fnptr:
|
||||
|
||||
* _cffi_call_python, which we declare just below, is the one called
|
||||
by ``extern "Python"`` implementations.
|
||||
|
||||
* _cffi_call_python_org, which on CPython is actually part of the
|
||||
_cffi_exports[] array, is the function pointer copied from
|
||||
_cffi_backend. If _cffi_start_python() fails, then this is set
|
||||
to NULL; otherwise, it should never be NULL.
|
||||
|
||||
After initialization is complete, both are equal. However, the
|
||||
first one remains equal to &_cffi_start_and_call_python until the
|
||||
very end of initialization, when we are (or should be) sure that
|
||||
concurrent threads also see a completely initialized world, and
|
||||
only then is it changed.
|
||||
*/
|
||||
#undef _cffi_call_python
|
||||
typedef void (*_cffi_call_python_fnptr)(struct _cffi_externpy_s *, char *);
|
||||
static void _cffi_start_and_call_python(struct _cffi_externpy_s *, char *);
|
||||
static _cffi_call_python_fnptr _cffi_call_python = &_cffi_start_and_call_python;
|
||||
|
||||
|
||||
#ifndef _MSC_VER
|
||||
/* --- Assuming a GCC not infinitely old --- */
|
||||
# define cffi_compare_and_swap(l,o,n) __sync_bool_compare_and_swap(l,o,n)
|
||||
# define cffi_write_barrier() __sync_synchronize()
|
||||
# if !defined(__amd64__) && !defined(__x86_64__) && \
|
||||
!defined(__i386__) && !defined(__i386)
|
||||
# define cffi_read_barrier() __sync_synchronize()
|
||||
# else
|
||||
# define cffi_read_barrier() (void)0
|
||||
# endif
|
||||
#else
|
||||
/* --- Windows threads version --- */
|
||||
# include <Windows.h>
|
||||
# define cffi_compare_and_swap(l,o,n) \
|
||||
(InterlockedCompareExchangePointer(l,n,o) == (o))
|
||||
# define cffi_write_barrier() InterlockedCompareExchange(&_cffi_dummy,0,0)
|
||||
# define cffi_read_barrier() (void)0
|
||||
static volatile LONG _cffi_dummy;
|
||||
#endif
|
||||
|
||||
#ifdef WITH_THREAD
|
||||
# ifndef _MSC_VER
|
||||
# include <pthread.h>
|
||||
static pthread_mutex_t _cffi_embed_startup_lock;
|
||||
# else
|
||||
static CRITICAL_SECTION _cffi_embed_startup_lock;
|
||||
# endif
|
||||
static char _cffi_embed_startup_lock_ready = 0;
|
||||
#endif
|
||||
|
||||
static void _cffi_acquire_reentrant_mutex(void)
|
||||
{
|
||||
static void *volatile lock = NULL;
|
||||
|
||||
while (!cffi_compare_and_swap(&lock, NULL, (void *)1)) {
|
||||
/* should ideally do a spin loop instruction here, but
|
||||
hard to do it portably and doesn't really matter I
|
||||
think: pthread_mutex_init() should be very fast, and
|
||||
this is only run at start-up anyway. */
|
||||
}
|
||||
|
||||
#ifdef WITH_THREAD
|
||||
if (!_cffi_embed_startup_lock_ready) {
|
||||
# ifndef _MSC_VER
|
||||
pthread_mutexattr_t attr;
|
||||
pthread_mutexattr_init(&attr);
|
||||
pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
|
||||
pthread_mutex_init(&_cffi_embed_startup_lock, &attr);
|
||||
# else
|
||||
InitializeCriticalSection(&_cffi_embed_startup_lock);
|
||||
# endif
|
||||
_cffi_embed_startup_lock_ready = 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
while (!cffi_compare_and_swap(&lock, (void *)1, NULL))
|
||||
;
|
||||
|
||||
#ifndef _MSC_VER
|
||||
pthread_mutex_lock(&_cffi_embed_startup_lock);
|
||||
#else
|
||||
EnterCriticalSection(&_cffi_embed_startup_lock);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void _cffi_release_reentrant_mutex(void)
|
||||
{
|
||||
#ifndef _MSC_VER
|
||||
pthread_mutex_unlock(&_cffi_embed_startup_lock);
|
||||
#else
|
||||
LeaveCriticalSection(&_cffi_embed_startup_lock);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/********** CPython-specific section **********/
|
||||
#ifndef PYPY_VERSION
|
||||
|
||||
#include "_cffi_errors.h"
|
||||
|
||||
|
||||
#define _cffi_call_python_org _cffi_exports[_CFFI_CPIDX]
|
||||
|
||||
PyMODINIT_FUNC _CFFI_PYTHON_STARTUP_FUNC(void); /* forward */
|
||||
|
||||
static void _cffi_py_initialize(void)
|
||||
{
|
||||
/* XXX use initsigs=0, which "skips initialization registration of
|
||||
signal handlers, which might be useful when Python is
|
||||
embedded" according to the Python docs. But review and think
|
||||
if it should be a user-controllable setting.
|
||||
|
||||
XXX we should also give a way to write errors to a buffer
|
||||
instead of to stderr.
|
||||
|
||||
XXX if importing 'site' fails, CPython (any version) calls
|
||||
exit(). Should we try to work around this behavior here?
|
||||
*/
|
||||
Py_InitializeEx(0);
|
||||
}
|
||||
|
||||
static int _cffi_initialize_python(void)
|
||||
{
|
||||
/* This initializes Python, imports _cffi_backend, and then the
|
||||
present .dll/.so is set up as a CPython C extension module.
|
||||
*/
|
||||
int result;
|
||||
PyGILState_STATE state;
|
||||
PyObject *pycode=NULL, *global_dict=NULL, *x;
|
||||
PyObject *builtins;
|
||||
|
||||
state = PyGILState_Ensure();
|
||||
|
||||
/* Call the initxxx() function from the present module. It will
|
||||
create and initialize us as a CPython extension module, instead
|
||||
of letting the startup Python code do it---it might reimport
|
||||
the same .dll/.so and get maybe confused on some platforms.
|
||||
It might also have troubles locating the .dll/.so again for all
|
||||
I know.
|
||||
*/
|
||||
(void)_CFFI_PYTHON_STARTUP_FUNC();
|
||||
if (PyErr_Occurred())
|
||||
goto error;
|
||||
|
||||
/* Now run the Python code provided to ffi.embedding_init_code().
|
||||
*/
|
||||
pycode = Py_CompileString(_CFFI_PYTHON_STARTUP_CODE,
|
||||
"<init code for '" _CFFI_MODULE_NAME "'>",
|
||||
Py_file_input);
|
||||
if (pycode == NULL)
|
||||
goto error;
|
||||
global_dict = PyDict_New();
|
||||
if (global_dict == NULL)
|
||||
goto error;
|
||||
builtins = PyEval_GetBuiltins();
|
||||
if (builtins == NULL)
|
||||
goto error;
|
||||
if (PyDict_SetItemString(global_dict, "__builtins__", builtins) < 0)
|
||||
goto error;
|
||||
x = PyEval_EvalCode(
|
||||
#if PY_MAJOR_VERSION < 3
|
||||
(PyCodeObject *)
|
||||
#endif
|
||||
pycode, global_dict, global_dict);
|
||||
if (x == NULL)
|
||||
goto error;
|
||||
Py_DECREF(x);
|
||||
|
||||
/* Done! Now if we've been called from
|
||||
_cffi_start_and_call_python() in an ``extern "Python"``, we can
|
||||
only hope that the Python code did correctly set up the
|
||||
corresponding @ffi.def_extern() function. Otherwise, the
|
||||
general logic of ``extern "Python"`` functions (inside the
|
||||
_cffi_backend module) will find that the reference is still
|
||||
missing and print an error.
|
||||
*/
|
||||
result = 0;
|
||||
done:
|
||||
Py_XDECREF(pycode);
|
||||
Py_XDECREF(global_dict);
|
||||
PyGILState_Release(state);
|
||||
return result;
|
||||
|
||||
error:;
|
||||
{
|
||||
/* Print as much information as potentially useful.
|
||||
Debugging load-time failures with embedding is not fun
|
||||
*/
|
||||
PyObject *ecap;
|
||||
PyObject *exception, *v, *tb, *f, *modules, *mod;
|
||||
PyErr_Fetch(&exception, &v, &tb);
|
||||
ecap = _cffi_start_error_capture();
|
||||
f = PySys_GetObject((char *)"stderr");
|
||||
if (f != NULL && f != Py_None) {
|
||||
PyFile_WriteString(
|
||||
"Failed to initialize the Python-CFFI embedding logic:\n\n", f);
|
||||
}
|
||||
|
||||
if (exception != NULL) {
|
||||
PyErr_NormalizeException(&exception, &v, &tb);
|
||||
PyErr_Display(exception, v, tb);
|
||||
}
|
||||
Py_XDECREF(exception);
|
||||
Py_XDECREF(v);
|
||||
Py_XDECREF(tb);
|
||||
|
||||
if (f != NULL && f != Py_None) {
|
||||
PyFile_WriteString("\nFrom: " _CFFI_MODULE_NAME
|
||||
"\ncompiled with cffi version: 2.0.0"
|
||||
"\n_cffi_backend module: ", f);
|
||||
modules = PyImport_GetModuleDict();
|
||||
mod = PyDict_GetItemString(modules, "_cffi_backend");
|
||||
if (mod == NULL) {
|
||||
PyFile_WriteString("not loaded", f);
|
||||
}
|
||||
else {
|
||||
v = PyObject_GetAttrString(mod, "__file__");
|
||||
PyFile_WriteObject(v, f, 0);
|
||||
Py_XDECREF(v);
|
||||
}
|
||||
PyFile_WriteString("\nsys.path: ", f);
|
||||
PyFile_WriteObject(PySys_GetObject((char *)"path"), f, 0);
|
||||
PyFile_WriteString("\n\n", f);
|
||||
}
|
||||
_cffi_stop_error_capture(ecap);
|
||||
}
|
||||
result = -1;
|
||||
goto done;
|
||||
}
|
||||
|
||||
#if PY_VERSION_HEX < 0x03080000
|
||||
PyAPI_DATA(char *) _PyParser_TokenNames[]; /* from CPython */
|
||||
#endif
|
||||
|
||||
static int _cffi_carefully_make_gil(void)
|
||||
{
|
||||
/* This does the basic initialization of Python. It can be called
|
||||
completely concurrently from unrelated threads. It assumes
|
||||
that we don't hold the GIL before (if it exists), and we don't
|
||||
hold it afterwards.
|
||||
|
||||
(What it really does used to be completely different in Python 2
|
||||
and Python 3, with the Python 2 solution avoiding the spin-lock
|
||||
around the Py_InitializeEx() call. However, after recent changes
|
||||
to CPython 2.7 (issue #358) it no longer works. So we use the
|
||||
Python 3 solution everywhere.)
|
||||
|
||||
This initializes Python by calling Py_InitializeEx().
|
||||
Important: this must not be called concurrently at all.
|
||||
So we use a global variable as a simple spin lock. This global
|
||||
variable must be from 'libpythonX.Y.so', not from this
|
||||
cffi-based extension module, because it must be shared from
|
||||
different cffi-based extension modules.
|
||||
|
||||
In Python < 3.8, we choose
|
||||
_PyParser_TokenNames[0] as a completely arbitrary pointer value
|
||||
that is never written to. The default is to point to the
|
||||
string "ENDMARKER". We change it temporarily to point to the
|
||||
next character in that string. (Yes, I know it's REALLY
|
||||
obscure.)
|
||||
|
||||
In Python >= 3.8, this string array is no longer writable, so
|
||||
instead we pick PyCapsuleType.tp_version_tag. We can't change
|
||||
Python < 3.8 because someone might use a mixture of cffi
|
||||
embedded modules, some of which were compiled before this file
|
||||
changed.
|
||||
|
||||
In Python >= 3.12, this stopped working because that particular
|
||||
tp_version_tag gets modified during interpreter startup. It's
|
||||
arguably a bad idea before 3.12 too, but again we can't change
|
||||
that because someone might use a mixture of cffi embedded
|
||||
modules, and no-one reported a bug so far. In Python >= 3.12
|
||||
we go instead for PyCapsuleType.tp_as_buffer, which is supposed
|
||||
to always be NULL. We write to it temporarily a pointer to
|
||||
a struct full of NULLs, which is semantically the same.
|
||||
*/
|
||||
|
||||
#ifdef WITH_THREAD
|
||||
# if PY_VERSION_HEX < 0x03080000
|
||||
char *volatile *lock = (char *volatile *)_PyParser_TokenNames;
|
||||
char *old_value, *locked_value;
|
||||
|
||||
while (1) { /* spin loop */
|
||||
old_value = *lock;
|
||||
locked_value = old_value + 1;
|
||||
if (old_value[0] == 'E') {
|
||||
assert(old_value[1] == 'N');
|
||||
if (cffi_compare_and_swap(lock, old_value, locked_value))
|
||||
break;
|
||||
}
|
||||
else {
|
||||
assert(old_value[0] == 'N');
|
||||
/* should ideally do a spin loop instruction here, but
|
||||
hard to do it portably and doesn't really matter I
|
||||
think: PyEval_InitThreads() should be very fast, and
|
||||
this is only run at start-up anyway. */
|
||||
}
|
||||
}
|
||||
# else
|
||||
# if PY_VERSION_HEX < 0x030C0000
|
||||
int volatile *lock = (int volatile *)&PyCapsule_Type.tp_version_tag;
|
||||
int old_value, locked_value = -42;
|
||||
assert(!(PyCapsule_Type.tp_flags & Py_TPFLAGS_HAVE_VERSION_TAG));
|
||||
# else
|
||||
static struct ebp_s { PyBufferProcs buf; int mark; } empty_buffer_procs;
|
||||
empty_buffer_procs.mark = -42;
|
||||
PyBufferProcs *volatile *lock = (PyBufferProcs *volatile *)
|
||||
&PyCapsule_Type.tp_as_buffer;
|
||||
PyBufferProcs *old_value, *locked_value = &empty_buffer_procs.buf;
|
||||
# endif
|
||||
|
||||
while (1) { /* spin loop */
|
||||
old_value = *lock;
|
||||
if (old_value == 0) {
|
||||
if (cffi_compare_and_swap(lock, old_value, locked_value))
|
||||
break;
|
||||
}
|
||||
else {
|
||||
# if PY_VERSION_HEX < 0x030C0000
|
||||
assert(old_value == locked_value);
|
||||
# else
|
||||
/* The pointer should point to a possibly different
|
||||
empty_buffer_procs from another C extension module */
|
||||
assert(((struct ebp_s *)old_value)->mark == -42);
|
||||
# endif
|
||||
/* should ideally do a spin loop instruction here, but
|
||||
hard to do it portably and doesn't really matter I
|
||||
think: PyEval_InitThreads() should be very fast, and
|
||||
this is only run at start-up anyway. */
|
||||
}
|
||||
}
|
||||
# endif
|
||||
#endif
|
||||
|
||||
/* call Py_InitializeEx() */
|
||||
if (!Py_IsInitialized()) {
|
||||
_cffi_py_initialize();
|
||||
#if PY_VERSION_HEX < 0x03070000
|
||||
PyEval_InitThreads();
|
||||
#endif
|
||||
PyEval_SaveThread(); /* release the GIL */
|
||||
/* the returned tstate must be the one that has been stored into the
|
||||
autoTLSkey by _PyGILState_Init() called from Py_Initialize(). */
|
||||
}
|
||||
else {
|
||||
#if PY_VERSION_HEX < 0x03070000
|
||||
/* PyEval_InitThreads() is always a no-op from CPython 3.7 */
|
||||
PyGILState_STATE state = PyGILState_Ensure();
|
||||
PyEval_InitThreads();
|
||||
PyGILState_Release(state);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef WITH_THREAD
|
||||
/* release the lock */
|
||||
while (!cffi_compare_and_swap(lock, locked_value, old_value))
|
||||
;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/********** end CPython-specific section **********/
|
||||
|
||||
|
||||
#else
|
||||
|
||||
|
||||
/********** PyPy-specific section **********/
|
||||
|
||||
PyMODINIT_FUNC _CFFI_PYTHON_STARTUP_FUNC(const void *[]); /* forward */
|
||||
|
||||
static struct _cffi_pypy_init_s {
|
||||
const char *name;
|
||||
void *func; /* function pointer */
|
||||
const char *code;
|
||||
} _cffi_pypy_init = {
|
||||
_CFFI_MODULE_NAME,
|
||||
_CFFI_PYTHON_STARTUP_FUNC,
|
||||
_CFFI_PYTHON_STARTUP_CODE,
|
||||
};
|
||||
|
||||
extern int pypy_carefully_make_gil(const char *);
|
||||
extern int pypy_init_embedded_cffi_module(int, struct _cffi_pypy_init_s *);
|
||||
|
||||
static int _cffi_carefully_make_gil(void)
|
||||
{
|
||||
return pypy_carefully_make_gil(_CFFI_MODULE_NAME);
|
||||
}
|
||||
|
||||
static int _cffi_initialize_python(void)
|
||||
{
|
||||
return pypy_init_embedded_cffi_module(0xB011, &_cffi_pypy_init);
|
||||
}
|
||||
|
||||
/********** end PyPy-specific section **********/
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef __GNUC__
|
||||
__attribute__((noinline))
|
||||
#endif
|
||||
static _cffi_call_python_fnptr _cffi_start_python(void)
|
||||
{
|
||||
/* Delicate logic to initialize Python. This function can be
|
||||
called multiple times concurrently, e.g. when the process calls
|
||||
its first ``extern "Python"`` functions in multiple threads at
|
||||
once. It can also be called recursively, in which case we must
|
||||
ignore it. We also have to consider what occurs if several
|
||||
different cffi-based extensions reach this code in parallel
|
||||
threads---it is a different copy of the code, then, and we
|
||||
can't have any shared global variable unless it comes from
|
||||
'libpythonX.Y.so'.
|
||||
|
||||
Idea:
|
||||
|
||||
* _cffi_carefully_make_gil(): "carefully" call
|
||||
PyEval_InitThreads() (possibly with Py_InitializeEx() first).
|
||||
|
||||
* then we use a (local) custom lock to make sure that a call to this
|
||||
cffi-based extension will wait if another call to the *same*
|
||||
extension is running the initialization in another thread.
|
||||
It is reentrant, so that a recursive call will not block, but
|
||||
only one from a different thread.
|
||||
|
||||
* then we grab the GIL and (Python 2) we call Py_InitializeEx().
|
||||
At this point, concurrent calls to Py_InitializeEx() are not
|
||||
possible: we have the GIL.
|
||||
|
||||
* do the rest of the specific initialization, which may
|
||||
temporarily release the GIL but not the custom lock.
|
||||
Only release the custom lock when we are done.
|
||||
*/
|
||||
static char called = 0;
|
||||
|
||||
if (_cffi_carefully_make_gil() != 0)
|
||||
return NULL;
|
||||
|
||||
_cffi_acquire_reentrant_mutex();
|
||||
|
||||
/* Here the GIL exists, but we don't have it. We're only protected
|
||||
from concurrency by the reentrant mutex. */
|
||||
|
||||
/* This file only initializes the embedded module once, the first
|
||||
time this is called, even if there are subinterpreters. */
|
||||
if (!called) {
|
||||
called = 1; /* invoke _cffi_initialize_python() only once,
|
||||
but don't set '_cffi_call_python' right now,
|
||||
otherwise concurrent threads won't call
|
||||
this function at all (we need them to wait) */
|
||||
if (_cffi_initialize_python() == 0) {
|
||||
/* now initialization is finished. Switch to the fast-path. */
|
||||
|
||||
/* We would like nobody to see the new value of
|
||||
'_cffi_call_python' without also seeing the rest of the
|
||||
data initialized. However, this is not possible. But
|
||||
the new value of '_cffi_call_python' is the function
|
||||
'cffi_call_python()' from _cffi_backend. So: */
|
||||
cffi_write_barrier();
|
||||
/* ^^^ we put a write barrier here, and a corresponding
|
||||
read barrier at the start of cffi_call_python(). This
|
||||
ensures that after that read barrier, we see everything
|
||||
done here before the write barrier.
|
||||
*/
|
||||
|
||||
assert(_cffi_call_python_org != NULL);
|
||||
_cffi_call_python = (_cffi_call_python_fnptr)_cffi_call_python_org;
|
||||
}
|
||||
else {
|
||||
/* initialization failed. Reset this to NULL, even if it was
|
||||
already set to some other value. Future calls to
|
||||
_cffi_start_python() are still forced to occur, and will
|
||||
always return NULL from now on. */
|
||||
_cffi_call_python_org = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
_cffi_release_reentrant_mutex();
|
||||
|
||||
return (_cffi_call_python_fnptr)_cffi_call_python_org;
|
||||
}
|
||||
|
||||
static
|
||||
void _cffi_start_and_call_python(struct _cffi_externpy_s *externpy, char *args)
|
||||
{
|
||||
_cffi_call_python_fnptr fnptr;
|
||||
int current_err = errno;
|
||||
#ifdef _MSC_VER
|
||||
int current_lasterr = GetLastError();
|
||||
#endif
|
||||
fnptr = _cffi_start_python();
|
||||
if (fnptr == NULL) {
|
||||
fprintf(stderr, "function %s() called, but initialization code "
|
||||
"failed. Returning 0.\n", externpy->name);
|
||||
memset(args, 0, externpy->size_of_result);
|
||||
}
|
||||
#ifdef _MSC_VER
|
||||
SetLastError(current_lasterr);
|
||||
#endif
|
||||
errno = current_err;
|
||||
|
||||
if (fnptr != NULL)
|
||||
fnptr(externpy, args);
|
||||
}
|
||||
|
||||
|
||||
/* The cffi_start_python() function makes sure Python is initialized
|
||||
and our cffi module is set up. It can be called manually from the
|
||||
user C code. The same effect is obtained automatically from any
|
||||
dll-exported ``extern "Python"`` function. This function returns
|
||||
-1 if initialization failed, 0 if all is OK. */
|
||||
_CFFI_UNUSED_FN
|
||||
static int cffi_start_python(void)
|
||||
{
|
||||
if (_cffi_call_python == &_cffi_start_and_call_python) {
|
||||
if (_cffi_start_python() == NULL)
|
||||
return -1;
|
||||
}
|
||||
cffi_read_barrier();
|
||||
return 0;
|
||||
}
|
||||
|
||||
#undef cffi_compare_and_swap
|
||||
#undef cffi_write_barrier
|
||||
#undef cffi_read_barrier
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@@ -0,0 +1,83 @@
|
||||
|
||||
try:
|
||||
# this works on Python < 3.12
|
||||
from imp import *
|
||||
|
||||
except ImportError:
|
||||
# this is a limited emulation for Python >= 3.12.
|
||||
# Note that this is used only for tests or for the old ffi.verify().
|
||||
# This is copied from the source code of Python 3.11.
|
||||
|
||||
from _imp import (acquire_lock, release_lock,
|
||||
is_builtin, is_frozen)
|
||||
|
||||
from importlib._bootstrap import _load
|
||||
|
||||
from importlib import machinery
|
||||
import os
|
||||
import sys
|
||||
import tokenize
|
||||
|
||||
SEARCH_ERROR = 0
|
||||
PY_SOURCE = 1
|
||||
PY_COMPILED = 2
|
||||
C_EXTENSION = 3
|
||||
PY_RESOURCE = 4
|
||||
PKG_DIRECTORY = 5
|
||||
C_BUILTIN = 6
|
||||
PY_FROZEN = 7
|
||||
PY_CODERESOURCE = 8
|
||||
IMP_HOOK = 9
|
||||
|
||||
def get_suffixes():
|
||||
extensions = [(s, 'rb', C_EXTENSION)
|
||||
for s in machinery.EXTENSION_SUFFIXES]
|
||||
source = [(s, 'r', PY_SOURCE) for s in machinery.SOURCE_SUFFIXES]
|
||||
bytecode = [(s, 'rb', PY_COMPILED) for s in machinery.BYTECODE_SUFFIXES]
|
||||
return extensions + source + bytecode
|
||||
|
||||
def find_module(name, path=None):
|
||||
if not isinstance(name, str):
|
||||
raise TypeError("'name' must be a str, not {}".format(type(name)))
|
||||
elif not isinstance(path, (type(None), list)):
|
||||
# Backwards-compatibility
|
||||
raise RuntimeError("'path' must be None or a list, "
|
||||
"not {}".format(type(path)))
|
||||
|
||||
if path is None:
|
||||
if is_builtin(name):
|
||||
return None, None, ('', '', C_BUILTIN)
|
||||
elif is_frozen(name):
|
||||
return None, None, ('', '', PY_FROZEN)
|
||||
else:
|
||||
path = sys.path
|
||||
|
||||
for entry in path:
|
||||
package_directory = os.path.join(entry, name)
|
||||
for suffix in ['.py', machinery.BYTECODE_SUFFIXES[0]]:
|
||||
package_file_name = '__init__' + suffix
|
||||
file_path = os.path.join(package_directory, package_file_name)
|
||||
if os.path.isfile(file_path):
|
||||
return None, package_directory, ('', '', PKG_DIRECTORY)
|
||||
for suffix, mode, type_ in get_suffixes():
|
||||
file_name = name + suffix
|
||||
file_path = os.path.join(entry, file_name)
|
||||
if os.path.isfile(file_path):
|
||||
break
|
||||
else:
|
||||
continue
|
||||
break # Break out of outer loop when breaking out of inner loop.
|
||||
else:
|
||||
raise ImportError(name, name=name)
|
||||
|
||||
encoding = None
|
||||
if 'b' not in mode:
|
||||
with open(file_path, 'rb') as file:
|
||||
encoding = tokenize.detect_encoding(file.readline)[0]
|
||||
file = open(file_path, mode, encoding=encoding)
|
||||
return file, file_path, (suffix, mode, type_)
|
||||
|
||||
def load_dynamic(name, path, file=None):
|
||||
loader = machinery.ExtensionFileLoader(name, path)
|
||||
spec = machinery.ModuleSpec(name=name, loader=loader, origin=path)
|
||||
return _load(spec)
|
||||
@@ -0,0 +1,45 @@
|
||||
"""
|
||||
Temporary shim module to indirect the bits of distutils we need from setuptools/distutils while providing useful
|
||||
error messages beyond `No module named 'distutils' on Python >= 3.12, or when setuptools' vendored distutils is broken.
|
||||
|
||||
This is a compromise to avoid a hard-dep on setuptools for Python >= 3.12, since many users don't need runtime compilation support from CFFI.
|
||||
"""
|
||||
import sys
|
||||
|
||||
try:
|
||||
# import setuptools first; this is the most robust way to ensure its embedded distutils is available
|
||||
# (the .pth shim should usually work, but this is even more robust)
|
||||
import setuptools
|
||||
except Exception as ex:
|
||||
if sys.version_info >= (3, 12):
|
||||
# Python 3.12 has no built-in distutils to fall back on, so any import problem is fatal
|
||||
raise Exception("This CFFI feature requires setuptools on Python >= 3.12. The setuptools module is missing or non-functional.") from ex
|
||||
|
||||
# silently ignore on older Pythons (support fallback to stdlib distutils where available)
|
||||
else:
|
||||
del setuptools
|
||||
|
||||
try:
|
||||
# bring in just the bits of distutils we need, whether they really came from setuptools or stdlib-embedded distutils
|
||||
from distutils import log, sysconfig
|
||||
from distutils.ccompiler import CCompiler
|
||||
from distutils.command.build_ext import build_ext
|
||||
from distutils.core import Distribution, Extension
|
||||
from distutils.dir_util import mkpath
|
||||
from distutils.errors import DistutilsSetupError, CompileError, LinkError
|
||||
from distutils.log import set_threshold, set_verbosity
|
||||
|
||||
if sys.platform == 'win32':
|
||||
try:
|
||||
# FUTURE: msvc9compiler module was removed in setuptools 74; consider removing, as it's only used by an ancient patch in `recompiler`
|
||||
from distutils.msvc9compiler import MSVCCompiler
|
||||
except ImportError:
|
||||
MSVCCompiler = None
|
||||
except Exception as ex:
|
||||
if sys.version_info >= (3, 12):
|
||||
raise Exception("This CFFI feature requires setuptools on Python >= 3.12. Please install the setuptools package.") from ex
|
||||
|
||||
# anything older, just let the underlying distutils import error fly
|
||||
raise Exception("This CFFI feature requires distutils. Please install the distutils or setuptools package.") from ex
|
||||
|
||||
del sys
|
||||
@@ -0,0 +1,967 @@
|
||||
import sys, types
|
||||
from .lock import allocate_lock
|
||||
from .error import CDefError
|
||||
from . import model
|
||||
|
||||
try:
|
||||
callable
|
||||
except NameError:
|
||||
# Python 3.1
|
||||
from collections import Callable
|
||||
callable = lambda x: isinstance(x, Callable)
|
||||
|
||||
try:
|
||||
basestring
|
||||
except NameError:
|
||||
# Python 3.x
|
||||
basestring = str
|
||||
|
||||
_unspecified = object()
|
||||
|
||||
|
||||
|
||||
class FFI(object):
|
||||
r'''
|
||||
The main top-level class that you instantiate once, or once per module.
|
||||
|
||||
Example usage:
|
||||
|
||||
ffi = FFI()
|
||||
ffi.cdef("""
|
||||
int printf(const char *, ...);
|
||||
""")
|
||||
|
||||
C = ffi.dlopen(None) # standard library
|
||||
-or-
|
||||
C = ffi.verify() # use a C compiler: verify the decl above is right
|
||||
|
||||
C.printf("hello, %s!\n", ffi.new("char[]", "world"))
|
||||
'''
|
||||
|
||||
def __init__(self, backend=None):
|
||||
"""Create an FFI instance. The 'backend' argument is used to
|
||||
select a non-default backend, mostly for tests.
|
||||
"""
|
||||
if backend is None:
|
||||
# You need PyPy (>= 2.0 beta), or a CPython (>= 2.6) with
|
||||
# _cffi_backend.so compiled.
|
||||
import _cffi_backend as backend
|
||||
from . import __version__
|
||||
if backend.__version__ != __version__:
|
||||
# bad version! Try to be as explicit as possible.
|
||||
if hasattr(backend, '__file__'):
|
||||
# CPython
|
||||
raise Exception("Version mismatch: this is the 'cffi' package version %s, located in %r. When we import the top-level '_cffi_backend' extension module, we get version %s, located in %r. The two versions should be equal; check your installation." % (
|
||||
__version__, __file__,
|
||||
backend.__version__, backend.__file__))
|
||||
else:
|
||||
# PyPy
|
||||
raise Exception("Version mismatch: this is the 'cffi' package version %s, located in %r. This interpreter comes with a built-in '_cffi_backend' module, which is version %s. The two versions should be equal; check your installation." % (
|
||||
__version__, __file__, backend.__version__))
|
||||
# (If you insist you can also try to pass the option
|
||||
# 'backend=backend_ctypes.CTypesBackend()', but don't
|
||||
# rely on it! It's probably not going to work well.)
|
||||
|
||||
from . import cparser
|
||||
self._backend = backend
|
||||
self._lock = allocate_lock()
|
||||
self._parser = cparser.Parser()
|
||||
self._cached_btypes = {}
|
||||
self._parsed_types = types.ModuleType('parsed_types').__dict__
|
||||
self._new_types = types.ModuleType('new_types').__dict__
|
||||
self._function_caches = []
|
||||
self._libraries = []
|
||||
self._cdefsources = []
|
||||
self._included_ffis = []
|
||||
self._windows_unicode = None
|
||||
self._init_once_cache = {}
|
||||
self._cdef_version = None
|
||||
self._embedding = None
|
||||
self._typecache = model.get_typecache(backend)
|
||||
if hasattr(backend, 'set_ffi'):
|
||||
backend.set_ffi(self)
|
||||
for name in list(backend.__dict__):
|
||||
if name.startswith('RTLD_'):
|
||||
setattr(self, name, getattr(backend, name))
|
||||
#
|
||||
with self._lock:
|
||||
self.BVoidP = self._get_cached_btype(model.voidp_type)
|
||||
self.BCharA = self._get_cached_btype(model.char_array_type)
|
||||
if isinstance(backend, types.ModuleType):
|
||||
# _cffi_backend: attach these constants to the class
|
||||
if not hasattr(FFI, 'NULL'):
|
||||
FFI.NULL = self.cast(self.BVoidP, 0)
|
||||
FFI.CData, FFI.CType = backend._get_types()
|
||||
else:
|
||||
# ctypes backend: attach these constants to the instance
|
||||
self.NULL = self.cast(self.BVoidP, 0)
|
||||
self.CData, self.CType = backend._get_types()
|
||||
self.buffer = backend.buffer
|
||||
|
||||
def cdef(self, csource, override=False, packed=False, pack=None):
|
||||
"""Parse the given C source. This registers all declared functions,
|
||||
types, and global variables. The functions and global variables can
|
||||
then be accessed via either 'ffi.dlopen()' or 'ffi.verify()'.
|
||||
The types can be used in 'ffi.new()' and other functions.
|
||||
If 'packed' is specified as True, all structs declared inside this
|
||||
cdef are packed, i.e. laid out without any field alignment at all.
|
||||
Alternatively, 'pack' can be a small integer, and requests for
|
||||
alignment greater than that are ignored (pack=1 is equivalent to
|
||||
packed=True).
|
||||
"""
|
||||
self._cdef(csource, override=override, packed=packed, pack=pack)
|
||||
|
||||
def embedding_api(self, csource, packed=False, pack=None):
|
||||
self._cdef(csource, packed=packed, pack=pack, dllexport=True)
|
||||
if self._embedding is None:
|
||||
self._embedding = ''
|
||||
|
||||
def _cdef(self, csource, override=False, **options):
|
||||
if not isinstance(csource, str): # unicode, on Python 2
|
||||
if not isinstance(csource, basestring):
|
||||
raise TypeError("cdef() argument must be a string")
|
||||
csource = csource.encode('ascii')
|
||||
with self._lock:
|
||||
self._cdef_version = object()
|
||||
self._parser.parse(csource, override=override, **options)
|
||||
self._cdefsources.append(csource)
|
||||
if override:
|
||||
for cache in self._function_caches:
|
||||
cache.clear()
|
||||
finishlist = self._parser._recomplete
|
||||
if finishlist:
|
||||
self._parser._recomplete = []
|
||||
for tp in finishlist:
|
||||
tp.finish_backend_type(self, finishlist)
|
||||
|
||||
def dlopen(self, name, flags=0):
|
||||
"""Load and return a dynamic library identified by 'name'.
|
||||
The standard C library can be loaded by passing None.
|
||||
Note that functions and types declared by 'ffi.cdef()' are not
|
||||
linked to a particular library, just like C headers; in the
|
||||
library we only look for the actual (untyped) symbols.
|
||||
"""
|
||||
if not (isinstance(name, basestring) or
|
||||
name is None or
|
||||
isinstance(name, self.CData)):
|
||||
raise TypeError("dlopen(name): name must be a file name, None, "
|
||||
"or an already-opened 'void *' handle")
|
||||
with self._lock:
|
||||
lib, function_cache = _make_ffi_library(self, name, flags)
|
||||
self._function_caches.append(function_cache)
|
||||
self._libraries.append(lib)
|
||||
return lib
|
||||
|
||||
def dlclose(self, lib):
|
||||
"""Close a library obtained with ffi.dlopen(). After this call,
|
||||
access to functions or variables from the library will fail
|
||||
(possibly with a segmentation fault).
|
||||
"""
|
||||
type(lib).__cffi_close__(lib)
|
||||
|
||||
def _typeof_locked(self, cdecl):
|
||||
# call me with the lock!
|
||||
key = cdecl
|
||||
if key in self._parsed_types:
|
||||
return self._parsed_types[key]
|
||||
#
|
||||
if not isinstance(cdecl, str): # unicode, on Python 2
|
||||
cdecl = cdecl.encode('ascii')
|
||||
#
|
||||
type = self._parser.parse_type(cdecl)
|
||||
really_a_function_type = type.is_raw_function
|
||||
if really_a_function_type:
|
||||
type = type.as_function_pointer()
|
||||
btype = self._get_cached_btype(type)
|
||||
result = btype, really_a_function_type
|
||||
self._parsed_types[key] = result
|
||||
return result
|
||||
|
||||
def _typeof(self, cdecl, consider_function_as_funcptr=False):
|
||||
# string -> ctype object
|
||||
try:
|
||||
result = self._parsed_types[cdecl]
|
||||
except KeyError:
|
||||
with self._lock:
|
||||
result = self._typeof_locked(cdecl)
|
||||
#
|
||||
btype, really_a_function_type = result
|
||||
if really_a_function_type and not consider_function_as_funcptr:
|
||||
raise CDefError("the type %r is a function type, not a "
|
||||
"pointer-to-function type" % (cdecl,))
|
||||
return btype
|
||||
|
||||
def typeof(self, cdecl):
|
||||
"""Parse the C type given as a string and return the
|
||||
corresponding <ctype> object.
|
||||
It can also be used on 'cdata' instance to get its C type.
|
||||
"""
|
||||
if isinstance(cdecl, basestring):
|
||||
return self._typeof(cdecl)
|
||||
if isinstance(cdecl, self.CData):
|
||||
return self._backend.typeof(cdecl)
|
||||
if isinstance(cdecl, types.BuiltinFunctionType):
|
||||
res = _builtin_function_type(cdecl)
|
||||
if res is not None:
|
||||
return res
|
||||
if (isinstance(cdecl, types.FunctionType)
|
||||
and hasattr(cdecl, '_cffi_base_type')):
|
||||
with self._lock:
|
||||
return self._get_cached_btype(cdecl._cffi_base_type)
|
||||
raise TypeError(type(cdecl))
|
||||
|
||||
def sizeof(self, cdecl):
|
||||
"""Return the size in bytes of the argument. It can be a
|
||||
string naming a C type, or a 'cdata' instance.
|
||||
"""
|
||||
if isinstance(cdecl, basestring):
|
||||
BType = self._typeof(cdecl)
|
||||
return self._backend.sizeof(BType)
|
||||
else:
|
||||
return self._backend.sizeof(cdecl)
|
||||
|
||||
def alignof(self, cdecl):
|
||||
"""Return the natural alignment size in bytes of the C type
|
||||
given as a string.
|
||||
"""
|
||||
if isinstance(cdecl, basestring):
|
||||
cdecl = self._typeof(cdecl)
|
||||
return self._backend.alignof(cdecl)
|
||||
|
||||
def offsetof(self, cdecl, *fields_or_indexes):
|
||||
"""Return the offset of the named field inside the given
|
||||
structure or array, which must be given as a C type name.
|
||||
You can give several field names in case of nested structures.
|
||||
You can also give numeric values which correspond to array
|
||||
items, in case of an array type.
|
||||
"""
|
||||
if isinstance(cdecl, basestring):
|
||||
cdecl = self._typeof(cdecl)
|
||||
return self._typeoffsetof(cdecl, *fields_or_indexes)[1]
|
||||
|
||||
def new(self, cdecl, init=None):
|
||||
"""Allocate an instance according to the specified C type and
|
||||
return a pointer to it. The specified C type must be either a
|
||||
pointer or an array: ``new('X *')`` allocates an X and returns
|
||||
a pointer to it, whereas ``new('X[n]')`` allocates an array of
|
||||
n X'es and returns an array referencing it (which works
|
||||
mostly like a pointer, like in C). You can also use
|
||||
``new('X[]', n)`` to allocate an array of a non-constant
|
||||
length n.
|
||||
|
||||
The memory is initialized following the rules of declaring a
|
||||
global variable in C: by default it is zero-initialized, but
|
||||
an explicit initializer can be given which can be used to
|
||||
fill all or part of the memory.
|
||||
|
||||
When the returned <cdata> object goes out of scope, the memory
|
||||
is freed. In other words the returned <cdata> object has
|
||||
ownership of the value of type 'cdecl' that it points to. This
|
||||
means that the raw data can be used as long as this object is
|
||||
kept alive, but must not be used for a longer time. Be careful
|
||||
about that when copying the pointer to the memory somewhere
|
||||
else, e.g. into another structure.
|
||||
"""
|
||||
if isinstance(cdecl, basestring):
|
||||
cdecl = self._typeof(cdecl)
|
||||
return self._backend.newp(cdecl, init)
|
||||
|
||||
def new_allocator(self, alloc=None, free=None,
|
||||
should_clear_after_alloc=True):
|
||||
"""Return a new allocator, i.e. a function that behaves like ffi.new()
|
||||
but uses the provided low-level 'alloc' and 'free' functions.
|
||||
|
||||
'alloc' is called with the size as argument. If it returns NULL, a
|
||||
MemoryError is raised. 'free' is called with the result of 'alloc'
|
||||
as argument. Both can be either Python function or directly C
|
||||
functions. If 'free' is None, then no free function is called.
|
||||
If both 'alloc' and 'free' are None, the default is used.
|
||||
|
||||
If 'should_clear_after_alloc' is set to False, then the memory
|
||||
returned by 'alloc' is assumed to be already cleared (or you are
|
||||
fine with garbage); otherwise CFFI will clear it.
|
||||
"""
|
||||
compiled_ffi = self._backend.FFI()
|
||||
allocator = compiled_ffi.new_allocator(alloc, free,
|
||||
should_clear_after_alloc)
|
||||
def allocate(cdecl, init=None):
|
||||
if isinstance(cdecl, basestring):
|
||||
cdecl = self._typeof(cdecl)
|
||||
return allocator(cdecl, init)
|
||||
return allocate
|
||||
|
||||
def cast(self, cdecl, source):
|
||||
"""Similar to a C cast: returns an instance of the named C
|
||||
type initialized with the given 'source'. The source is
|
||||
casted between integers or pointers of any type.
|
||||
"""
|
||||
if isinstance(cdecl, basestring):
|
||||
cdecl = self._typeof(cdecl)
|
||||
return self._backend.cast(cdecl, source)
|
||||
|
||||
def string(self, cdata, maxlen=-1):
|
||||
"""Return a Python string (or unicode string) from the 'cdata'.
|
||||
If 'cdata' is a pointer or array of characters or bytes, returns
|
||||
the null-terminated string. The returned string extends until
|
||||
the first null character, or at most 'maxlen' characters. If
|
||||
'cdata' is an array then 'maxlen' defaults to its length.
|
||||
|
||||
If 'cdata' is a pointer or array of wchar_t, returns a unicode
|
||||
string following the same rules.
|
||||
|
||||
If 'cdata' is a single character or byte or a wchar_t, returns
|
||||
it as a string or unicode string.
|
||||
|
||||
If 'cdata' is an enum, returns the value of the enumerator as a
|
||||
string, or 'NUMBER' if the value is out of range.
|
||||
"""
|
||||
return self._backend.string(cdata, maxlen)
|
||||
|
||||
def unpack(self, cdata, length):
|
||||
"""Unpack an array of C data of the given length,
|
||||
returning a Python string/unicode/list.
|
||||
|
||||
If 'cdata' is a pointer to 'char', returns a byte string.
|
||||
It does not stop at the first null. This is equivalent to:
|
||||
ffi.buffer(cdata, length)[:]
|
||||
|
||||
If 'cdata' is a pointer to 'wchar_t', returns a unicode string.
|
||||
'length' is measured in wchar_t's; it is not the size in bytes.
|
||||
|
||||
If 'cdata' is a pointer to anything else, returns a list of
|
||||
'length' items. This is a faster equivalent to:
|
||||
[cdata[i] for i in range(length)]
|
||||
"""
|
||||
return self._backend.unpack(cdata, length)
|
||||
|
||||
#def buffer(self, cdata, size=-1):
|
||||
# """Return a read-write buffer object that references the raw C data
|
||||
# pointed to by the given 'cdata'. The 'cdata' must be a pointer or
|
||||
# an array. Can be passed to functions expecting a buffer, or directly
|
||||
# manipulated with:
|
||||
#
|
||||
# buf[:] get a copy of it in a regular string, or
|
||||
# buf[idx] as a single character
|
||||
# buf[:] = ...
|
||||
# buf[idx] = ... change the content
|
||||
# """
|
||||
# note that 'buffer' is a type, set on this instance by __init__
|
||||
|
||||
def from_buffer(self, cdecl, python_buffer=_unspecified,
|
||||
require_writable=False):
|
||||
"""Return a cdata of the given type pointing to the data of the
|
||||
given Python object, which must support the buffer interface.
|
||||
Note that this is not meant to be used on the built-in types
|
||||
str or unicode (you can build 'char[]' arrays explicitly)
|
||||
but only on objects containing large quantities of raw data
|
||||
in some other format, like 'array.array' or numpy arrays.
|
||||
|
||||
The first argument is optional and default to 'char[]'.
|
||||
"""
|
||||
if python_buffer is _unspecified:
|
||||
cdecl, python_buffer = self.BCharA, cdecl
|
||||
elif isinstance(cdecl, basestring):
|
||||
cdecl = self._typeof(cdecl)
|
||||
return self._backend.from_buffer(cdecl, python_buffer,
|
||||
require_writable)
|
||||
|
||||
def memmove(self, dest, src, n):
|
||||
"""ffi.memmove(dest, src, n) copies n bytes of memory from src to dest.
|
||||
|
||||
Like the C function memmove(), the memory areas may overlap;
|
||||
apart from that it behaves like the C function memcpy().
|
||||
|
||||
'src' can be any cdata ptr or array, or any Python buffer object.
|
||||
'dest' can be any cdata ptr or array, or a writable Python buffer
|
||||
object. The size to copy, 'n', is always measured in bytes.
|
||||
|
||||
Unlike other methods, this one supports all Python buffer including
|
||||
byte strings and bytearrays---but it still does not support
|
||||
non-contiguous buffers.
|
||||
"""
|
||||
return self._backend.memmove(dest, src, n)
|
||||
|
||||
def callback(self, cdecl, python_callable=None, error=None, onerror=None):
|
||||
"""Return a callback object or a decorator making such a
|
||||
callback object. 'cdecl' must name a C function pointer type.
|
||||
The callback invokes the specified 'python_callable' (which may
|
||||
be provided either directly or via a decorator). Important: the
|
||||
callback object must be manually kept alive for as long as the
|
||||
callback may be invoked from the C level.
|
||||
"""
|
||||
def callback_decorator_wrap(python_callable):
|
||||
if not callable(python_callable):
|
||||
raise TypeError("the 'python_callable' argument "
|
||||
"is not callable")
|
||||
return self._backend.callback(cdecl, python_callable,
|
||||
error, onerror)
|
||||
if isinstance(cdecl, basestring):
|
||||
cdecl = self._typeof(cdecl, consider_function_as_funcptr=True)
|
||||
if python_callable is None:
|
||||
return callback_decorator_wrap # decorator mode
|
||||
else:
|
||||
return callback_decorator_wrap(python_callable) # direct mode
|
||||
|
||||
def getctype(self, cdecl, replace_with=''):
|
||||
"""Return a string giving the C type 'cdecl', which may be itself
|
||||
a string or a <ctype> object. If 'replace_with' is given, it gives
|
||||
extra text to append (or insert for more complicated C types), like
|
||||
a variable name, or '*' to get actually the C type 'pointer-to-cdecl'.
|
||||
"""
|
||||
if isinstance(cdecl, basestring):
|
||||
cdecl = self._typeof(cdecl)
|
||||
replace_with = replace_with.strip()
|
||||
if (replace_with.startswith('*')
|
||||
and '&[' in self._backend.getcname(cdecl, '&')):
|
||||
replace_with = '(%s)' % replace_with
|
||||
elif replace_with and not replace_with[0] in '[(':
|
||||
replace_with = ' ' + replace_with
|
||||
return self._backend.getcname(cdecl, replace_with)
|
||||
|
||||
def gc(self, cdata, destructor, size=0):
|
||||
"""Return a new cdata object that points to the same
|
||||
data. Later, when this new cdata object is garbage-collected,
|
||||
'destructor(old_cdata_object)' will be called.
|
||||
|
||||
The optional 'size' gives an estimate of the size, used to
|
||||
trigger the garbage collection more eagerly. So far only used
|
||||
on PyPy. It tells the GC that the returned object keeps alive
|
||||
roughly 'size' bytes of external memory.
|
||||
"""
|
||||
return self._backend.gcp(cdata, destructor, size)
|
||||
|
||||
def _get_cached_btype(self, type):
|
||||
assert self._lock.acquire(False) is False
|
||||
# call me with the lock!
|
||||
try:
|
||||
BType = self._cached_btypes[type]
|
||||
except KeyError:
|
||||
finishlist = []
|
||||
BType = type.get_cached_btype(self, finishlist)
|
||||
for type in finishlist:
|
||||
type.finish_backend_type(self, finishlist)
|
||||
return BType
|
||||
|
||||
def verify(self, source='', tmpdir=None, **kwargs):
|
||||
"""Verify that the current ffi signatures compile on this
|
||||
machine, and return a dynamic library object. The dynamic
|
||||
library can be used to call functions and access global
|
||||
variables declared in this 'ffi'. The library is compiled
|
||||
by the C compiler: it gives you C-level API compatibility
|
||||
(including calling macros). This is unlike 'ffi.dlopen()',
|
||||
which requires binary compatibility in the signatures.
|
||||
"""
|
||||
from .verifier import Verifier, _caller_dir_pycache
|
||||
#
|
||||
# If set_unicode(True) was called, insert the UNICODE and
|
||||
# _UNICODE macro declarations
|
||||
if self._windows_unicode:
|
||||
self._apply_windows_unicode(kwargs)
|
||||
#
|
||||
# Set the tmpdir here, and not in Verifier.__init__: it picks
|
||||
# up the caller's directory, which we want to be the caller of
|
||||
# ffi.verify(), as opposed to the caller of Veritier().
|
||||
tmpdir = tmpdir or _caller_dir_pycache()
|
||||
#
|
||||
# Make a Verifier() and use it to load the library.
|
||||
self.verifier = Verifier(self, source, tmpdir, **kwargs)
|
||||
lib = self.verifier.load_library()
|
||||
#
|
||||
# Save the loaded library for keep-alive purposes, even
|
||||
# if the caller doesn't keep it alive itself (it should).
|
||||
self._libraries.append(lib)
|
||||
return lib
|
||||
|
||||
def _get_errno(self):
|
||||
return self._backend.get_errno()
|
||||
def _set_errno(self, errno):
|
||||
self._backend.set_errno(errno)
|
||||
errno = property(_get_errno, _set_errno, None,
|
||||
"the value of 'errno' from/to the C calls")
|
||||
|
||||
def getwinerror(self, code=-1):
|
||||
return self._backend.getwinerror(code)
|
||||
|
||||
def _pointer_to(self, ctype):
|
||||
with self._lock:
|
||||
return model.pointer_cache(self, ctype)
|
||||
|
||||
def addressof(self, cdata, *fields_or_indexes):
|
||||
"""Return the address of a <cdata 'struct-or-union'>.
|
||||
If 'fields_or_indexes' are given, returns the address of that
|
||||
field or array item in the structure or array, recursively in
|
||||
case of nested structures.
|
||||
"""
|
||||
try:
|
||||
ctype = self._backend.typeof(cdata)
|
||||
except TypeError:
|
||||
if '__addressof__' in type(cdata).__dict__:
|
||||
return type(cdata).__addressof__(cdata, *fields_or_indexes)
|
||||
raise
|
||||
if fields_or_indexes:
|
||||
ctype, offset = self._typeoffsetof(ctype, *fields_or_indexes)
|
||||
else:
|
||||
if ctype.kind == "pointer":
|
||||
raise TypeError("addressof(pointer)")
|
||||
offset = 0
|
||||
ctypeptr = self._pointer_to(ctype)
|
||||
return self._backend.rawaddressof(ctypeptr, cdata, offset)
|
||||
|
||||
def _typeoffsetof(self, ctype, field_or_index, *fields_or_indexes):
|
||||
ctype, offset = self._backend.typeoffsetof(ctype, field_or_index)
|
||||
for field1 in fields_or_indexes:
|
||||
ctype, offset1 = self._backend.typeoffsetof(ctype, field1, 1)
|
||||
offset += offset1
|
||||
return ctype, offset
|
||||
|
||||
def include(self, ffi_to_include):
|
||||
"""Includes the typedefs, structs, unions and enums defined
|
||||
in another FFI instance. Usage is similar to a #include in C,
|
||||
where a part of the program might include types defined in
|
||||
another part for its own usage. Note that the include()
|
||||
method has no effect on functions, constants and global
|
||||
variables, which must anyway be accessed directly from the
|
||||
lib object returned by the original FFI instance.
|
||||
"""
|
||||
if not isinstance(ffi_to_include, FFI):
|
||||
raise TypeError("ffi.include() expects an argument that is also of"
|
||||
" type cffi.FFI, not %r" % (
|
||||
type(ffi_to_include).__name__,))
|
||||
if ffi_to_include is self:
|
||||
raise ValueError("self.include(self)")
|
||||
with ffi_to_include._lock:
|
||||
with self._lock:
|
||||
self._parser.include(ffi_to_include._parser)
|
||||
self._cdefsources.append('[')
|
||||
self._cdefsources.extend(ffi_to_include._cdefsources)
|
||||
self._cdefsources.append(']')
|
||||
self._included_ffis.append(ffi_to_include)
|
||||
|
||||
def new_handle(self, x):
|
||||
return self._backend.newp_handle(self.BVoidP, x)
|
||||
|
||||
def from_handle(self, x):
|
||||
return self._backend.from_handle(x)
|
||||
|
||||
def release(self, x):
|
||||
self._backend.release(x)
|
||||
|
||||
def set_unicode(self, enabled_flag):
|
||||
"""Windows: if 'enabled_flag' is True, enable the UNICODE and
|
||||
_UNICODE defines in C, and declare the types like TCHAR and LPTCSTR
|
||||
to be (pointers to) wchar_t. If 'enabled_flag' is False,
|
||||
declare these types to be (pointers to) plain 8-bit characters.
|
||||
This is mostly for backward compatibility; you usually want True.
|
||||
"""
|
||||
if self._windows_unicode is not None:
|
||||
raise ValueError("set_unicode() can only be called once")
|
||||
enabled_flag = bool(enabled_flag)
|
||||
if enabled_flag:
|
||||
self.cdef("typedef wchar_t TBYTE;"
|
||||
"typedef wchar_t TCHAR;"
|
||||
"typedef const wchar_t *LPCTSTR;"
|
||||
"typedef const wchar_t *PCTSTR;"
|
||||
"typedef wchar_t *LPTSTR;"
|
||||
"typedef wchar_t *PTSTR;"
|
||||
"typedef TBYTE *PTBYTE;"
|
||||
"typedef TCHAR *PTCHAR;")
|
||||
else:
|
||||
self.cdef("typedef char TBYTE;"
|
||||
"typedef char TCHAR;"
|
||||
"typedef const char *LPCTSTR;"
|
||||
"typedef const char *PCTSTR;"
|
||||
"typedef char *LPTSTR;"
|
||||
"typedef char *PTSTR;"
|
||||
"typedef TBYTE *PTBYTE;"
|
||||
"typedef TCHAR *PTCHAR;")
|
||||
self._windows_unicode = enabled_flag
|
||||
|
||||
def _apply_windows_unicode(self, kwds):
|
||||
defmacros = kwds.get('define_macros', ())
|
||||
if not isinstance(defmacros, (list, tuple)):
|
||||
raise TypeError("'define_macros' must be a list or tuple")
|
||||
defmacros = list(defmacros) + [('UNICODE', '1'),
|
||||
('_UNICODE', '1')]
|
||||
kwds['define_macros'] = defmacros
|
||||
|
||||
def _apply_embedding_fix(self, kwds):
|
||||
# must include an argument like "-lpython2.7" for the compiler
|
||||
def ensure(key, value):
|
||||
lst = kwds.setdefault(key, [])
|
||||
if value not in lst:
|
||||
lst.append(value)
|
||||
#
|
||||
if '__pypy__' in sys.builtin_module_names:
|
||||
import os
|
||||
if sys.platform == "win32":
|
||||
# we need 'libpypy-c.lib'. Current distributions of
|
||||
# pypy (>= 4.1) contain it as 'libs/python27.lib'.
|
||||
pythonlib = "python{0[0]}{0[1]}".format(sys.version_info)
|
||||
if hasattr(sys, 'prefix'):
|
||||
ensure('library_dirs', os.path.join(sys.prefix, 'libs'))
|
||||
else:
|
||||
# we need 'libpypy-c.{so,dylib}', which should be by
|
||||
# default located in 'sys.prefix/bin' for installed
|
||||
# systems.
|
||||
if sys.version_info < (3,):
|
||||
pythonlib = "pypy-c"
|
||||
else:
|
||||
pythonlib = "pypy3-c"
|
||||
if hasattr(sys, 'prefix'):
|
||||
ensure('library_dirs', os.path.join(sys.prefix, 'bin'))
|
||||
# On uninstalled pypy's, the libpypy-c is typically found in
|
||||
# .../pypy/goal/.
|
||||
if hasattr(sys, 'prefix'):
|
||||
ensure('library_dirs', os.path.join(sys.prefix, 'pypy', 'goal'))
|
||||
else:
|
||||
if sys.platform == "win32":
|
||||
template = "python%d%d"
|
||||
if hasattr(sys, 'gettotalrefcount'):
|
||||
template += '_d'
|
||||
else:
|
||||
try:
|
||||
import sysconfig
|
||||
except ImportError: # 2.6
|
||||
from cffi._shimmed_dist_utils import sysconfig
|
||||
template = "python%d.%d"
|
||||
if sysconfig.get_config_var('DEBUG_EXT'):
|
||||
template += sysconfig.get_config_var('DEBUG_EXT')
|
||||
pythonlib = (template %
|
||||
(sys.hexversion >> 24, (sys.hexversion >> 16) & 0xff))
|
||||
if hasattr(sys, 'abiflags'):
|
||||
pythonlib += sys.abiflags
|
||||
ensure('libraries', pythonlib)
|
||||
if sys.platform == "win32":
|
||||
ensure('extra_link_args', '/MANIFEST')
|
||||
|
||||
def set_source(self, module_name, source, source_extension='.c', **kwds):
|
||||
import os
|
||||
if hasattr(self, '_assigned_source'):
|
||||
raise ValueError("set_source() cannot be called several times "
|
||||
"per ffi object")
|
||||
if not isinstance(module_name, basestring):
|
||||
raise TypeError("'module_name' must be a string")
|
||||
if os.sep in module_name or (os.altsep and os.altsep in module_name):
|
||||
raise ValueError("'module_name' must not contain '/': use a dotted "
|
||||
"name to make a 'package.module' location")
|
||||
self._assigned_source = (str(module_name), source,
|
||||
source_extension, kwds)
|
||||
|
||||
def set_source_pkgconfig(self, module_name, pkgconfig_libs, source,
|
||||
source_extension='.c', **kwds):
|
||||
from . import pkgconfig
|
||||
if not isinstance(pkgconfig_libs, list):
|
||||
raise TypeError("the pkgconfig_libs argument must be a list "
|
||||
"of package names")
|
||||
kwds2 = pkgconfig.flags_from_pkgconfig(pkgconfig_libs)
|
||||
pkgconfig.merge_flags(kwds, kwds2)
|
||||
self.set_source(module_name, source, source_extension, **kwds)
|
||||
|
||||
def distutils_extension(self, tmpdir='build', verbose=True):
|
||||
from cffi._shimmed_dist_utils import mkpath
|
||||
from .recompiler import recompile
|
||||
#
|
||||
if not hasattr(self, '_assigned_source'):
|
||||
if hasattr(self, 'verifier'): # fallback, 'tmpdir' ignored
|
||||
return self.verifier.get_extension()
|
||||
raise ValueError("set_source() must be called before"
|
||||
" distutils_extension()")
|
||||
module_name, source, source_extension, kwds = self._assigned_source
|
||||
if source is None:
|
||||
raise TypeError("distutils_extension() is only for C extension "
|
||||
"modules, not for dlopen()-style pure Python "
|
||||
"modules")
|
||||
mkpath(tmpdir)
|
||||
ext, updated = recompile(self, module_name,
|
||||
source, tmpdir=tmpdir, extradir=tmpdir,
|
||||
source_extension=source_extension,
|
||||
call_c_compiler=False, **kwds)
|
||||
if verbose:
|
||||
if updated:
|
||||
sys.stderr.write("regenerated: %r\n" % (ext.sources[0],))
|
||||
else:
|
||||
sys.stderr.write("not modified: %r\n" % (ext.sources[0],))
|
||||
return ext
|
||||
|
||||
def emit_c_code(self, filename):
|
||||
from .recompiler import recompile
|
||||
#
|
||||
if not hasattr(self, '_assigned_source'):
|
||||
raise ValueError("set_source() must be called before emit_c_code()")
|
||||
module_name, source, source_extension, kwds = self._assigned_source
|
||||
if source is None:
|
||||
raise TypeError("emit_c_code() is only for C extension modules, "
|
||||
"not for dlopen()-style pure Python modules")
|
||||
recompile(self, module_name, source,
|
||||
c_file=filename, call_c_compiler=False,
|
||||
uses_ffiplatform=False, **kwds)
|
||||
|
||||
def emit_python_code(self, filename):
|
||||
from .recompiler import recompile
|
||||
#
|
||||
if not hasattr(self, '_assigned_source'):
|
||||
raise ValueError("set_source() must be called before emit_c_code()")
|
||||
module_name, source, source_extension, kwds = self._assigned_source
|
||||
if source is not None:
|
||||
raise TypeError("emit_python_code() is only for dlopen()-style "
|
||||
"pure Python modules, not for C extension modules")
|
||||
recompile(self, module_name, source,
|
||||
c_file=filename, call_c_compiler=False,
|
||||
uses_ffiplatform=False, **kwds)
|
||||
|
||||
def compile(self, tmpdir='.', verbose=0, target=None, debug=None):
|
||||
"""The 'target' argument gives the final file name of the
|
||||
compiled DLL. Use '*' to force distutils' choice, suitable for
|
||||
regular CPython C API modules. Use a file name ending in '.*'
|
||||
to ask for the system's default extension for dynamic libraries
|
||||
(.so/.dll/.dylib).
|
||||
|
||||
The default is '*' when building a non-embedded C API extension,
|
||||
and (module_name + '.*') when building an embedded library.
|
||||
"""
|
||||
from .recompiler import recompile
|
||||
#
|
||||
if not hasattr(self, '_assigned_source'):
|
||||
raise ValueError("set_source() must be called before compile()")
|
||||
module_name, source, source_extension, kwds = self._assigned_source
|
||||
return recompile(self, module_name, source, tmpdir=tmpdir,
|
||||
target=target, source_extension=source_extension,
|
||||
compiler_verbose=verbose, debug=debug, **kwds)
|
||||
|
||||
def init_once(self, func, tag):
|
||||
# Read _init_once_cache[tag], which is either (False, lock) if
|
||||
# we're calling the function now in some thread, or (True, result).
|
||||
# Don't call setdefault() in most cases, to avoid allocating and
|
||||
# immediately freeing a lock; but still use setdefaut() to avoid
|
||||
# races.
|
||||
try:
|
||||
x = self._init_once_cache[tag]
|
||||
except KeyError:
|
||||
x = self._init_once_cache.setdefault(tag, (False, allocate_lock()))
|
||||
# Common case: we got (True, result), so we return the result.
|
||||
if x[0]:
|
||||
return x[1]
|
||||
# Else, it's a lock. Acquire it to serialize the following tests.
|
||||
with x[1]:
|
||||
# Read again from _init_once_cache the current status.
|
||||
x = self._init_once_cache[tag]
|
||||
if x[0]:
|
||||
return x[1]
|
||||
# Call the function and store the result back.
|
||||
result = func()
|
||||
self._init_once_cache[tag] = (True, result)
|
||||
return result
|
||||
|
||||
def embedding_init_code(self, pysource):
|
||||
if self._embedding:
|
||||
raise ValueError("embedding_init_code() can only be called once")
|
||||
# fix 'pysource' before it gets dumped into the C file:
|
||||
# - remove empty lines at the beginning, so it starts at "line 1"
|
||||
# - dedent, if all non-empty lines are indented
|
||||
# - check for SyntaxErrors
|
||||
import re
|
||||
match = re.match(r'\s*\n', pysource)
|
||||
if match:
|
||||
pysource = pysource[match.end():]
|
||||
lines = pysource.splitlines() or ['']
|
||||
prefix = re.match(r'\s*', lines[0]).group()
|
||||
for i in range(1, len(lines)):
|
||||
line = lines[i]
|
||||
if line.rstrip():
|
||||
while not line.startswith(prefix):
|
||||
prefix = prefix[:-1]
|
||||
i = len(prefix)
|
||||
lines = [line[i:]+'\n' for line in lines]
|
||||
pysource = ''.join(lines)
|
||||
#
|
||||
compile(pysource, "cffi_init", "exec")
|
||||
#
|
||||
self._embedding = pysource
|
||||
|
||||
def def_extern(self, *args, **kwds):
|
||||
raise ValueError("ffi.def_extern() is only available on API-mode FFI "
|
||||
"objects")
|
||||
|
||||
def list_types(self):
|
||||
"""Returns the user type names known to this FFI instance.
|
||||
This returns a tuple containing three lists of names:
|
||||
(typedef_names, names_of_structs, names_of_unions)
|
||||
"""
|
||||
typedefs = []
|
||||
structs = []
|
||||
unions = []
|
||||
for key in self._parser._declarations:
|
||||
if key.startswith('typedef '):
|
||||
typedefs.append(key[8:])
|
||||
elif key.startswith('struct '):
|
||||
structs.append(key[7:])
|
||||
elif key.startswith('union '):
|
||||
unions.append(key[6:])
|
||||
typedefs.sort()
|
||||
structs.sort()
|
||||
unions.sort()
|
||||
return (typedefs, structs, unions)
|
||||
|
||||
|
||||
def _load_backend_lib(backend, name, flags):
|
||||
import os
|
||||
if not isinstance(name, basestring):
|
||||
if sys.platform != "win32" or name is not None:
|
||||
return backend.load_library(name, flags)
|
||||
name = "c" # Windows: load_library(None) fails, but this works
|
||||
# on Python 2 (backward compatibility hack only)
|
||||
first_error = None
|
||||
if '.' in name or '/' in name or os.sep in name:
|
||||
try:
|
||||
return backend.load_library(name, flags)
|
||||
except OSError as e:
|
||||
first_error = e
|
||||
import ctypes.util
|
||||
path = ctypes.util.find_library(name)
|
||||
if path is None:
|
||||
if name == "c" and sys.platform == "win32" and sys.version_info >= (3,):
|
||||
raise OSError("dlopen(None) cannot work on Windows for Python 3 "
|
||||
"(see http://bugs.python.org/issue23606)")
|
||||
msg = ("ctypes.util.find_library() did not manage "
|
||||
"to locate a library called %r" % (name,))
|
||||
if first_error is not None:
|
||||
msg = "%s. Additionally, %s" % (first_error, msg)
|
||||
raise OSError(msg)
|
||||
return backend.load_library(path, flags)
|
||||
|
||||
def _make_ffi_library(ffi, libname, flags):
|
||||
backend = ffi._backend
|
||||
backendlib = _load_backend_lib(backend, libname, flags)
|
||||
#
|
||||
def accessor_function(name):
|
||||
key = 'function ' + name
|
||||
tp, _ = ffi._parser._declarations[key]
|
||||
BType = ffi._get_cached_btype(tp)
|
||||
value = backendlib.load_function(BType, name)
|
||||
library.__dict__[name] = value
|
||||
#
|
||||
def accessor_variable(name):
|
||||
key = 'variable ' + name
|
||||
tp, _ = ffi._parser._declarations[key]
|
||||
BType = ffi._get_cached_btype(tp)
|
||||
read_variable = backendlib.read_variable
|
||||
write_variable = backendlib.write_variable
|
||||
setattr(FFILibrary, name, property(
|
||||
lambda self: read_variable(BType, name),
|
||||
lambda self, value: write_variable(BType, name, value)))
|
||||
#
|
||||
def addressof_var(name):
|
||||
try:
|
||||
return addr_variables[name]
|
||||
except KeyError:
|
||||
with ffi._lock:
|
||||
if name not in addr_variables:
|
||||
key = 'variable ' + name
|
||||
tp, _ = ffi._parser._declarations[key]
|
||||
BType = ffi._get_cached_btype(tp)
|
||||
if BType.kind != 'array':
|
||||
BType = model.pointer_cache(ffi, BType)
|
||||
p = backendlib.load_function(BType, name)
|
||||
addr_variables[name] = p
|
||||
return addr_variables[name]
|
||||
#
|
||||
def accessor_constant(name):
|
||||
raise NotImplementedError("non-integer constant '%s' cannot be "
|
||||
"accessed from a dlopen() library" % (name,))
|
||||
#
|
||||
def accessor_int_constant(name):
|
||||
library.__dict__[name] = ffi._parser._int_constants[name]
|
||||
#
|
||||
accessors = {}
|
||||
accessors_version = [False]
|
||||
addr_variables = {}
|
||||
#
|
||||
def update_accessors():
|
||||
if accessors_version[0] is ffi._cdef_version:
|
||||
return
|
||||
#
|
||||
for key, (tp, _) in ffi._parser._declarations.items():
|
||||
if not isinstance(tp, model.EnumType):
|
||||
tag, name = key.split(' ', 1)
|
||||
if tag == 'function':
|
||||
accessors[name] = accessor_function
|
||||
elif tag == 'variable':
|
||||
accessors[name] = accessor_variable
|
||||
elif tag == 'constant':
|
||||
accessors[name] = accessor_constant
|
||||
else:
|
||||
for i, enumname in enumerate(tp.enumerators):
|
||||
def accessor_enum(name, tp=tp, i=i):
|
||||
tp.check_not_partial()
|
||||
library.__dict__[name] = tp.enumvalues[i]
|
||||
accessors[enumname] = accessor_enum
|
||||
for name in ffi._parser._int_constants:
|
||||
accessors.setdefault(name, accessor_int_constant)
|
||||
accessors_version[0] = ffi._cdef_version
|
||||
#
|
||||
def make_accessor(name):
|
||||
with ffi._lock:
|
||||
if name in library.__dict__ or name in FFILibrary.__dict__:
|
||||
return # added by another thread while waiting for the lock
|
||||
if name not in accessors:
|
||||
update_accessors()
|
||||
if name not in accessors:
|
||||
raise AttributeError(name)
|
||||
accessors[name](name)
|
||||
#
|
||||
class FFILibrary(object):
|
||||
def __getattr__(self, name):
|
||||
make_accessor(name)
|
||||
return getattr(self, name)
|
||||
def __setattr__(self, name, value):
|
||||
try:
|
||||
property = getattr(self.__class__, name)
|
||||
except AttributeError:
|
||||
make_accessor(name)
|
||||
setattr(self, name, value)
|
||||
else:
|
||||
property.__set__(self, value)
|
||||
def __dir__(self):
|
||||
with ffi._lock:
|
||||
update_accessors()
|
||||
return accessors.keys()
|
||||
def __addressof__(self, name):
|
||||
if name in library.__dict__:
|
||||
return library.__dict__[name]
|
||||
if name in FFILibrary.__dict__:
|
||||
return addressof_var(name)
|
||||
make_accessor(name)
|
||||
if name in library.__dict__:
|
||||
return library.__dict__[name]
|
||||
if name in FFILibrary.__dict__:
|
||||
return addressof_var(name)
|
||||
raise AttributeError("cffi library has no function or "
|
||||
"global variable named '%s'" % (name,))
|
||||
def __cffi_close__(self):
|
||||
backendlib.close_lib()
|
||||
self.__dict__.clear()
|
||||
#
|
||||
if isinstance(libname, basestring):
|
||||
try:
|
||||
if not isinstance(libname, str): # unicode, on Python 2
|
||||
libname = libname.encode('utf-8')
|
||||
FFILibrary.__name__ = 'FFILibrary_%s' % libname
|
||||
except UnicodeError:
|
||||
pass
|
||||
library = FFILibrary()
|
||||
return library, library.__dict__
|
||||
|
||||
def _builtin_function_type(func):
|
||||
# a hack to make at least ffi.typeof(builtin_function) work,
|
||||
# if the builtin function was obtained by 'vengine_cpy'.
|
||||
import sys
|
||||
try:
|
||||
module = sys.modules[func.__module__]
|
||||
ffi = module._cffi_original_ffi
|
||||
types_of_builtin_funcs = module._cffi_types_of_builtin_funcs
|
||||
tp = types_of_builtin_funcs[func]
|
||||
except (KeyError, AttributeError, TypeError):
|
||||
return None
|
||||
else:
|
||||
with ffi._lock:
|
||||
return ffi._get_cached_btype(tp)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,187 @@
|
||||
from .error import VerificationError
|
||||
|
||||
class CffiOp(object):
|
||||
def __init__(self, op, arg):
|
||||
self.op = op
|
||||
self.arg = arg
|
||||
|
||||
def as_c_expr(self):
|
||||
if self.op is None:
|
||||
assert isinstance(self.arg, str)
|
||||
return '(_cffi_opcode_t)(%s)' % (self.arg,)
|
||||
classname = CLASS_NAME[self.op]
|
||||
return '_CFFI_OP(_CFFI_OP_%s, %s)' % (classname, self.arg)
|
||||
|
||||
def as_python_bytes(self):
|
||||
if self.op is None and self.arg.isdigit():
|
||||
value = int(self.arg) # non-negative: '-' not in self.arg
|
||||
if value >= 2**31:
|
||||
raise OverflowError("cannot emit %r: limited to 2**31-1"
|
||||
% (self.arg,))
|
||||
return format_four_bytes(value)
|
||||
if isinstance(self.arg, str):
|
||||
raise VerificationError("cannot emit to Python: %r" % (self.arg,))
|
||||
return format_four_bytes((self.arg << 8) | self.op)
|
||||
|
||||
def __str__(self):
|
||||
classname = CLASS_NAME.get(self.op, self.op)
|
||||
return '(%s %s)' % (classname, self.arg)
|
||||
|
||||
def format_four_bytes(num):
|
||||
return '\\x%02X\\x%02X\\x%02X\\x%02X' % (
|
||||
(num >> 24) & 0xFF,
|
||||
(num >> 16) & 0xFF,
|
||||
(num >> 8) & 0xFF,
|
||||
(num ) & 0xFF)
|
||||
|
||||
OP_PRIMITIVE = 1
|
||||
OP_POINTER = 3
|
||||
OP_ARRAY = 5
|
||||
OP_OPEN_ARRAY = 7
|
||||
OP_STRUCT_UNION = 9
|
||||
OP_ENUM = 11
|
||||
OP_FUNCTION = 13
|
||||
OP_FUNCTION_END = 15
|
||||
OP_NOOP = 17
|
||||
OP_BITFIELD = 19
|
||||
OP_TYPENAME = 21
|
||||
OP_CPYTHON_BLTN_V = 23 # varargs
|
||||
OP_CPYTHON_BLTN_N = 25 # noargs
|
||||
OP_CPYTHON_BLTN_O = 27 # O (i.e. a single arg)
|
||||
OP_CONSTANT = 29
|
||||
OP_CONSTANT_INT = 31
|
||||
OP_GLOBAL_VAR = 33
|
||||
OP_DLOPEN_FUNC = 35
|
||||
OP_DLOPEN_CONST = 37
|
||||
OP_GLOBAL_VAR_F = 39
|
||||
OP_EXTERN_PYTHON = 41
|
||||
|
||||
PRIM_VOID = 0
|
||||
PRIM_BOOL = 1
|
||||
PRIM_CHAR = 2
|
||||
PRIM_SCHAR = 3
|
||||
PRIM_UCHAR = 4
|
||||
PRIM_SHORT = 5
|
||||
PRIM_USHORT = 6
|
||||
PRIM_INT = 7
|
||||
PRIM_UINT = 8
|
||||
PRIM_LONG = 9
|
||||
PRIM_ULONG = 10
|
||||
PRIM_LONGLONG = 11
|
||||
PRIM_ULONGLONG = 12
|
||||
PRIM_FLOAT = 13
|
||||
PRIM_DOUBLE = 14
|
||||
PRIM_LONGDOUBLE = 15
|
||||
|
||||
PRIM_WCHAR = 16
|
||||
PRIM_INT8 = 17
|
||||
PRIM_UINT8 = 18
|
||||
PRIM_INT16 = 19
|
||||
PRIM_UINT16 = 20
|
||||
PRIM_INT32 = 21
|
||||
PRIM_UINT32 = 22
|
||||
PRIM_INT64 = 23
|
||||
PRIM_UINT64 = 24
|
||||
PRIM_INTPTR = 25
|
||||
PRIM_UINTPTR = 26
|
||||
PRIM_PTRDIFF = 27
|
||||
PRIM_SIZE = 28
|
||||
PRIM_SSIZE = 29
|
||||
PRIM_INT_LEAST8 = 30
|
||||
PRIM_UINT_LEAST8 = 31
|
||||
PRIM_INT_LEAST16 = 32
|
||||
PRIM_UINT_LEAST16 = 33
|
||||
PRIM_INT_LEAST32 = 34
|
||||
PRIM_UINT_LEAST32 = 35
|
||||
PRIM_INT_LEAST64 = 36
|
||||
PRIM_UINT_LEAST64 = 37
|
||||
PRIM_INT_FAST8 = 38
|
||||
PRIM_UINT_FAST8 = 39
|
||||
PRIM_INT_FAST16 = 40
|
||||
PRIM_UINT_FAST16 = 41
|
||||
PRIM_INT_FAST32 = 42
|
||||
PRIM_UINT_FAST32 = 43
|
||||
PRIM_INT_FAST64 = 44
|
||||
PRIM_UINT_FAST64 = 45
|
||||
PRIM_INTMAX = 46
|
||||
PRIM_UINTMAX = 47
|
||||
PRIM_FLOATCOMPLEX = 48
|
||||
PRIM_DOUBLECOMPLEX = 49
|
||||
PRIM_CHAR16 = 50
|
||||
PRIM_CHAR32 = 51
|
||||
|
||||
_NUM_PRIM = 52
|
||||
_UNKNOWN_PRIM = -1
|
||||
_UNKNOWN_FLOAT_PRIM = -2
|
||||
_UNKNOWN_LONG_DOUBLE = -3
|
||||
|
||||
_IO_FILE_STRUCT = -1
|
||||
|
||||
PRIMITIVE_TO_INDEX = {
|
||||
'char': PRIM_CHAR,
|
||||
'short': PRIM_SHORT,
|
||||
'int': PRIM_INT,
|
||||
'long': PRIM_LONG,
|
||||
'long long': PRIM_LONGLONG,
|
||||
'signed char': PRIM_SCHAR,
|
||||
'unsigned char': PRIM_UCHAR,
|
||||
'unsigned short': PRIM_USHORT,
|
||||
'unsigned int': PRIM_UINT,
|
||||
'unsigned long': PRIM_ULONG,
|
||||
'unsigned long long': PRIM_ULONGLONG,
|
||||
'float': PRIM_FLOAT,
|
||||
'double': PRIM_DOUBLE,
|
||||
'long double': PRIM_LONGDOUBLE,
|
||||
'_cffi_float_complex_t': PRIM_FLOATCOMPLEX,
|
||||
'_cffi_double_complex_t': PRIM_DOUBLECOMPLEX,
|
||||
'_Bool': PRIM_BOOL,
|
||||
'wchar_t': PRIM_WCHAR,
|
||||
'char16_t': PRIM_CHAR16,
|
||||
'char32_t': PRIM_CHAR32,
|
||||
'int8_t': PRIM_INT8,
|
||||
'uint8_t': PRIM_UINT8,
|
||||
'int16_t': PRIM_INT16,
|
||||
'uint16_t': PRIM_UINT16,
|
||||
'int32_t': PRIM_INT32,
|
||||
'uint32_t': PRIM_UINT32,
|
||||
'int64_t': PRIM_INT64,
|
||||
'uint64_t': PRIM_UINT64,
|
||||
'intptr_t': PRIM_INTPTR,
|
||||
'uintptr_t': PRIM_UINTPTR,
|
||||
'ptrdiff_t': PRIM_PTRDIFF,
|
||||
'size_t': PRIM_SIZE,
|
||||
'ssize_t': PRIM_SSIZE,
|
||||
'int_least8_t': PRIM_INT_LEAST8,
|
||||
'uint_least8_t': PRIM_UINT_LEAST8,
|
||||
'int_least16_t': PRIM_INT_LEAST16,
|
||||
'uint_least16_t': PRIM_UINT_LEAST16,
|
||||
'int_least32_t': PRIM_INT_LEAST32,
|
||||
'uint_least32_t': PRIM_UINT_LEAST32,
|
||||
'int_least64_t': PRIM_INT_LEAST64,
|
||||
'uint_least64_t': PRIM_UINT_LEAST64,
|
||||
'int_fast8_t': PRIM_INT_FAST8,
|
||||
'uint_fast8_t': PRIM_UINT_FAST8,
|
||||
'int_fast16_t': PRIM_INT_FAST16,
|
||||
'uint_fast16_t': PRIM_UINT_FAST16,
|
||||
'int_fast32_t': PRIM_INT_FAST32,
|
||||
'uint_fast32_t': PRIM_UINT_FAST32,
|
||||
'int_fast64_t': PRIM_INT_FAST64,
|
||||
'uint_fast64_t': PRIM_UINT_FAST64,
|
||||
'intmax_t': PRIM_INTMAX,
|
||||
'uintmax_t': PRIM_UINTMAX,
|
||||
}
|
||||
|
||||
F_UNION = 0x01
|
||||
F_CHECK_FIELDS = 0x02
|
||||
F_PACKED = 0x04
|
||||
F_EXTERNAL = 0x08
|
||||
F_OPAQUE = 0x10
|
||||
|
||||
G_FLAGS = dict([('_CFFI_' + _key, globals()[_key])
|
||||
for _key in ['F_UNION', 'F_CHECK_FIELDS', 'F_PACKED',
|
||||
'F_EXTERNAL', 'F_OPAQUE']])
|
||||
|
||||
CLASS_NAME = {}
|
||||
for _name, _value in list(globals().items()):
|
||||
if _name.startswith('OP_') and isinstance(_value, int):
|
||||
CLASS_NAME[_value] = _name[3:]
|
||||
@@ -0,0 +1,82 @@
|
||||
import sys
|
||||
from . import model
|
||||
from .error import FFIError
|
||||
|
||||
|
||||
COMMON_TYPES = {}
|
||||
|
||||
try:
|
||||
# fetch "bool" and all simple Windows types
|
||||
from _cffi_backend import _get_common_types
|
||||
_get_common_types(COMMON_TYPES)
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
COMMON_TYPES['FILE'] = model.unknown_type('FILE', '_IO_FILE')
|
||||
COMMON_TYPES['bool'] = '_Bool' # in case we got ImportError above
|
||||
COMMON_TYPES['float _Complex'] = '_cffi_float_complex_t'
|
||||
COMMON_TYPES['double _Complex'] = '_cffi_double_complex_t'
|
||||
|
||||
for _type in model.PrimitiveType.ALL_PRIMITIVE_TYPES:
|
||||
if _type.endswith('_t'):
|
||||
COMMON_TYPES[_type] = _type
|
||||
del _type
|
||||
|
||||
_CACHE = {}
|
||||
|
||||
def resolve_common_type(parser, commontype):
|
||||
try:
|
||||
return _CACHE[commontype]
|
||||
except KeyError:
|
||||
cdecl = COMMON_TYPES.get(commontype, commontype)
|
||||
if not isinstance(cdecl, str):
|
||||
result, quals = cdecl, 0 # cdecl is already a BaseType
|
||||
elif cdecl in model.PrimitiveType.ALL_PRIMITIVE_TYPES:
|
||||
result, quals = model.PrimitiveType(cdecl), 0
|
||||
elif cdecl == 'set-unicode-needed':
|
||||
raise FFIError("The Windows type %r is only available after "
|
||||
"you call ffi.set_unicode()" % (commontype,))
|
||||
else:
|
||||
if commontype == cdecl:
|
||||
raise FFIError(
|
||||
"Unsupported type: %r. Please look at "
|
||||
"http://cffi.readthedocs.io/en/latest/cdef.html#ffi-cdef-limitations "
|
||||
"and file an issue if you think this type should really "
|
||||
"be supported." % (commontype,))
|
||||
result, quals = parser.parse_type_and_quals(cdecl) # recursive
|
||||
|
||||
assert isinstance(result, model.BaseTypeByIdentity)
|
||||
_CACHE[commontype] = result, quals
|
||||
return result, quals
|
||||
|
||||
|
||||
# ____________________________________________________________
|
||||
# extra types for Windows (most of them are in commontypes.c)
|
||||
|
||||
|
||||
def win_common_types():
|
||||
return {
|
||||
"UNICODE_STRING": model.StructType(
|
||||
"_UNICODE_STRING",
|
||||
["Length",
|
||||
"MaximumLength",
|
||||
"Buffer"],
|
||||
[model.PrimitiveType("unsigned short"),
|
||||
model.PrimitiveType("unsigned short"),
|
||||
model.PointerType(model.PrimitiveType("wchar_t"))],
|
||||
[-1, -1, -1]),
|
||||
"PUNICODE_STRING": "UNICODE_STRING *",
|
||||
"PCUNICODE_STRING": "const UNICODE_STRING *",
|
||||
|
||||
"TBYTE": "set-unicode-needed",
|
||||
"TCHAR": "set-unicode-needed",
|
||||
"LPCTSTR": "set-unicode-needed",
|
||||
"PCTSTR": "set-unicode-needed",
|
||||
"LPTSTR": "set-unicode-needed",
|
||||
"PTSTR": "set-unicode-needed",
|
||||
"PTBYTE": "set-unicode-needed",
|
||||
"PTCHAR": "set-unicode-needed",
|
||||
}
|
||||
|
||||
if sys.platform == 'win32':
|
||||
COMMON_TYPES.update(win_common_types())
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,31 @@
|
||||
|
||||
class FFIError(Exception):
|
||||
__module__ = 'cffi'
|
||||
|
||||
class CDefError(Exception):
|
||||
__module__ = 'cffi'
|
||||
def __str__(self):
|
||||
try:
|
||||
current_decl = self.args[1]
|
||||
filename = current_decl.coord.file
|
||||
linenum = current_decl.coord.line
|
||||
prefix = '%s:%d: ' % (filename, linenum)
|
||||
except (AttributeError, TypeError, IndexError):
|
||||
prefix = ''
|
||||
return '%s%s' % (prefix, self.args[0])
|
||||
|
||||
class VerificationError(Exception):
|
||||
""" An error raised when verification fails
|
||||
"""
|
||||
__module__ = 'cffi'
|
||||
|
||||
class VerificationMissing(Exception):
|
||||
""" An error raised when incomplete structures are passed into
|
||||
cdef, but no verification has been done
|
||||
"""
|
||||
__module__ = 'cffi'
|
||||
|
||||
class PkgConfigError(Exception):
|
||||
""" An error raised for missing modules in pkg-config
|
||||
"""
|
||||
__module__ = 'cffi'
|
||||
@@ -0,0 +1,113 @@
|
||||
import sys, os
|
||||
from .error import VerificationError
|
||||
|
||||
|
||||
LIST_OF_FILE_NAMES = ['sources', 'include_dirs', 'library_dirs',
|
||||
'extra_objects', 'depends']
|
||||
|
||||
def get_extension(srcfilename, modname, sources=(), **kwds):
|
||||
from cffi._shimmed_dist_utils import Extension
|
||||
allsources = [srcfilename]
|
||||
for src in sources:
|
||||
allsources.append(os.path.normpath(src))
|
||||
return Extension(name=modname, sources=allsources, **kwds)
|
||||
|
||||
def compile(tmpdir, ext, compiler_verbose=0, debug=None):
|
||||
"""Compile a C extension module using distutils."""
|
||||
|
||||
saved_environ = os.environ.copy()
|
||||
try:
|
||||
outputfilename = _build(tmpdir, ext, compiler_verbose, debug)
|
||||
outputfilename = os.path.abspath(outputfilename)
|
||||
finally:
|
||||
# workaround for a distutils bugs where some env vars can
|
||||
# become longer and longer every time it is used
|
||||
for key, value in saved_environ.items():
|
||||
if os.environ.get(key) != value:
|
||||
os.environ[key] = value
|
||||
return outputfilename
|
||||
|
||||
def _build(tmpdir, ext, compiler_verbose=0, debug=None):
|
||||
# XXX compact but horrible :-(
|
||||
from cffi._shimmed_dist_utils import Distribution, CompileError, LinkError, set_threshold, set_verbosity
|
||||
|
||||
dist = Distribution({'ext_modules': [ext]})
|
||||
dist.parse_config_files()
|
||||
options = dist.get_option_dict('build_ext')
|
||||
if debug is None:
|
||||
debug = sys.flags.debug
|
||||
options['debug'] = ('ffiplatform', debug)
|
||||
options['force'] = ('ffiplatform', True)
|
||||
options['build_lib'] = ('ffiplatform', tmpdir)
|
||||
options['build_temp'] = ('ffiplatform', tmpdir)
|
||||
#
|
||||
try:
|
||||
old_level = set_threshold(0) or 0
|
||||
try:
|
||||
set_verbosity(compiler_verbose)
|
||||
dist.run_command('build_ext')
|
||||
cmd_obj = dist.get_command_obj('build_ext')
|
||||
[soname] = cmd_obj.get_outputs()
|
||||
finally:
|
||||
set_threshold(old_level)
|
||||
except (CompileError, LinkError) as e:
|
||||
raise VerificationError('%s: %s' % (e.__class__.__name__, e))
|
||||
#
|
||||
return soname
|
||||
|
||||
try:
|
||||
from os.path import samefile
|
||||
except ImportError:
|
||||
def samefile(f1, f2):
|
||||
return os.path.abspath(f1) == os.path.abspath(f2)
|
||||
|
||||
def maybe_relative_path(path):
|
||||
if not os.path.isabs(path):
|
||||
return path # already relative
|
||||
dir = path
|
||||
names = []
|
||||
while True:
|
||||
prevdir = dir
|
||||
dir, name = os.path.split(prevdir)
|
||||
if dir == prevdir or not dir:
|
||||
return path # failed to make it relative
|
||||
names.append(name)
|
||||
try:
|
||||
if samefile(dir, os.curdir):
|
||||
names.reverse()
|
||||
return os.path.join(*names)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
# ____________________________________________________________
|
||||
|
||||
try:
|
||||
int_or_long = (int, long)
|
||||
import cStringIO
|
||||
except NameError:
|
||||
int_or_long = int # Python 3
|
||||
import io as cStringIO
|
||||
|
||||
def _flatten(x, f):
|
||||
if isinstance(x, str):
|
||||
f.write('%ds%s' % (len(x), x))
|
||||
elif isinstance(x, dict):
|
||||
keys = sorted(x.keys())
|
||||
f.write('%dd' % len(keys))
|
||||
for key in keys:
|
||||
_flatten(key, f)
|
||||
_flatten(x[key], f)
|
||||
elif isinstance(x, (list, tuple)):
|
||||
f.write('%dl' % len(x))
|
||||
for value in x:
|
||||
_flatten(value, f)
|
||||
elif isinstance(x, int_or_long):
|
||||
f.write('%di' % (x,))
|
||||
else:
|
||||
raise TypeError(
|
||||
"the keywords to verify() contains unsupported object %r" % (x,))
|
||||
|
||||
def flatten(x):
|
||||
f = cStringIO.StringIO()
|
||||
_flatten(x, f)
|
||||
return f.getvalue()
|
||||
@@ -0,0 +1,30 @@
|
||||
import sys
|
||||
|
||||
if sys.version_info < (3,):
|
||||
try:
|
||||
from thread import allocate_lock
|
||||
except ImportError:
|
||||
from dummy_thread import allocate_lock
|
||||
else:
|
||||
try:
|
||||
from _thread import allocate_lock
|
||||
except ImportError:
|
||||
from _dummy_thread import allocate_lock
|
||||
|
||||
|
||||
##import sys
|
||||
##l1 = allocate_lock
|
||||
|
||||
##class allocate_lock(object):
|
||||
## def __init__(self):
|
||||
## self._real = l1()
|
||||
## def __enter__(self):
|
||||
## for i in range(4, 0, -1):
|
||||
## print sys._getframe(i).f_code
|
||||
## print
|
||||
## return self._real.__enter__()
|
||||
## def __exit__(self, *args):
|
||||
## return self._real.__exit__(*args)
|
||||
## def acquire(self, f):
|
||||
## assert f is False
|
||||
## return self._real.acquire(f)
|
||||
@@ -0,0 +1,618 @@
|
||||
import types
|
||||
import weakref
|
||||
|
||||
from .lock import allocate_lock
|
||||
from .error import CDefError, VerificationError, VerificationMissing
|
||||
|
||||
# type qualifiers
|
||||
Q_CONST = 0x01
|
||||
Q_RESTRICT = 0x02
|
||||
Q_VOLATILE = 0x04
|
||||
|
||||
def qualify(quals, replace_with):
|
||||
if quals & Q_CONST:
|
||||
replace_with = ' const ' + replace_with.lstrip()
|
||||
if quals & Q_VOLATILE:
|
||||
replace_with = ' volatile ' + replace_with.lstrip()
|
||||
if quals & Q_RESTRICT:
|
||||
# It seems that __restrict is supported by gcc and msvc.
|
||||
# If you hit some different compiler, add a #define in
|
||||
# _cffi_include.h for it (and in its copies, documented there)
|
||||
replace_with = ' __restrict ' + replace_with.lstrip()
|
||||
return replace_with
|
||||
|
||||
|
||||
class BaseTypeByIdentity(object):
|
||||
is_array_type = False
|
||||
is_raw_function = False
|
||||
|
||||
def get_c_name(self, replace_with='', context='a C file', quals=0):
|
||||
result = self.c_name_with_marker
|
||||
assert result.count('&') == 1
|
||||
# some logic duplication with ffi.getctype()... :-(
|
||||
replace_with = replace_with.strip()
|
||||
if replace_with:
|
||||
if replace_with.startswith('*') and '&[' in result:
|
||||
replace_with = '(%s)' % replace_with
|
||||
elif not replace_with[0] in '[(':
|
||||
replace_with = ' ' + replace_with
|
||||
replace_with = qualify(quals, replace_with)
|
||||
result = result.replace('&', replace_with)
|
||||
if '$' in result:
|
||||
raise VerificationError(
|
||||
"cannot generate '%s' in %s: unknown type name"
|
||||
% (self._get_c_name(), context))
|
||||
return result
|
||||
|
||||
def _get_c_name(self):
|
||||
return self.c_name_with_marker.replace('&', '')
|
||||
|
||||
def has_c_name(self):
|
||||
return '$' not in self._get_c_name()
|
||||
|
||||
def is_integer_type(self):
|
||||
return False
|
||||
|
||||
def get_cached_btype(self, ffi, finishlist, can_delay=False):
|
||||
try:
|
||||
BType = ffi._cached_btypes[self]
|
||||
except KeyError:
|
||||
BType = self.build_backend_type(ffi, finishlist)
|
||||
BType2 = ffi._cached_btypes.setdefault(self, BType)
|
||||
assert BType2 is BType
|
||||
return BType
|
||||
|
||||
def __repr__(self):
|
||||
return '<%s>' % (self._get_c_name(),)
|
||||
|
||||
def _get_items(self):
|
||||
return [(name, getattr(self, name)) for name in self._attrs_]
|
||||
|
||||
|
||||
class BaseType(BaseTypeByIdentity):
|
||||
|
||||
def __eq__(self, other):
|
||||
return (self.__class__ == other.__class__ and
|
||||
self._get_items() == other._get_items())
|
||||
|
||||
def __ne__(self, other):
|
||||
return not self == other
|
||||
|
||||
def __hash__(self):
|
||||
return hash((self.__class__, tuple(self._get_items())))
|
||||
|
||||
|
||||
class VoidType(BaseType):
|
||||
_attrs_ = ()
|
||||
|
||||
def __init__(self):
|
||||
self.c_name_with_marker = 'void&'
|
||||
|
||||
def build_backend_type(self, ffi, finishlist):
|
||||
return global_cache(self, ffi, 'new_void_type')
|
||||
|
||||
void_type = VoidType()
|
||||
|
||||
|
||||
class BasePrimitiveType(BaseType):
|
||||
def is_complex_type(self):
|
||||
return False
|
||||
|
||||
|
||||
class PrimitiveType(BasePrimitiveType):
|
||||
_attrs_ = ('name',)
|
||||
|
||||
ALL_PRIMITIVE_TYPES = {
|
||||
'char': 'c',
|
||||
'short': 'i',
|
||||
'int': 'i',
|
||||
'long': 'i',
|
||||
'long long': 'i',
|
||||
'signed char': 'i',
|
||||
'unsigned char': 'i',
|
||||
'unsigned short': 'i',
|
||||
'unsigned int': 'i',
|
||||
'unsigned long': 'i',
|
||||
'unsigned long long': 'i',
|
||||
'float': 'f',
|
||||
'double': 'f',
|
||||
'long double': 'f',
|
||||
'_cffi_float_complex_t': 'j',
|
||||
'_cffi_double_complex_t': 'j',
|
||||
'_Bool': 'i',
|
||||
# the following types are not primitive in the C sense
|
||||
'wchar_t': 'c',
|
||||
'char16_t': 'c',
|
||||
'char32_t': 'c',
|
||||
'int8_t': 'i',
|
||||
'uint8_t': 'i',
|
||||
'int16_t': 'i',
|
||||
'uint16_t': 'i',
|
||||
'int32_t': 'i',
|
||||
'uint32_t': 'i',
|
||||
'int64_t': 'i',
|
||||
'uint64_t': 'i',
|
||||
'int_least8_t': 'i',
|
||||
'uint_least8_t': 'i',
|
||||
'int_least16_t': 'i',
|
||||
'uint_least16_t': 'i',
|
||||
'int_least32_t': 'i',
|
||||
'uint_least32_t': 'i',
|
||||
'int_least64_t': 'i',
|
||||
'uint_least64_t': 'i',
|
||||
'int_fast8_t': 'i',
|
||||
'uint_fast8_t': 'i',
|
||||
'int_fast16_t': 'i',
|
||||
'uint_fast16_t': 'i',
|
||||
'int_fast32_t': 'i',
|
||||
'uint_fast32_t': 'i',
|
||||
'int_fast64_t': 'i',
|
||||
'uint_fast64_t': 'i',
|
||||
'intptr_t': 'i',
|
||||
'uintptr_t': 'i',
|
||||
'intmax_t': 'i',
|
||||
'uintmax_t': 'i',
|
||||
'ptrdiff_t': 'i',
|
||||
'size_t': 'i',
|
||||
'ssize_t': 'i',
|
||||
}
|
||||
|
||||
def __init__(self, name):
|
||||
assert name in self.ALL_PRIMITIVE_TYPES
|
||||
self.name = name
|
||||
self.c_name_with_marker = name + '&'
|
||||
|
||||
def is_char_type(self):
|
||||
return self.ALL_PRIMITIVE_TYPES[self.name] == 'c'
|
||||
def is_integer_type(self):
|
||||
return self.ALL_PRIMITIVE_TYPES[self.name] == 'i'
|
||||
def is_float_type(self):
|
||||
return self.ALL_PRIMITIVE_TYPES[self.name] == 'f'
|
||||
def is_complex_type(self):
|
||||
return self.ALL_PRIMITIVE_TYPES[self.name] == 'j'
|
||||
|
||||
def build_backend_type(self, ffi, finishlist):
|
||||
return global_cache(self, ffi, 'new_primitive_type', self.name)
|
||||
|
||||
|
||||
class UnknownIntegerType(BasePrimitiveType):
|
||||
_attrs_ = ('name',)
|
||||
|
||||
def __init__(self, name):
|
||||
self.name = name
|
||||
self.c_name_with_marker = name + '&'
|
||||
|
||||
def is_integer_type(self):
|
||||
return True
|
||||
|
||||
def build_backend_type(self, ffi, finishlist):
|
||||
raise NotImplementedError("integer type '%s' can only be used after "
|
||||
"compilation" % self.name)
|
||||
|
||||
class UnknownFloatType(BasePrimitiveType):
|
||||
_attrs_ = ('name', )
|
||||
|
||||
def __init__(self, name):
|
||||
self.name = name
|
||||
self.c_name_with_marker = name + '&'
|
||||
|
||||
def build_backend_type(self, ffi, finishlist):
|
||||
raise NotImplementedError("float type '%s' can only be used after "
|
||||
"compilation" % self.name)
|
||||
|
||||
|
||||
class BaseFunctionType(BaseType):
|
||||
_attrs_ = ('args', 'result', 'ellipsis', 'abi')
|
||||
|
||||
def __init__(self, args, result, ellipsis, abi=None):
|
||||
self.args = args
|
||||
self.result = result
|
||||
self.ellipsis = ellipsis
|
||||
self.abi = abi
|
||||
#
|
||||
reprargs = [arg._get_c_name() for arg in self.args]
|
||||
if self.ellipsis:
|
||||
reprargs.append('...')
|
||||
reprargs = reprargs or ['void']
|
||||
replace_with = self._base_pattern % (', '.join(reprargs),)
|
||||
if abi is not None:
|
||||
replace_with = replace_with[:1] + abi + ' ' + replace_with[1:]
|
||||
self.c_name_with_marker = (
|
||||
self.result.c_name_with_marker.replace('&', replace_with))
|
||||
|
||||
|
||||
class RawFunctionType(BaseFunctionType):
|
||||
# Corresponds to a C type like 'int(int)', which is the C type of
|
||||
# a function, but not a pointer-to-function. The backend has no
|
||||
# notion of such a type; it's used temporarily by parsing.
|
||||
_base_pattern = '(&)(%s)'
|
||||
is_raw_function = True
|
||||
|
||||
def build_backend_type(self, ffi, finishlist):
|
||||
raise CDefError("cannot render the type %r: it is a function "
|
||||
"type, not a pointer-to-function type" % (self,))
|
||||
|
||||
def as_function_pointer(self):
|
||||
return FunctionPtrType(self.args, self.result, self.ellipsis, self.abi)
|
||||
|
||||
|
||||
class FunctionPtrType(BaseFunctionType):
|
||||
_base_pattern = '(*&)(%s)'
|
||||
|
||||
def build_backend_type(self, ffi, finishlist):
|
||||
result = self.result.get_cached_btype(ffi, finishlist)
|
||||
args = []
|
||||
for tp in self.args:
|
||||
args.append(tp.get_cached_btype(ffi, finishlist))
|
||||
abi_args = ()
|
||||
if self.abi == "__stdcall":
|
||||
if not self.ellipsis: # __stdcall ignored for variadic funcs
|
||||
try:
|
||||
abi_args = (ffi._backend.FFI_STDCALL,)
|
||||
except AttributeError:
|
||||
pass
|
||||
return global_cache(self, ffi, 'new_function_type',
|
||||
tuple(args), result, self.ellipsis, *abi_args)
|
||||
|
||||
def as_raw_function(self):
|
||||
return RawFunctionType(self.args, self.result, self.ellipsis, self.abi)
|
||||
|
||||
|
||||
class PointerType(BaseType):
|
||||
_attrs_ = ('totype', 'quals')
|
||||
|
||||
def __init__(self, totype, quals=0):
|
||||
self.totype = totype
|
||||
self.quals = quals
|
||||
extra = " *&"
|
||||
if totype.is_array_type:
|
||||
extra = "(%s)" % (extra.lstrip(),)
|
||||
extra = qualify(quals, extra)
|
||||
self.c_name_with_marker = totype.c_name_with_marker.replace('&', extra)
|
||||
|
||||
def build_backend_type(self, ffi, finishlist):
|
||||
BItem = self.totype.get_cached_btype(ffi, finishlist, can_delay=True)
|
||||
return global_cache(self, ffi, 'new_pointer_type', BItem)
|
||||
|
||||
voidp_type = PointerType(void_type)
|
||||
|
||||
def ConstPointerType(totype):
|
||||
return PointerType(totype, Q_CONST)
|
||||
|
||||
const_voidp_type = ConstPointerType(void_type)
|
||||
|
||||
|
||||
class NamedPointerType(PointerType):
|
||||
_attrs_ = ('totype', 'name')
|
||||
|
||||
def __init__(self, totype, name, quals=0):
|
||||
PointerType.__init__(self, totype, quals)
|
||||
self.name = name
|
||||
self.c_name_with_marker = name + '&'
|
||||
|
||||
|
||||
class ArrayType(BaseType):
|
||||
_attrs_ = ('item', 'length')
|
||||
is_array_type = True
|
||||
|
||||
def __init__(self, item, length):
|
||||
self.item = item
|
||||
self.length = length
|
||||
#
|
||||
if length is None:
|
||||
brackets = '&[]'
|
||||
elif length == '...':
|
||||
brackets = '&[/*...*/]'
|
||||
else:
|
||||
brackets = '&[%s]' % length
|
||||
self.c_name_with_marker = (
|
||||
self.item.c_name_with_marker.replace('&', brackets))
|
||||
|
||||
def length_is_unknown(self):
|
||||
return isinstance(self.length, str)
|
||||
|
||||
def resolve_length(self, newlength):
|
||||
return ArrayType(self.item, newlength)
|
||||
|
||||
def build_backend_type(self, ffi, finishlist):
|
||||
if self.length_is_unknown():
|
||||
raise CDefError("cannot render the type %r: unknown length" %
|
||||
(self,))
|
||||
self.item.get_cached_btype(ffi, finishlist) # force the item BType
|
||||
BPtrItem = PointerType(self.item).get_cached_btype(ffi, finishlist)
|
||||
return global_cache(self, ffi, 'new_array_type', BPtrItem, self.length)
|
||||
|
||||
char_array_type = ArrayType(PrimitiveType('char'), None)
|
||||
|
||||
|
||||
class StructOrUnionOrEnum(BaseTypeByIdentity):
|
||||
_attrs_ = ('name',)
|
||||
forcename = None
|
||||
|
||||
def build_c_name_with_marker(self):
|
||||
name = self.forcename or '%s %s' % (self.kind, self.name)
|
||||
self.c_name_with_marker = name + '&'
|
||||
|
||||
def force_the_name(self, forcename):
|
||||
self.forcename = forcename
|
||||
self.build_c_name_with_marker()
|
||||
|
||||
def get_official_name(self):
|
||||
assert self.c_name_with_marker.endswith('&')
|
||||
return self.c_name_with_marker[:-1]
|
||||
|
||||
|
||||
class StructOrUnion(StructOrUnionOrEnum):
|
||||
fixedlayout = None
|
||||
completed = 0
|
||||
partial = False
|
||||
packed = 0
|
||||
|
||||
def __init__(self, name, fldnames, fldtypes, fldbitsize, fldquals=None):
|
||||
self.name = name
|
||||
self.fldnames = fldnames
|
||||
self.fldtypes = fldtypes
|
||||
self.fldbitsize = fldbitsize
|
||||
self.fldquals = fldquals
|
||||
self.build_c_name_with_marker()
|
||||
|
||||
def anonymous_struct_fields(self):
|
||||
if self.fldtypes is not None:
|
||||
for name, type in zip(self.fldnames, self.fldtypes):
|
||||
if name == '' and isinstance(type, StructOrUnion):
|
||||
yield type
|
||||
|
||||
def enumfields(self, expand_anonymous_struct_union=True):
|
||||
fldquals = self.fldquals
|
||||
if fldquals is None:
|
||||
fldquals = (0,) * len(self.fldnames)
|
||||
for name, type, bitsize, quals in zip(self.fldnames, self.fldtypes,
|
||||
self.fldbitsize, fldquals):
|
||||
if (name == '' and isinstance(type, StructOrUnion)
|
||||
and expand_anonymous_struct_union):
|
||||
# nested anonymous struct/union
|
||||
for result in type.enumfields():
|
||||
yield result
|
||||
else:
|
||||
yield (name, type, bitsize, quals)
|
||||
|
||||
def force_flatten(self):
|
||||
# force the struct or union to have a declaration that lists
|
||||
# directly all fields returned by enumfields(), flattening
|
||||
# nested anonymous structs/unions.
|
||||
names = []
|
||||
types = []
|
||||
bitsizes = []
|
||||
fldquals = []
|
||||
for name, type, bitsize, quals in self.enumfields():
|
||||
names.append(name)
|
||||
types.append(type)
|
||||
bitsizes.append(bitsize)
|
||||
fldquals.append(quals)
|
||||
self.fldnames = tuple(names)
|
||||
self.fldtypes = tuple(types)
|
||||
self.fldbitsize = tuple(bitsizes)
|
||||
self.fldquals = tuple(fldquals)
|
||||
|
||||
def get_cached_btype(self, ffi, finishlist, can_delay=False):
|
||||
BType = StructOrUnionOrEnum.get_cached_btype(self, ffi, finishlist,
|
||||
can_delay)
|
||||
if not can_delay:
|
||||
self.finish_backend_type(ffi, finishlist)
|
||||
return BType
|
||||
|
||||
def finish_backend_type(self, ffi, finishlist):
|
||||
if self.completed:
|
||||
if self.completed != 2:
|
||||
raise NotImplementedError("recursive structure declaration "
|
||||
"for '%s'" % (self.name,))
|
||||
return
|
||||
BType = ffi._cached_btypes[self]
|
||||
#
|
||||
self.completed = 1
|
||||
#
|
||||
if self.fldtypes is None:
|
||||
pass # not completing it: it's an opaque struct
|
||||
#
|
||||
elif self.fixedlayout is None:
|
||||
fldtypes = [tp.get_cached_btype(ffi, finishlist)
|
||||
for tp in self.fldtypes]
|
||||
lst = list(zip(self.fldnames, fldtypes, self.fldbitsize))
|
||||
extra_flags = ()
|
||||
if self.packed:
|
||||
if self.packed == 1:
|
||||
extra_flags = (8,) # SF_PACKED
|
||||
else:
|
||||
extra_flags = (0, self.packed)
|
||||
ffi._backend.complete_struct_or_union(BType, lst, self,
|
||||
-1, -1, *extra_flags)
|
||||
#
|
||||
else:
|
||||
fldtypes = []
|
||||
fieldofs, fieldsize, totalsize, totalalignment = self.fixedlayout
|
||||
for i in range(len(self.fldnames)):
|
||||
fsize = fieldsize[i]
|
||||
ftype = self.fldtypes[i]
|
||||
#
|
||||
if isinstance(ftype, ArrayType) and ftype.length_is_unknown():
|
||||
# fix the length to match the total size
|
||||
BItemType = ftype.item.get_cached_btype(ffi, finishlist)
|
||||
nlen, nrest = divmod(fsize, ffi.sizeof(BItemType))
|
||||
if nrest != 0:
|
||||
self._verification_error(
|
||||
"field '%s.%s' has a bogus size?" % (
|
||||
self.name, self.fldnames[i] or '{}'))
|
||||
ftype = ftype.resolve_length(nlen)
|
||||
self.fldtypes = (self.fldtypes[:i] + (ftype,) +
|
||||
self.fldtypes[i+1:])
|
||||
#
|
||||
BFieldType = ftype.get_cached_btype(ffi, finishlist)
|
||||
if isinstance(ftype, ArrayType) and ftype.length is None:
|
||||
assert fsize == 0
|
||||
else:
|
||||
bitemsize = ffi.sizeof(BFieldType)
|
||||
if bitemsize != fsize:
|
||||
self._verification_error(
|
||||
"field '%s.%s' is declared as %d bytes, but is "
|
||||
"really %d bytes" % (self.name,
|
||||
self.fldnames[i] or '{}',
|
||||
bitemsize, fsize))
|
||||
fldtypes.append(BFieldType)
|
||||
#
|
||||
lst = list(zip(self.fldnames, fldtypes, self.fldbitsize, fieldofs))
|
||||
ffi._backend.complete_struct_or_union(BType, lst, self,
|
||||
totalsize, totalalignment)
|
||||
self.completed = 2
|
||||
|
||||
def _verification_error(self, msg):
|
||||
raise VerificationError(msg)
|
||||
|
||||
def check_not_partial(self):
|
||||
if self.partial and self.fixedlayout is None:
|
||||
raise VerificationMissing(self._get_c_name())
|
||||
|
||||
def build_backend_type(self, ffi, finishlist):
|
||||
self.check_not_partial()
|
||||
finishlist.append(self)
|
||||
#
|
||||
return global_cache(self, ffi, 'new_%s_type' % self.kind,
|
||||
self.get_official_name(), key=self)
|
||||
|
||||
|
||||
class StructType(StructOrUnion):
|
||||
kind = 'struct'
|
||||
|
||||
|
||||
class UnionType(StructOrUnion):
|
||||
kind = 'union'
|
||||
|
||||
|
||||
class EnumType(StructOrUnionOrEnum):
|
||||
kind = 'enum'
|
||||
partial = False
|
||||
partial_resolved = False
|
||||
|
||||
def __init__(self, name, enumerators, enumvalues, baseinttype=None):
|
||||
self.name = name
|
||||
self.enumerators = enumerators
|
||||
self.enumvalues = enumvalues
|
||||
self.baseinttype = baseinttype
|
||||
self.build_c_name_with_marker()
|
||||
|
||||
def force_the_name(self, forcename):
|
||||
StructOrUnionOrEnum.force_the_name(self, forcename)
|
||||
if self.forcename is None:
|
||||
name = self.get_official_name()
|
||||
self.forcename = '$' + name.replace(' ', '_')
|
||||
|
||||
def check_not_partial(self):
|
||||
if self.partial and not self.partial_resolved:
|
||||
raise VerificationMissing(self._get_c_name())
|
||||
|
||||
def build_backend_type(self, ffi, finishlist):
|
||||
self.check_not_partial()
|
||||
base_btype = self.build_baseinttype(ffi, finishlist)
|
||||
return global_cache(self, ffi, 'new_enum_type',
|
||||
self.get_official_name(),
|
||||
self.enumerators, self.enumvalues,
|
||||
base_btype, key=self)
|
||||
|
||||
def build_baseinttype(self, ffi, finishlist):
|
||||
if self.baseinttype is not None:
|
||||
return self.baseinttype.get_cached_btype(ffi, finishlist)
|
||||
#
|
||||
if self.enumvalues:
|
||||
smallest_value = min(self.enumvalues)
|
||||
largest_value = max(self.enumvalues)
|
||||
else:
|
||||
import warnings
|
||||
try:
|
||||
# XXX! The goal is to ensure that the warnings.warn()
|
||||
# will not suppress the warning. We want to get it
|
||||
# several times if we reach this point several times.
|
||||
__warningregistry__.clear()
|
||||
except NameError:
|
||||
pass
|
||||
warnings.warn("%r has no values explicitly defined; "
|
||||
"guessing that it is equivalent to 'unsigned int'"
|
||||
% self._get_c_name())
|
||||
smallest_value = largest_value = 0
|
||||
if smallest_value < 0: # needs a signed type
|
||||
sign = 1
|
||||
candidate1 = PrimitiveType("int")
|
||||
candidate2 = PrimitiveType("long")
|
||||
else:
|
||||
sign = 0
|
||||
candidate1 = PrimitiveType("unsigned int")
|
||||
candidate2 = PrimitiveType("unsigned long")
|
||||
btype1 = candidate1.get_cached_btype(ffi, finishlist)
|
||||
btype2 = candidate2.get_cached_btype(ffi, finishlist)
|
||||
size1 = ffi.sizeof(btype1)
|
||||
size2 = ffi.sizeof(btype2)
|
||||
if (smallest_value >= ((-1) << (8*size1-1)) and
|
||||
largest_value < (1 << (8*size1-sign))):
|
||||
return btype1
|
||||
if (smallest_value >= ((-1) << (8*size2-1)) and
|
||||
largest_value < (1 << (8*size2-sign))):
|
||||
return btype2
|
||||
raise CDefError("%s values don't all fit into either 'long' "
|
||||
"or 'unsigned long'" % self._get_c_name())
|
||||
|
||||
def unknown_type(name, structname=None):
|
||||
if structname is None:
|
||||
structname = '$%s' % name
|
||||
tp = StructType(structname, None, None, None)
|
||||
tp.force_the_name(name)
|
||||
tp.origin = "unknown_type"
|
||||
return tp
|
||||
|
||||
def unknown_ptr_type(name, structname=None):
|
||||
if structname is None:
|
||||
structname = '$$%s' % name
|
||||
tp = StructType(structname, None, None, None)
|
||||
return NamedPointerType(tp, name)
|
||||
|
||||
|
||||
global_lock = allocate_lock()
|
||||
_typecache_cffi_backend = weakref.WeakValueDictionary()
|
||||
|
||||
def get_typecache(backend):
|
||||
# returns _typecache_cffi_backend if backend is the _cffi_backend
|
||||
# module, or type(backend).__typecache if backend is an instance of
|
||||
# CTypesBackend (or some FakeBackend class during tests)
|
||||
if isinstance(backend, types.ModuleType):
|
||||
return _typecache_cffi_backend
|
||||
with global_lock:
|
||||
if not hasattr(type(backend), '__typecache'):
|
||||
type(backend).__typecache = weakref.WeakValueDictionary()
|
||||
return type(backend).__typecache
|
||||
|
||||
def global_cache(srctype, ffi, funcname, *args, **kwds):
|
||||
key = kwds.pop('key', (funcname, args))
|
||||
assert not kwds
|
||||
try:
|
||||
return ffi._typecache[key]
|
||||
except KeyError:
|
||||
pass
|
||||
try:
|
||||
res = getattr(ffi._backend, funcname)(*args)
|
||||
except NotImplementedError as e:
|
||||
raise NotImplementedError("%s: %r: %s" % (funcname, srctype, e))
|
||||
# note that setdefault() on WeakValueDictionary is not atomic
|
||||
# and contains a rare bug (http://bugs.python.org/issue19542);
|
||||
# we have to use a lock and do it ourselves
|
||||
cache = ffi._typecache
|
||||
with global_lock:
|
||||
res1 = cache.get(key)
|
||||
if res1 is None:
|
||||
cache[key] = res
|
||||
return res
|
||||
else:
|
||||
return res1
|
||||
|
||||
def pointer_cache(ffi, BType):
|
||||
return global_cache('?', ffi, 'new_pointer_type', BType)
|
||||
|
||||
def attach_exception_info(e, name):
|
||||
if e.args and type(e.args[0]) is str:
|
||||
e.args = ('%s: %s' % (name, e.args[0]),) + e.args[1:]
|
||||
@@ -0,0 +1,181 @@
|
||||
|
||||
/* This part is from file 'cffi/parse_c_type.h'. It is copied at the
|
||||
beginning of C sources generated by CFFI's ffi.set_source(). */
|
||||
|
||||
typedef void *_cffi_opcode_t;
|
||||
|
||||
#define _CFFI_OP(opcode, arg) (_cffi_opcode_t)(opcode | (((uintptr_t)(arg)) << 8))
|
||||
#define _CFFI_GETOP(cffi_opcode) ((unsigned char)(uintptr_t)cffi_opcode)
|
||||
#define _CFFI_GETARG(cffi_opcode) (((intptr_t)cffi_opcode) >> 8)
|
||||
|
||||
#define _CFFI_OP_PRIMITIVE 1
|
||||
#define _CFFI_OP_POINTER 3
|
||||
#define _CFFI_OP_ARRAY 5
|
||||
#define _CFFI_OP_OPEN_ARRAY 7
|
||||
#define _CFFI_OP_STRUCT_UNION 9
|
||||
#define _CFFI_OP_ENUM 11
|
||||
#define _CFFI_OP_FUNCTION 13
|
||||
#define _CFFI_OP_FUNCTION_END 15
|
||||
#define _CFFI_OP_NOOP 17
|
||||
#define _CFFI_OP_BITFIELD 19
|
||||
#define _CFFI_OP_TYPENAME 21
|
||||
#define _CFFI_OP_CPYTHON_BLTN_V 23 // varargs
|
||||
#define _CFFI_OP_CPYTHON_BLTN_N 25 // noargs
|
||||
#define _CFFI_OP_CPYTHON_BLTN_O 27 // O (i.e. a single arg)
|
||||
#define _CFFI_OP_CONSTANT 29
|
||||
#define _CFFI_OP_CONSTANT_INT 31
|
||||
#define _CFFI_OP_GLOBAL_VAR 33
|
||||
#define _CFFI_OP_DLOPEN_FUNC 35
|
||||
#define _CFFI_OP_DLOPEN_CONST 37
|
||||
#define _CFFI_OP_GLOBAL_VAR_F 39
|
||||
#define _CFFI_OP_EXTERN_PYTHON 41
|
||||
|
||||
#define _CFFI_PRIM_VOID 0
|
||||
#define _CFFI_PRIM_BOOL 1
|
||||
#define _CFFI_PRIM_CHAR 2
|
||||
#define _CFFI_PRIM_SCHAR 3
|
||||
#define _CFFI_PRIM_UCHAR 4
|
||||
#define _CFFI_PRIM_SHORT 5
|
||||
#define _CFFI_PRIM_USHORT 6
|
||||
#define _CFFI_PRIM_INT 7
|
||||
#define _CFFI_PRIM_UINT 8
|
||||
#define _CFFI_PRIM_LONG 9
|
||||
#define _CFFI_PRIM_ULONG 10
|
||||
#define _CFFI_PRIM_LONGLONG 11
|
||||
#define _CFFI_PRIM_ULONGLONG 12
|
||||
#define _CFFI_PRIM_FLOAT 13
|
||||
#define _CFFI_PRIM_DOUBLE 14
|
||||
#define _CFFI_PRIM_LONGDOUBLE 15
|
||||
|
||||
#define _CFFI_PRIM_WCHAR 16
|
||||
#define _CFFI_PRIM_INT8 17
|
||||
#define _CFFI_PRIM_UINT8 18
|
||||
#define _CFFI_PRIM_INT16 19
|
||||
#define _CFFI_PRIM_UINT16 20
|
||||
#define _CFFI_PRIM_INT32 21
|
||||
#define _CFFI_PRIM_UINT32 22
|
||||
#define _CFFI_PRIM_INT64 23
|
||||
#define _CFFI_PRIM_UINT64 24
|
||||
#define _CFFI_PRIM_INTPTR 25
|
||||
#define _CFFI_PRIM_UINTPTR 26
|
||||
#define _CFFI_PRIM_PTRDIFF 27
|
||||
#define _CFFI_PRIM_SIZE 28
|
||||
#define _CFFI_PRIM_SSIZE 29
|
||||
#define _CFFI_PRIM_INT_LEAST8 30
|
||||
#define _CFFI_PRIM_UINT_LEAST8 31
|
||||
#define _CFFI_PRIM_INT_LEAST16 32
|
||||
#define _CFFI_PRIM_UINT_LEAST16 33
|
||||
#define _CFFI_PRIM_INT_LEAST32 34
|
||||
#define _CFFI_PRIM_UINT_LEAST32 35
|
||||
#define _CFFI_PRIM_INT_LEAST64 36
|
||||
#define _CFFI_PRIM_UINT_LEAST64 37
|
||||
#define _CFFI_PRIM_INT_FAST8 38
|
||||
#define _CFFI_PRIM_UINT_FAST8 39
|
||||
#define _CFFI_PRIM_INT_FAST16 40
|
||||
#define _CFFI_PRIM_UINT_FAST16 41
|
||||
#define _CFFI_PRIM_INT_FAST32 42
|
||||
#define _CFFI_PRIM_UINT_FAST32 43
|
||||
#define _CFFI_PRIM_INT_FAST64 44
|
||||
#define _CFFI_PRIM_UINT_FAST64 45
|
||||
#define _CFFI_PRIM_INTMAX 46
|
||||
#define _CFFI_PRIM_UINTMAX 47
|
||||
#define _CFFI_PRIM_FLOATCOMPLEX 48
|
||||
#define _CFFI_PRIM_DOUBLECOMPLEX 49
|
||||
#define _CFFI_PRIM_CHAR16 50
|
||||
#define _CFFI_PRIM_CHAR32 51
|
||||
|
||||
#define _CFFI__NUM_PRIM 52
|
||||
#define _CFFI__UNKNOWN_PRIM (-1)
|
||||
#define _CFFI__UNKNOWN_FLOAT_PRIM (-2)
|
||||
#define _CFFI__UNKNOWN_LONG_DOUBLE (-3)
|
||||
|
||||
#define _CFFI__IO_FILE_STRUCT (-1)
|
||||
|
||||
|
||||
struct _cffi_global_s {
|
||||
const char *name;
|
||||
void *address;
|
||||
_cffi_opcode_t type_op;
|
||||
void *size_or_direct_fn; // OP_GLOBAL_VAR: size, or 0 if unknown
|
||||
// OP_CPYTHON_BLTN_*: addr of direct function
|
||||
};
|
||||
|
||||
struct _cffi_getconst_s {
|
||||
unsigned long long value;
|
||||
const struct _cffi_type_context_s *ctx;
|
||||
int gindex;
|
||||
};
|
||||
|
||||
struct _cffi_struct_union_s {
|
||||
const char *name;
|
||||
int type_index; // -> _cffi_types, on a OP_STRUCT_UNION
|
||||
int flags; // _CFFI_F_* flags below
|
||||
size_t size;
|
||||
int alignment;
|
||||
int first_field_index; // -> _cffi_fields array
|
||||
int num_fields;
|
||||
};
|
||||
#define _CFFI_F_UNION 0x01 // is a union, not a struct
|
||||
#define _CFFI_F_CHECK_FIELDS 0x02 // complain if fields are not in the
|
||||
// "standard layout" or if some are missing
|
||||
#define _CFFI_F_PACKED 0x04 // for CHECK_FIELDS, assume a packed struct
|
||||
#define _CFFI_F_EXTERNAL 0x08 // in some other ffi.include()
|
||||
#define _CFFI_F_OPAQUE 0x10 // opaque
|
||||
|
||||
struct _cffi_field_s {
|
||||
const char *name;
|
||||
size_t field_offset;
|
||||
size_t field_size;
|
||||
_cffi_opcode_t field_type_op;
|
||||
};
|
||||
|
||||
struct _cffi_enum_s {
|
||||
const char *name;
|
||||
int type_index; // -> _cffi_types, on a OP_ENUM
|
||||
int type_prim; // _CFFI_PRIM_xxx
|
||||
const char *enumerators; // comma-delimited string
|
||||
};
|
||||
|
||||
struct _cffi_typename_s {
|
||||
const char *name;
|
||||
int type_index; /* if opaque, points to a possibly artificial
|
||||
OP_STRUCT which is itself opaque */
|
||||
};
|
||||
|
||||
struct _cffi_type_context_s {
|
||||
_cffi_opcode_t *types;
|
||||
const struct _cffi_global_s *globals;
|
||||
const struct _cffi_field_s *fields;
|
||||
const struct _cffi_struct_union_s *struct_unions;
|
||||
const struct _cffi_enum_s *enums;
|
||||
const struct _cffi_typename_s *typenames;
|
||||
int num_globals;
|
||||
int num_struct_unions;
|
||||
int num_enums;
|
||||
int num_typenames;
|
||||
const char *const *includes;
|
||||
int num_types;
|
||||
int flags; /* future extension */
|
||||
};
|
||||
|
||||
struct _cffi_parse_info_s {
|
||||
const struct _cffi_type_context_s *ctx;
|
||||
_cffi_opcode_t *output;
|
||||
unsigned int output_size;
|
||||
size_t error_location;
|
||||
const char *error_message;
|
||||
};
|
||||
|
||||
struct _cffi_externpy_s {
|
||||
const char *name;
|
||||
size_t size_of_result;
|
||||
void *reserved1, *reserved2;
|
||||
};
|
||||
|
||||
#ifdef _CFFI_INTERNAL
|
||||
static int parse_c_type(struct _cffi_parse_info_s *info, const char *input);
|
||||
static int search_in_globals(const struct _cffi_type_context_s *ctx,
|
||||
const char *search, size_t search_len);
|
||||
static int search_in_struct_unions(const struct _cffi_type_context_s *ctx,
|
||||
const char *search, size_t search_len);
|
||||
#endif
|
||||
@@ -0,0 +1,121 @@
|
||||
# pkg-config, https://www.freedesktop.org/wiki/Software/pkg-config/ integration for cffi
|
||||
import sys, os, subprocess
|
||||
|
||||
from .error import PkgConfigError
|
||||
|
||||
|
||||
def merge_flags(cfg1, cfg2):
|
||||
"""Merge values from cffi config flags cfg2 to cf1
|
||||
|
||||
Example:
|
||||
merge_flags({"libraries": ["one"]}, {"libraries": ["two"]})
|
||||
{"libraries": ["one", "two"]}
|
||||
"""
|
||||
for key, value in cfg2.items():
|
||||
if key not in cfg1:
|
||||
cfg1[key] = value
|
||||
else:
|
||||
if not isinstance(cfg1[key], list):
|
||||
raise TypeError("cfg1[%r] should be a list of strings" % (key,))
|
||||
if not isinstance(value, list):
|
||||
raise TypeError("cfg2[%r] should be a list of strings" % (key,))
|
||||
cfg1[key].extend(value)
|
||||
return cfg1
|
||||
|
||||
|
||||
def call(libname, flag, encoding=sys.getfilesystemencoding()):
|
||||
"""Calls pkg-config and returns the output if found
|
||||
"""
|
||||
a = ["pkg-config", "--print-errors"]
|
||||
a.append(flag)
|
||||
a.append(libname)
|
||||
try:
|
||||
pc = subprocess.Popen(a, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
except EnvironmentError as e:
|
||||
raise PkgConfigError("cannot run pkg-config: %s" % (str(e).strip(),))
|
||||
|
||||
bout, berr = pc.communicate()
|
||||
if pc.returncode != 0:
|
||||
try:
|
||||
berr = berr.decode(encoding)
|
||||
except Exception:
|
||||
pass
|
||||
raise PkgConfigError(berr.strip())
|
||||
|
||||
if sys.version_info >= (3,) and not isinstance(bout, str): # Python 3.x
|
||||
try:
|
||||
bout = bout.decode(encoding)
|
||||
except UnicodeDecodeError:
|
||||
raise PkgConfigError("pkg-config %s %s returned bytes that cannot "
|
||||
"be decoded with encoding %r:\n%r" %
|
||||
(flag, libname, encoding, bout))
|
||||
|
||||
if os.altsep != '\\' and '\\' in bout:
|
||||
raise PkgConfigError("pkg-config %s %s returned an unsupported "
|
||||
"backslash-escaped output:\n%r" %
|
||||
(flag, libname, bout))
|
||||
return bout
|
||||
|
||||
|
||||
def flags_from_pkgconfig(libs):
|
||||
r"""Return compiler line flags for FFI.set_source based on pkg-config output
|
||||
|
||||
Usage
|
||||
...
|
||||
ffibuilder.set_source("_foo", pkgconfig = ["libfoo", "libbar >= 1.8.3"])
|
||||
|
||||
If pkg-config is installed on build machine, then arguments include_dirs,
|
||||
library_dirs, libraries, define_macros, extra_compile_args and
|
||||
extra_link_args are extended with an output of pkg-config for libfoo and
|
||||
libbar.
|
||||
|
||||
Raises PkgConfigError in case the pkg-config call fails.
|
||||
"""
|
||||
|
||||
def get_include_dirs(string):
|
||||
return [x[2:] for x in string.split() if x.startswith("-I")]
|
||||
|
||||
def get_library_dirs(string):
|
||||
return [x[2:] for x in string.split() if x.startswith("-L")]
|
||||
|
||||
def get_libraries(string):
|
||||
return [x[2:] for x in string.split() if x.startswith("-l")]
|
||||
|
||||
# convert -Dfoo=bar to list of tuples [("foo", "bar")] expected by distutils
|
||||
def get_macros(string):
|
||||
def _macro(x):
|
||||
x = x[2:] # drop "-D"
|
||||
if '=' in x:
|
||||
return tuple(x.split("=", 1)) # "-Dfoo=bar" => ("foo", "bar")
|
||||
else:
|
||||
return (x, None) # "-Dfoo" => ("foo", None)
|
||||
return [_macro(x) for x in string.split() if x.startswith("-D")]
|
||||
|
||||
def get_other_cflags(string):
|
||||
return [x for x in string.split() if not x.startswith("-I") and
|
||||
not x.startswith("-D")]
|
||||
|
||||
def get_other_libs(string):
|
||||
return [x for x in string.split() if not x.startswith("-L") and
|
||||
not x.startswith("-l")]
|
||||
|
||||
# return kwargs for given libname
|
||||
def kwargs(libname):
|
||||
fse = sys.getfilesystemencoding()
|
||||
all_cflags = call(libname, "--cflags")
|
||||
all_libs = call(libname, "--libs")
|
||||
return {
|
||||
"include_dirs": get_include_dirs(all_cflags),
|
||||
"library_dirs": get_library_dirs(all_libs),
|
||||
"libraries": get_libraries(all_libs),
|
||||
"define_macros": get_macros(all_cflags),
|
||||
"extra_compile_args": get_other_cflags(all_cflags),
|
||||
"extra_link_args": get_other_libs(all_libs),
|
||||
}
|
||||
|
||||
# merge all arguments together
|
||||
ret = {}
|
||||
for libname in libs:
|
||||
lib_flags = kwargs(libname)
|
||||
merge_flags(ret, lib_flags)
|
||||
return ret
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,229 @@
|
||||
import os
|
||||
import sys
|
||||
import sysconfig
|
||||
|
||||
try:
|
||||
basestring
|
||||
except NameError:
|
||||
# Python 3.x
|
||||
basestring = str
|
||||
|
||||
def error(msg):
|
||||
from cffi._shimmed_dist_utils import DistutilsSetupError
|
||||
raise DistutilsSetupError(msg)
|
||||
|
||||
|
||||
def execfile(filename, glob):
|
||||
# We use execfile() (here rewritten for Python 3) instead of
|
||||
# __import__() to load the build script. The problem with
|
||||
# a normal import is that in some packages, the intermediate
|
||||
# __init__.py files may already try to import the file that
|
||||
# we are generating.
|
||||
with open(filename) as f:
|
||||
src = f.read()
|
||||
src += '\n' # Python 2.6 compatibility
|
||||
code = compile(src, filename, 'exec')
|
||||
exec(code, glob, glob)
|
||||
|
||||
|
||||
def add_cffi_module(dist, mod_spec):
|
||||
from cffi.api import FFI
|
||||
|
||||
if not isinstance(mod_spec, basestring):
|
||||
error("argument to 'cffi_modules=...' must be a str or a list of str,"
|
||||
" not %r" % (type(mod_spec).__name__,))
|
||||
mod_spec = str(mod_spec)
|
||||
try:
|
||||
build_file_name, ffi_var_name = mod_spec.split(':')
|
||||
except ValueError:
|
||||
error("%r must be of the form 'path/build.py:ffi_variable'" %
|
||||
(mod_spec,))
|
||||
if not os.path.exists(build_file_name):
|
||||
ext = ''
|
||||
rewritten = build_file_name.replace('.', '/') + '.py'
|
||||
if os.path.exists(rewritten):
|
||||
ext = ' (rewrite cffi_modules to [%r])' % (
|
||||
rewritten + ':' + ffi_var_name,)
|
||||
error("%r does not name an existing file%s" % (build_file_name, ext))
|
||||
|
||||
mod_vars = {'__name__': '__cffi__', '__file__': build_file_name}
|
||||
execfile(build_file_name, mod_vars)
|
||||
|
||||
try:
|
||||
ffi = mod_vars[ffi_var_name]
|
||||
except KeyError:
|
||||
error("%r: object %r not found in module" % (mod_spec,
|
||||
ffi_var_name))
|
||||
if not isinstance(ffi, FFI):
|
||||
ffi = ffi() # maybe it's a function instead of directly an ffi
|
||||
if not isinstance(ffi, FFI):
|
||||
error("%r is not an FFI instance (got %r)" % (mod_spec,
|
||||
type(ffi).__name__))
|
||||
if not hasattr(ffi, '_assigned_source'):
|
||||
error("%r: the set_source() method was not called" % (mod_spec,))
|
||||
module_name, source, source_extension, kwds = ffi._assigned_source
|
||||
if ffi._windows_unicode:
|
||||
kwds = kwds.copy()
|
||||
ffi._apply_windows_unicode(kwds)
|
||||
|
||||
if source is None:
|
||||
_add_py_module(dist, ffi, module_name)
|
||||
else:
|
||||
_add_c_module(dist, ffi, module_name, source, source_extension, kwds)
|
||||
|
||||
def _set_py_limited_api(Extension, kwds):
|
||||
"""
|
||||
Add py_limited_api to kwds if setuptools >= 26 is in use.
|
||||
Do not alter the setting if it already exists.
|
||||
Setuptools takes care of ignoring the flag on Python 2 and PyPy.
|
||||
|
||||
CPython itself should ignore the flag in a debugging version
|
||||
(by not listing .abi3.so in the extensions it supports), but
|
||||
it doesn't so far, creating troubles. That's why we check
|
||||
for "not hasattr(sys, 'gettotalrefcount')" (the 2.7 compatible equivalent
|
||||
of 'd' not in sys.abiflags). (http://bugs.python.org/issue28401)
|
||||
|
||||
On Windows, with CPython <= 3.4, it's better not to use py_limited_api
|
||||
because virtualenv *still* doesn't copy PYTHON3.DLL on these versions.
|
||||
Recently (2020) we started shipping only >= 3.5 wheels, though. So
|
||||
we'll give it another try and set py_limited_api on Windows >= 3.5.
|
||||
"""
|
||||
from cffi._shimmed_dist_utils import log
|
||||
from cffi import recompiler
|
||||
|
||||
if ('py_limited_api' not in kwds and not hasattr(sys, 'gettotalrefcount')
|
||||
and recompiler.USE_LIMITED_API):
|
||||
import setuptools
|
||||
try:
|
||||
setuptools_major_version = int(setuptools.__version__.partition('.')[0])
|
||||
if setuptools_major_version >= 26:
|
||||
kwds['py_limited_api'] = True
|
||||
except ValueError: # certain development versions of setuptools
|
||||
# If we don't know the version number of setuptools, we
|
||||
# try to set 'py_limited_api' anyway. At worst, we get a
|
||||
# warning.
|
||||
kwds['py_limited_api'] = True
|
||||
|
||||
if sysconfig.get_config_var("Py_GIL_DISABLED"):
|
||||
if kwds.get('py_limited_api'):
|
||||
log.info("Ignoring py_limited_api=True for free-threaded build.")
|
||||
|
||||
kwds['py_limited_api'] = False
|
||||
|
||||
if kwds.get('py_limited_api') is False:
|
||||
# avoid setting Py_LIMITED_API if py_limited_api=False
|
||||
# which _cffi_include.h does unless _CFFI_NO_LIMITED_API is defined
|
||||
kwds.setdefault("define_macros", []).append(("_CFFI_NO_LIMITED_API", None))
|
||||
return kwds
|
||||
|
||||
def _add_c_module(dist, ffi, module_name, source, source_extension, kwds):
|
||||
# We are a setuptools extension. Need this build_ext for py_limited_api.
|
||||
from setuptools.command.build_ext import build_ext
|
||||
from cffi._shimmed_dist_utils import Extension, log, mkpath
|
||||
from cffi import recompiler
|
||||
|
||||
allsources = ['$PLACEHOLDER']
|
||||
allsources.extend(kwds.pop('sources', []))
|
||||
kwds = _set_py_limited_api(Extension, kwds)
|
||||
ext = Extension(name=module_name, sources=allsources, **kwds)
|
||||
|
||||
def make_mod(tmpdir, pre_run=None):
|
||||
c_file = os.path.join(tmpdir, module_name + source_extension)
|
||||
log.info("generating cffi module %r" % c_file)
|
||||
mkpath(tmpdir)
|
||||
# a setuptools-only, API-only hook: called with the "ext" and "ffi"
|
||||
# arguments just before we turn the ffi into C code. To use it,
|
||||
# subclass the 'distutils.command.build_ext.build_ext' class and
|
||||
# add a method 'def pre_run(self, ext, ffi)'.
|
||||
if pre_run is not None:
|
||||
pre_run(ext, ffi)
|
||||
updated = recompiler.make_c_source(ffi, module_name, source, c_file)
|
||||
if not updated:
|
||||
log.info("already up-to-date")
|
||||
return c_file
|
||||
|
||||
if dist.ext_modules is None:
|
||||
dist.ext_modules = []
|
||||
dist.ext_modules.append(ext)
|
||||
|
||||
base_class = dist.cmdclass.get('build_ext', build_ext)
|
||||
class build_ext_make_mod(base_class):
|
||||
def run(self):
|
||||
if ext.sources[0] == '$PLACEHOLDER':
|
||||
pre_run = getattr(self, 'pre_run', None)
|
||||
ext.sources[0] = make_mod(self.build_temp, pre_run)
|
||||
base_class.run(self)
|
||||
dist.cmdclass['build_ext'] = build_ext_make_mod
|
||||
# NB. multiple runs here will create multiple 'build_ext_make_mod'
|
||||
# classes. Even in this case the 'build_ext' command should be
|
||||
# run once; but just in case, the logic above does nothing if
|
||||
# called again.
|
||||
|
||||
|
||||
def _add_py_module(dist, ffi, module_name):
|
||||
from setuptools.command.build_py import build_py
|
||||
from setuptools.command.build_ext import build_ext
|
||||
from cffi._shimmed_dist_utils import log, mkpath
|
||||
from cffi import recompiler
|
||||
|
||||
def generate_mod(py_file):
|
||||
log.info("generating cffi module %r" % py_file)
|
||||
mkpath(os.path.dirname(py_file))
|
||||
updated = recompiler.make_py_source(ffi, module_name, py_file)
|
||||
if not updated:
|
||||
log.info("already up-to-date")
|
||||
|
||||
base_class = dist.cmdclass.get('build_py', build_py)
|
||||
class build_py_make_mod(base_class):
|
||||
def run(self):
|
||||
base_class.run(self)
|
||||
module_path = module_name.split('.')
|
||||
module_path[-1] += '.py'
|
||||
generate_mod(os.path.join(self.build_lib, *module_path))
|
||||
def get_source_files(self):
|
||||
# This is called from 'setup.py sdist' only. Exclude
|
||||
# the generate .py module in this case.
|
||||
saved_py_modules = self.py_modules
|
||||
try:
|
||||
if saved_py_modules:
|
||||
self.py_modules = [m for m in saved_py_modules
|
||||
if m != module_name]
|
||||
return base_class.get_source_files(self)
|
||||
finally:
|
||||
self.py_modules = saved_py_modules
|
||||
dist.cmdclass['build_py'] = build_py_make_mod
|
||||
|
||||
# distutils and setuptools have no notion I could find of a
|
||||
# generated python module. If we don't add module_name to
|
||||
# dist.py_modules, then things mostly work but there are some
|
||||
# combination of options (--root and --record) that will miss
|
||||
# the module. So we add it here, which gives a few apparently
|
||||
# harmless warnings about not finding the file outside the
|
||||
# build directory.
|
||||
# Then we need to hack more in get_source_files(); see above.
|
||||
if dist.py_modules is None:
|
||||
dist.py_modules = []
|
||||
dist.py_modules.append(module_name)
|
||||
|
||||
# the following is only for "build_ext -i"
|
||||
base_class_2 = dist.cmdclass.get('build_ext', build_ext)
|
||||
class build_ext_make_mod(base_class_2):
|
||||
def run(self):
|
||||
base_class_2.run(self)
|
||||
if self.inplace:
|
||||
# from get_ext_fullpath() in distutils/command/build_ext.py
|
||||
module_path = module_name.split('.')
|
||||
package = '.'.join(module_path[:-1])
|
||||
build_py = self.get_finalized_command('build_py')
|
||||
package_dir = build_py.get_package_dir(package)
|
||||
file_name = module_path[-1] + '.py'
|
||||
generate_mod(os.path.join(package_dir, file_name))
|
||||
dist.cmdclass['build_ext'] = build_ext_make_mod
|
||||
|
||||
def cffi_modules(dist, attr, value):
|
||||
assert attr == 'cffi_modules'
|
||||
if isinstance(value, basestring):
|
||||
value = [value]
|
||||
|
||||
for cffi_module in value:
|
||||
add_cffi_module(dist, cffi_module)
|
||||
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,679 @@
|
||||
#
|
||||
# DEPRECATED: implementation for ffi.verify()
|
||||
#
|
||||
import sys, os
|
||||
import types
|
||||
|
||||
from . import model
|
||||
from .error import VerificationError
|
||||
|
||||
|
||||
class VGenericEngine(object):
|
||||
_class_key = 'g'
|
||||
_gen_python_module = False
|
||||
|
||||
def __init__(self, verifier):
|
||||
self.verifier = verifier
|
||||
self.ffi = verifier.ffi
|
||||
self.export_symbols = []
|
||||
self._struct_pending_verification = {}
|
||||
|
||||
def patch_extension_kwds(self, kwds):
|
||||
# add 'export_symbols' to the dictionary. Note that we add the
|
||||
# list before filling it. When we fill it, it will thus also show
|
||||
# up in kwds['export_symbols'].
|
||||
kwds.setdefault('export_symbols', self.export_symbols)
|
||||
|
||||
def find_module(self, module_name, path, so_suffixes):
|
||||
for so_suffix in so_suffixes:
|
||||
basename = module_name + so_suffix
|
||||
if path is None:
|
||||
path = sys.path
|
||||
for dirname in path:
|
||||
filename = os.path.join(dirname, basename)
|
||||
if os.path.isfile(filename):
|
||||
return filename
|
||||
|
||||
def collect_types(self):
|
||||
pass # not needed in the generic engine
|
||||
|
||||
def _prnt(self, what=''):
|
||||
self._f.write(what + '\n')
|
||||
|
||||
def write_source_to_f(self):
|
||||
prnt = self._prnt
|
||||
# first paste some standard set of lines that are mostly '#include'
|
||||
prnt(cffimod_header)
|
||||
# then paste the C source given by the user, verbatim.
|
||||
prnt(self.verifier.preamble)
|
||||
#
|
||||
# call generate_gen_xxx_decl(), for every xxx found from
|
||||
# ffi._parser._declarations. This generates all the functions.
|
||||
self._generate('decl')
|
||||
#
|
||||
# on Windows, distutils insists on putting init_cffi_xyz in
|
||||
# 'export_symbols', so instead of fighting it, just give up and
|
||||
# give it one
|
||||
if sys.platform == 'win32':
|
||||
if sys.version_info >= (3,):
|
||||
prefix = 'PyInit_'
|
||||
else:
|
||||
prefix = 'init'
|
||||
modname = self.verifier.get_module_name()
|
||||
prnt("void %s%s(void) { }\n" % (prefix, modname))
|
||||
|
||||
def load_library(self, flags=0):
|
||||
# import it with the CFFI backend
|
||||
backend = self.ffi._backend
|
||||
# needs to make a path that contains '/', on Posix
|
||||
filename = os.path.join(os.curdir, self.verifier.modulefilename)
|
||||
module = backend.load_library(filename, flags)
|
||||
#
|
||||
# call loading_gen_struct() to get the struct layout inferred by
|
||||
# the C compiler
|
||||
self._load(module, 'loading')
|
||||
|
||||
# build the FFILibrary class and instance, this is a module subclass
|
||||
# because modules are expected to have usually-constant-attributes and
|
||||
# in PyPy this means the JIT is able to treat attributes as constant,
|
||||
# which we want.
|
||||
class FFILibrary(types.ModuleType):
|
||||
_cffi_generic_module = module
|
||||
_cffi_ffi = self.ffi
|
||||
_cffi_dir = []
|
||||
def __dir__(self):
|
||||
return FFILibrary._cffi_dir
|
||||
library = FFILibrary("")
|
||||
#
|
||||
# finally, call the loaded_gen_xxx() functions. This will set
|
||||
# up the 'library' object.
|
||||
self._load(module, 'loaded', library=library)
|
||||
return library
|
||||
|
||||
def _get_declarations(self):
|
||||
lst = [(key, tp) for (key, (tp, qual)) in
|
||||
self.ffi._parser._declarations.items()]
|
||||
lst.sort()
|
||||
return lst
|
||||
|
||||
def _generate(self, step_name):
|
||||
for name, tp in self._get_declarations():
|
||||
kind, realname = name.split(' ', 1)
|
||||
try:
|
||||
method = getattr(self, '_generate_gen_%s_%s' % (kind,
|
||||
step_name))
|
||||
except AttributeError:
|
||||
raise VerificationError(
|
||||
"not implemented in verify(): %r" % name)
|
||||
try:
|
||||
method(tp, realname)
|
||||
except Exception as e:
|
||||
model.attach_exception_info(e, name)
|
||||
raise
|
||||
|
||||
def _load(self, module, step_name, **kwds):
|
||||
for name, tp in self._get_declarations():
|
||||
kind, realname = name.split(' ', 1)
|
||||
method = getattr(self, '_%s_gen_%s' % (step_name, kind))
|
||||
try:
|
||||
method(tp, realname, module, **kwds)
|
||||
except Exception as e:
|
||||
model.attach_exception_info(e, name)
|
||||
raise
|
||||
|
||||
def _generate_nothing(self, tp, name):
|
||||
pass
|
||||
|
||||
def _loaded_noop(self, tp, name, module, **kwds):
|
||||
pass
|
||||
|
||||
# ----------
|
||||
# typedefs: generates no code so far
|
||||
|
||||
_generate_gen_typedef_decl = _generate_nothing
|
||||
_loading_gen_typedef = _loaded_noop
|
||||
_loaded_gen_typedef = _loaded_noop
|
||||
|
||||
# ----------
|
||||
# function declarations
|
||||
|
||||
def _generate_gen_function_decl(self, tp, name):
|
||||
assert isinstance(tp, model.FunctionPtrType)
|
||||
if tp.ellipsis:
|
||||
# cannot support vararg functions better than this: check for its
|
||||
# exact type (including the fixed arguments), and build it as a
|
||||
# constant function pointer (no _cffi_f_%s wrapper)
|
||||
self._generate_gen_const(False, name, tp)
|
||||
return
|
||||
prnt = self._prnt
|
||||
numargs = len(tp.args)
|
||||
argnames = []
|
||||
for i, type in enumerate(tp.args):
|
||||
indirection = ''
|
||||
if isinstance(type, model.StructOrUnion):
|
||||
indirection = '*'
|
||||
argnames.append('%sx%d' % (indirection, i))
|
||||
context = 'argument of %s' % name
|
||||
arglist = [type.get_c_name(' %s' % arg, context)
|
||||
for type, arg in zip(tp.args, argnames)]
|
||||
tpresult = tp.result
|
||||
if isinstance(tpresult, model.StructOrUnion):
|
||||
arglist.insert(0, tpresult.get_c_name(' *r', context))
|
||||
tpresult = model.void_type
|
||||
arglist = ', '.join(arglist) or 'void'
|
||||
wrappername = '_cffi_f_%s' % name
|
||||
self.export_symbols.append(wrappername)
|
||||
if tp.abi:
|
||||
abi = tp.abi + ' '
|
||||
else:
|
||||
abi = ''
|
||||
funcdecl = ' %s%s(%s)' % (abi, wrappername, arglist)
|
||||
context = 'result of %s' % name
|
||||
prnt(tpresult.get_c_name(funcdecl, context))
|
||||
prnt('{')
|
||||
#
|
||||
if isinstance(tp.result, model.StructOrUnion):
|
||||
result_code = '*r = '
|
||||
elif not isinstance(tp.result, model.VoidType):
|
||||
result_code = 'return '
|
||||
else:
|
||||
result_code = ''
|
||||
prnt(' %s%s(%s);' % (result_code, name, ', '.join(argnames)))
|
||||
prnt('}')
|
||||
prnt()
|
||||
|
||||
_loading_gen_function = _loaded_noop
|
||||
|
||||
def _loaded_gen_function(self, tp, name, module, library):
|
||||
assert isinstance(tp, model.FunctionPtrType)
|
||||
if tp.ellipsis:
|
||||
newfunction = self._load_constant(False, tp, name, module)
|
||||
else:
|
||||
indirections = []
|
||||
base_tp = tp
|
||||
if (any(isinstance(typ, model.StructOrUnion) for typ in tp.args)
|
||||
or isinstance(tp.result, model.StructOrUnion)):
|
||||
indirect_args = []
|
||||
for i, typ in enumerate(tp.args):
|
||||
if isinstance(typ, model.StructOrUnion):
|
||||
typ = model.PointerType(typ)
|
||||
indirections.append((i, typ))
|
||||
indirect_args.append(typ)
|
||||
indirect_result = tp.result
|
||||
if isinstance(indirect_result, model.StructOrUnion):
|
||||
if indirect_result.fldtypes is None:
|
||||
raise TypeError("'%s' is used as result type, "
|
||||
"but is opaque" % (
|
||||
indirect_result._get_c_name(),))
|
||||
indirect_result = model.PointerType(indirect_result)
|
||||
indirect_args.insert(0, indirect_result)
|
||||
indirections.insert(0, ("result", indirect_result))
|
||||
indirect_result = model.void_type
|
||||
tp = model.FunctionPtrType(tuple(indirect_args),
|
||||
indirect_result, tp.ellipsis)
|
||||
BFunc = self.ffi._get_cached_btype(tp)
|
||||
wrappername = '_cffi_f_%s' % name
|
||||
newfunction = module.load_function(BFunc, wrappername)
|
||||
for i, typ in indirections:
|
||||
newfunction = self._make_struct_wrapper(newfunction, i, typ,
|
||||
base_tp)
|
||||
setattr(library, name, newfunction)
|
||||
type(library)._cffi_dir.append(name)
|
||||
|
||||
def _make_struct_wrapper(self, oldfunc, i, tp, base_tp):
|
||||
backend = self.ffi._backend
|
||||
BType = self.ffi._get_cached_btype(tp)
|
||||
if i == "result":
|
||||
ffi = self.ffi
|
||||
def newfunc(*args):
|
||||
res = ffi.new(BType)
|
||||
oldfunc(res, *args)
|
||||
return res[0]
|
||||
else:
|
||||
def newfunc(*args):
|
||||
args = args[:i] + (backend.newp(BType, args[i]),) + args[i+1:]
|
||||
return oldfunc(*args)
|
||||
newfunc._cffi_base_type = base_tp
|
||||
return newfunc
|
||||
|
||||
# ----------
|
||||
# named structs
|
||||
|
||||
def _generate_gen_struct_decl(self, tp, name):
|
||||
assert name == tp.name
|
||||
self._generate_struct_or_union_decl(tp, 'struct', name)
|
||||
|
||||
def _loading_gen_struct(self, tp, name, module):
|
||||
self._loading_struct_or_union(tp, 'struct', name, module)
|
||||
|
||||
def _loaded_gen_struct(self, tp, name, module, **kwds):
|
||||
self._loaded_struct_or_union(tp)
|
||||
|
||||
def _generate_gen_union_decl(self, tp, name):
|
||||
assert name == tp.name
|
||||
self._generate_struct_or_union_decl(tp, 'union', name)
|
||||
|
||||
def _loading_gen_union(self, tp, name, module):
|
||||
self._loading_struct_or_union(tp, 'union', name, module)
|
||||
|
||||
def _loaded_gen_union(self, tp, name, module, **kwds):
|
||||
self._loaded_struct_or_union(tp)
|
||||
|
||||
def _generate_struct_or_union_decl(self, tp, prefix, name):
|
||||
if tp.fldnames is None:
|
||||
return # nothing to do with opaque structs
|
||||
checkfuncname = '_cffi_check_%s_%s' % (prefix, name)
|
||||
layoutfuncname = '_cffi_layout_%s_%s' % (prefix, name)
|
||||
cname = ('%s %s' % (prefix, name)).strip()
|
||||
#
|
||||
prnt = self._prnt
|
||||
prnt('static void %s(%s *p)' % (checkfuncname, cname))
|
||||
prnt('{')
|
||||
prnt(' /* only to generate compile-time warnings or errors */')
|
||||
prnt(' (void)p;')
|
||||
for fname, ftype, fbitsize, fqual in tp.enumfields():
|
||||
if (isinstance(ftype, model.PrimitiveType)
|
||||
and ftype.is_integer_type()) or fbitsize >= 0:
|
||||
# accept all integers, but complain on float or double
|
||||
prnt(' (void)((p->%s) << 1);' % fname)
|
||||
else:
|
||||
# only accept exactly the type declared.
|
||||
try:
|
||||
prnt(' { %s = &p->%s; (void)tmp; }' % (
|
||||
ftype.get_c_name('*tmp', 'field %r'%fname, quals=fqual),
|
||||
fname))
|
||||
except VerificationError as e:
|
||||
prnt(' /* %s */' % str(e)) # cannot verify it, ignore
|
||||
prnt('}')
|
||||
self.export_symbols.append(layoutfuncname)
|
||||
prnt('intptr_t %s(intptr_t i)' % (layoutfuncname,))
|
||||
prnt('{')
|
||||
prnt(' struct _cffi_aligncheck { char x; %s y; };' % cname)
|
||||
prnt(' static intptr_t nums[] = {')
|
||||
prnt(' sizeof(%s),' % cname)
|
||||
prnt(' offsetof(struct _cffi_aligncheck, y),')
|
||||
for fname, ftype, fbitsize, fqual in tp.enumfields():
|
||||
if fbitsize >= 0:
|
||||
continue # xxx ignore fbitsize for now
|
||||
prnt(' offsetof(%s, %s),' % (cname, fname))
|
||||
if isinstance(ftype, model.ArrayType) and ftype.length is None:
|
||||
prnt(' 0, /* %s */' % ftype._get_c_name())
|
||||
else:
|
||||
prnt(' sizeof(((%s *)0)->%s),' % (cname, fname))
|
||||
prnt(' -1')
|
||||
prnt(' };')
|
||||
prnt(' return nums[i];')
|
||||
prnt(' /* the next line is not executed, but compiled */')
|
||||
prnt(' %s(0);' % (checkfuncname,))
|
||||
prnt('}')
|
||||
prnt()
|
||||
|
||||
def _loading_struct_or_union(self, tp, prefix, name, module):
|
||||
if tp.fldnames is None:
|
||||
return # nothing to do with opaque structs
|
||||
layoutfuncname = '_cffi_layout_%s_%s' % (prefix, name)
|
||||
#
|
||||
BFunc = self.ffi._typeof_locked("intptr_t(*)(intptr_t)")[0]
|
||||
function = module.load_function(BFunc, layoutfuncname)
|
||||
layout = []
|
||||
num = 0
|
||||
while True:
|
||||
x = function(num)
|
||||
if x < 0: break
|
||||
layout.append(x)
|
||||
num += 1
|
||||
if isinstance(tp, model.StructOrUnion) and tp.partial:
|
||||
# use the function()'s sizes and offsets to guide the
|
||||
# layout of the struct
|
||||
totalsize = layout[0]
|
||||
totalalignment = layout[1]
|
||||
fieldofs = layout[2::2]
|
||||
fieldsize = layout[3::2]
|
||||
tp.force_flatten()
|
||||
assert len(fieldofs) == len(fieldsize) == len(tp.fldnames)
|
||||
tp.fixedlayout = fieldofs, fieldsize, totalsize, totalalignment
|
||||
else:
|
||||
cname = ('%s %s' % (prefix, name)).strip()
|
||||
self._struct_pending_verification[tp] = layout, cname
|
||||
|
||||
def _loaded_struct_or_union(self, tp):
|
||||
if tp.fldnames is None:
|
||||
return # nothing to do with opaque structs
|
||||
self.ffi._get_cached_btype(tp) # force 'fixedlayout' to be considered
|
||||
|
||||
if tp in self._struct_pending_verification:
|
||||
# check that the layout sizes and offsets match the real ones
|
||||
def check(realvalue, expectedvalue, msg):
|
||||
if realvalue != expectedvalue:
|
||||
raise VerificationError(
|
||||
"%s (we have %d, but C compiler says %d)"
|
||||
% (msg, expectedvalue, realvalue))
|
||||
ffi = self.ffi
|
||||
BStruct = ffi._get_cached_btype(tp)
|
||||
layout, cname = self._struct_pending_verification.pop(tp)
|
||||
check(layout[0], ffi.sizeof(BStruct), "wrong total size")
|
||||
check(layout[1], ffi.alignof(BStruct), "wrong total alignment")
|
||||
i = 2
|
||||
for fname, ftype, fbitsize, fqual in tp.enumfields():
|
||||
if fbitsize >= 0:
|
||||
continue # xxx ignore fbitsize for now
|
||||
check(layout[i], ffi.offsetof(BStruct, fname),
|
||||
"wrong offset for field %r" % (fname,))
|
||||
if layout[i+1] != 0:
|
||||
BField = ffi._get_cached_btype(ftype)
|
||||
check(layout[i+1], ffi.sizeof(BField),
|
||||
"wrong size for field %r" % (fname,))
|
||||
i += 2
|
||||
assert i == len(layout)
|
||||
|
||||
# ----------
|
||||
# 'anonymous' declarations. These are produced for anonymous structs
|
||||
# or unions; the 'name' is obtained by a typedef.
|
||||
|
||||
def _generate_gen_anonymous_decl(self, tp, name):
|
||||
if isinstance(tp, model.EnumType):
|
||||
self._generate_gen_enum_decl(tp, name, '')
|
||||
else:
|
||||
self._generate_struct_or_union_decl(tp, '', name)
|
||||
|
||||
def _loading_gen_anonymous(self, tp, name, module):
|
||||
if isinstance(tp, model.EnumType):
|
||||
self._loading_gen_enum(tp, name, module, '')
|
||||
else:
|
||||
self._loading_struct_or_union(tp, '', name, module)
|
||||
|
||||
def _loaded_gen_anonymous(self, tp, name, module, **kwds):
|
||||
if isinstance(tp, model.EnumType):
|
||||
self._loaded_gen_enum(tp, name, module, **kwds)
|
||||
else:
|
||||
self._loaded_struct_or_union(tp)
|
||||
|
||||
# ----------
|
||||
# constants, likely declared with '#define'
|
||||
|
||||
def _generate_gen_const(self, is_int, name, tp=None, category='const',
|
||||
check_value=None):
|
||||
prnt = self._prnt
|
||||
funcname = '_cffi_%s_%s' % (category, name)
|
||||
self.export_symbols.append(funcname)
|
||||
if check_value is not None:
|
||||
assert is_int
|
||||
assert category == 'const'
|
||||
prnt('int %s(char *out_error)' % funcname)
|
||||
prnt('{')
|
||||
self._check_int_constant_value(name, check_value)
|
||||
prnt(' return 0;')
|
||||
prnt('}')
|
||||
elif is_int:
|
||||
assert category == 'const'
|
||||
prnt('int %s(long long *out_value)' % funcname)
|
||||
prnt('{')
|
||||
prnt(' *out_value = (long long)(%s);' % (name,))
|
||||
prnt(' return (%s) <= 0;' % (name,))
|
||||
prnt('}')
|
||||
else:
|
||||
assert tp is not None
|
||||
assert check_value is None
|
||||
if category == 'var':
|
||||
ampersand = '&'
|
||||
else:
|
||||
ampersand = ''
|
||||
extra = ''
|
||||
if category == 'const' and isinstance(tp, model.StructOrUnion):
|
||||
extra = 'const *'
|
||||
ampersand = '&'
|
||||
prnt(tp.get_c_name(' %s%s(void)' % (extra, funcname), name))
|
||||
prnt('{')
|
||||
prnt(' return (%s%s);' % (ampersand, name))
|
||||
prnt('}')
|
||||
prnt()
|
||||
|
||||
def _generate_gen_constant_decl(self, tp, name):
|
||||
is_int = isinstance(tp, model.PrimitiveType) and tp.is_integer_type()
|
||||
self._generate_gen_const(is_int, name, tp)
|
||||
|
||||
_loading_gen_constant = _loaded_noop
|
||||
|
||||
def _load_constant(self, is_int, tp, name, module, check_value=None):
|
||||
funcname = '_cffi_const_%s' % name
|
||||
if check_value is not None:
|
||||
assert is_int
|
||||
self._load_known_int_constant(module, funcname)
|
||||
value = check_value
|
||||
elif is_int:
|
||||
BType = self.ffi._typeof_locked("long long*")[0]
|
||||
BFunc = self.ffi._typeof_locked("int(*)(long long*)")[0]
|
||||
function = module.load_function(BFunc, funcname)
|
||||
p = self.ffi.new(BType)
|
||||
negative = function(p)
|
||||
value = int(p[0])
|
||||
if value < 0 and not negative:
|
||||
BLongLong = self.ffi._typeof_locked("long long")[0]
|
||||
value += (1 << (8*self.ffi.sizeof(BLongLong)))
|
||||
else:
|
||||
assert check_value is None
|
||||
fntypeextra = '(*)(void)'
|
||||
if isinstance(tp, model.StructOrUnion):
|
||||
fntypeextra = '*' + fntypeextra
|
||||
BFunc = self.ffi._typeof_locked(tp.get_c_name(fntypeextra, name))[0]
|
||||
function = module.load_function(BFunc, funcname)
|
||||
value = function()
|
||||
if isinstance(tp, model.StructOrUnion):
|
||||
value = value[0]
|
||||
return value
|
||||
|
||||
def _loaded_gen_constant(self, tp, name, module, library):
|
||||
is_int = isinstance(tp, model.PrimitiveType) and tp.is_integer_type()
|
||||
value = self._load_constant(is_int, tp, name, module)
|
||||
setattr(library, name, value)
|
||||
type(library)._cffi_dir.append(name)
|
||||
|
||||
# ----------
|
||||
# enums
|
||||
|
||||
def _check_int_constant_value(self, name, value):
|
||||
prnt = self._prnt
|
||||
if value <= 0:
|
||||
prnt(' if ((%s) > 0 || (long)(%s) != %dL) {' % (
|
||||
name, name, value))
|
||||
else:
|
||||
prnt(' if ((%s) <= 0 || (unsigned long)(%s) != %dUL) {' % (
|
||||
name, name, value))
|
||||
prnt(' char buf[64];')
|
||||
prnt(' if ((%s) <= 0)' % name)
|
||||
prnt(' sprintf(buf, "%%ld", (long)(%s));' % name)
|
||||
prnt(' else')
|
||||
prnt(' sprintf(buf, "%%lu", (unsigned long)(%s));' %
|
||||
name)
|
||||
prnt(' sprintf(out_error, "%s has the real value %s, not %s",')
|
||||
prnt(' "%s", buf, "%d");' % (name[:100], value))
|
||||
prnt(' return -1;')
|
||||
prnt(' }')
|
||||
|
||||
def _load_known_int_constant(self, module, funcname):
|
||||
BType = self.ffi._typeof_locked("char[]")[0]
|
||||
BFunc = self.ffi._typeof_locked("int(*)(char*)")[0]
|
||||
function = module.load_function(BFunc, funcname)
|
||||
p = self.ffi.new(BType, 256)
|
||||
if function(p) < 0:
|
||||
error = self.ffi.string(p)
|
||||
if sys.version_info >= (3,):
|
||||
error = str(error, 'utf-8')
|
||||
raise VerificationError(error)
|
||||
|
||||
def _enum_funcname(self, prefix, name):
|
||||
# "$enum_$1" => "___D_enum____D_1"
|
||||
name = name.replace('$', '___D_')
|
||||
return '_cffi_e_%s_%s' % (prefix, name)
|
||||
|
||||
def _generate_gen_enum_decl(self, tp, name, prefix='enum'):
|
||||
if tp.partial:
|
||||
for enumerator in tp.enumerators:
|
||||
self._generate_gen_const(True, enumerator)
|
||||
return
|
||||
#
|
||||
funcname = self._enum_funcname(prefix, name)
|
||||
self.export_symbols.append(funcname)
|
||||
prnt = self._prnt
|
||||
prnt('int %s(char *out_error)' % funcname)
|
||||
prnt('{')
|
||||
for enumerator, enumvalue in zip(tp.enumerators, tp.enumvalues):
|
||||
self._check_int_constant_value(enumerator, enumvalue)
|
||||
prnt(' return 0;')
|
||||
prnt('}')
|
||||
prnt()
|
||||
|
||||
def _loading_gen_enum(self, tp, name, module, prefix='enum'):
|
||||
if tp.partial:
|
||||
enumvalues = [self._load_constant(True, tp, enumerator, module)
|
||||
for enumerator in tp.enumerators]
|
||||
tp.enumvalues = tuple(enumvalues)
|
||||
tp.partial_resolved = True
|
||||
else:
|
||||
funcname = self._enum_funcname(prefix, name)
|
||||
self._load_known_int_constant(module, funcname)
|
||||
|
||||
def _loaded_gen_enum(self, tp, name, module, library):
|
||||
for enumerator, enumvalue in zip(tp.enumerators, tp.enumvalues):
|
||||
setattr(library, enumerator, enumvalue)
|
||||
type(library)._cffi_dir.append(enumerator)
|
||||
|
||||
# ----------
|
||||
# macros: for now only for integers
|
||||
|
||||
def _generate_gen_macro_decl(self, tp, name):
|
||||
if tp == '...':
|
||||
check_value = None
|
||||
else:
|
||||
check_value = tp # an integer
|
||||
self._generate_gen_const(True, name, check_value=check_value)
|
||||
|
||||
_loading_gen_macro = _loaded_noop
|
||||
|
||||
def _loaded_gen_macro(self, tp, name, module, library):
|
||||
if tp == '...':
|
||||
check_value = None
|
||||
else:
|
||||
check_value = tp # an integer
|
||||
value = self._load_constant(True, tp, name, module,
|
||||
check_value=check_value)
|
||||
setattr(library, name, value)
|
||||
type(library)._cffi_dir.append(name)
|
||||
|
||||
# ----------
|
||||
# global variables
|
||||
|
||||
def _generate_gen_variable_decl(self, tp, name):
|
||||
if isinstance(tp, model.ArrayType):
|
||||
if tp.length_is_unknown():
|
||||
prnt = self._prnt
|
||||
funcname = '_cffi_sizeof_%s' % (name,)
|
||||
self.export_symbols.append(funcname)
|
||||
prnt("size_t %s(void)" % funcname)
|
||||
prnt("{")
|
||||
prnt(" return sizeof(%s);" % (name,))
|
||||
prnt("}")
|
||||
tp_ptr = model.PointerType(tp.item)
|
||||
self._generate_gen_const(False, name, tp_ptr)
|
||||
else:
|
||||
tp_ptr = model.PointerType(tp)
|
||||
self._generate_gen_const(False, name, tp_ptr, category='var')
|
||||
|
||||
_loading_gen_variable = _loaded_noop
|
||||
|
||||
def _loaded_gen_variable(self, tp, name, module, library):
|
||||
if isinstance(tp, model.ArrayType): # int a[5] is "constant" in the
|
||||
# sense that "a=..." is forbidden
|
||||
if tp.length_is_unknown():
|
||||
funcname = '_cffi_sizeof_%s' % (name,)
|
||||
BFunc = self.ffi._typeof_locked('size_t(*)(void)')[0]
|
||||
function = module.load_function(BFunc, funcname)
|
||||
size = function()
|
||||
BItemType = self.ffi._get_cached_btype(tp.item)
|
||||
length, rest = divmod(size, self.ffi.sizeof(BItemType))
|
||||
if rest != 0:
|
||||
raise VerificationError(
|
||||
"bad size: %r does not seem to be an array of %s" %
|
||||
(name, tp.item))
|
||||
tp = tp.resolve_length(length)
|
||||
tp_ptr = model.PointerType(tp.item)
|
||||
value = self._load_constant(False, tp_ptr, name, module)
|
||||
# 'value' is a <cdata 'type *'> which we have to replace with
|
||||
# a <cdata 'type[N]'> if the N is actually known
|
||||
if tp.length is not None:
|
||||
BArray = self.ffi._get_cached_btype(tp)
|
||||
value = self.ffi.cast(BArray, value)
|
||||
setattr(library, name, value)
|
||||
type(library)._cffi_dir.append(name)
|
||||
return
|
||||
# remove ptr=<cdata 'int *'> from the library instance, and replace
|
||||
# it by a property on the class, which reads/writes into ptr[0].
|
||||
funcname = '_cffi_var_%s' % name
|
||||
BFunc = self.ffi._typeof_locked(tp.get_c_name('*(*)(void)', name))[0]
|
||||
function = module.load_function(BFunc, funcname)
|
||||
ptr = function()
|
||||
def getter(library):
|
||||
return ptr[0]
|
||||
def setter(library, value):
|
||||
ptr[0] = value
|
||||
setattr(type(library), name, property(getter, setter))
|
||||
type(library)._cffi_dir.append(name)
|
||||
|
||||
cffimod_header = r'''
|
||||
#include <stdio.h>
|
||||
#include <stddef.h>
|
||||
#include <stdarg.h>
|
||||
#include <errno.h>
|
||||
#include <sys/types.h> /* XXX for ssize_t on some platforms */
|
||||
|
||||
/* this block of #ifs should be kept exactly identical between
|
||||
c/_cffi_backend.c, cffi/vengine_cpy.py, cffi/vengine_gen.py
|
||||
and cffi/_cffi_include.h */
|
||||
#if defined(_MSC_VER)
|
||||
# include <malloc.h> /* for alloca() */
|
||||
# if _MSC_VER < 1600 /* MSVC < 2010 */
|
||||
typedef __int8 int8_t;
|
||||
typedef __int16 int16_t;
|
||||
typedef __int32 int32_t;
|
||||
typedef __int64 int64_t;
|
||||
typedef unsigned __int8 uint8_t;
|
||||
typedef unsigned __int16 uint16_t;
|
||||
typedef unsigned __int32 uint32_t;
|
||||
typedef unsigned __int64 uint64_t;
|
||||
typedef __int8 int_least8_t;
|
||||
typedef __int16 int_least16_t;
|
||||
typedef __int32 int_least32_t;
|
||||
typedef __int64 int_least64_t;
|
||||
typedef unsigned __int8 uint_least8_t;
|
||||
typedef unsigned __int16 uint_least16_t;
|
||||
typedef unsigned __int32 uint_least32_t;
|
||||
typedef unsigned __int64 uint_least64_t;
|
||||
typedef __int8 int_fast8_t;
|
||||
typedef __int16 int_fast16_t;
|
||||
typedef __int32 int_fast32_t;
|
||||
typedef __int64 int_fast64_t;
|
||||
typedef unsigned __int8 uint_fast8_t;
|
||||
typedef unsigned __int16 uint_fast16_t;
|
||||
typedef unsigned __int32 uint_fast32_t;
|
||||
typedef unsigned __int64 uint_fast64_t;
|
||||
typedef __int64 intmax_t;
|
||||
typedef unsigned __int64 uintmax_t;
|
||||
# else
|
||||
# include <stdint.h>
|
||||
# endif
|
||||
# if _MSC_VER < 1800 /* MSVC < 2013 */
|
||||
# ifndef __cplusplus
|
||||
typedef unsigned char _Bool;
|
||||
# endif
|
||||
# endif
|
||||
# define _cffi_float_complex_t _Fcomplex /* include <complex.h> for it */
|
||||
# define _cffi_double_complex_t _Dcomplex /* include <complex.h> for it */
|
||||
#else
|
||||
# include <stdint.h>
|
||||
# if (defined (__SVR4) && defined (__sun)) || defined(_AIX) || defined(__hpux)
|
||||
# include <alloca.h>
|
||||
# endif
|
||||
# define _cffi_float_complex_t float _Complex
|
||||
# define _cffi_double_complex_t double _Complex
|
||||
#endif
|
||||
'''
|
||||
@@ -0,0 +1,306 @@
|
||||
#
|
||||
# DEPRECATED: implementation for ffi.verify()
|
||||
#
|
||||
import sys, os, binascii, shutil, io
|
||||
from . import __version_verifier_modules__
|
||||
from . import ffiplatform
|
||||
from .error import VerificationError
|
||||
|
||||
if sys.version_info >= (3, 3):
|
||||
import importlib.machinery
|
||||
def _extension_suffixes():
|
||||
return importlib.machinery.EXTENSION_SUFFIXES[:]
|
||||
else:
|
||||
import imp
|
||||
def _extension_suffixes():
|
||||
return [suffix for suffix, _, type in imp.get_suffixes()
|
||||
if type == imp.C_EXTENSION]
|
||||
|
||||
|
||||
if sys.version_info >= (3,):
|
||||
NativeIO = io.StringIO
|
||||
else:
|
||||
class NativeIO(io.BytesIO):
|
||||
def write(self, s):
|
||||
if isinstance(s, unicode):
|
||||
s = s.encode('ascii')
|
||||
super(NativeIO, self).write(s)
|
||||
|
||||
|
||||
class Verifier(object):
|
||||
|
||||
def __init__(self, ffi, preamble, tmpdir=None, modulename=None,
|
||||
ext_package=None, tag='', force_generic_engine=False,
|
||||
source_extension='.c', flags=None, relative_to=None, **kwds):
|
||||
if ffi._parser._uses_new_feature:
|
||||
raise VerificationError(
|
||||
"feature not supported with ffi.verify(), but only "
|
||||
"with ffi.set_source(): %s" % (ffi._parser._uses_new_feature,))
|
||||
self.ffi = ffi
|
||||
self.preamble = preamble
|
||||
if not modulename:
|
||||
flattened_kwds = ffiplatform.flatten(kwds)
|
||||
vengine_class = _locate_engine_class(ffi, force_generic_engine)
|
||||
self._vengine = vengine_class(self)
|
||||
self._vengine.patch_extension_kwds(kwds)
|
||||
self.flags = flags
|
||||
self.kwds = self.make_relative_to(kwds, relative_to)
|
||||
#
|
||||
if modulename:
|
||||
if tag:
|
||||
raise TypeError("can't specify both 'modulename' and 'tag'")
|
||||
else:
|
||||
key = '\x00'.join(['%d.%d' % sys.version_info[:2],
|
||||
__version_verifier_modules__,
|
||||
preamble, flattened_kwds] +
|
||||
ffi._cdefsources)
|
||||
if sys.version_info >= (3,):
|
||||
key = key.encode('utf-8')
|
||||
k1 = hex(binascii.crc32(key[0::2]) & 0xffffffff)
|
||||
k1 = k1.lstrip('0x').rstrip('L')
|
||||
k2 = hex(binascii.crc32(key[1::2]) & 0xffffffff)
|
||||
k2 = k2.lstrip('0').rstrip('L')
|
||||
modulename = '_cffi_%s_%s%s%s' % (tag, self._vengine._class_key,
|
||||
k1, k2)
|
||||
suffix = _get_so_suffixes()[0]
|
||||
self.tmpdir = tmpdir or _caller_dir_pycache()
|
||||
self.sourcefilename = os.path.join(self.tmpdir, modulename + source_extension)
|
||||
self.modulefilename = os.path.join(self.tmpdir, modulename + suffix)
|
||||
self.ext_package = ext_package
|
||||
self._has_source = False
|
||||
self._has_module = False
|
||||
|
||||
def write_source(self, file=None):
|
||||
"""Write the C source code. It is produced in 'self.sourcefilename',
|
||||
which can be tweaked beforehand."""
|
||||
with self.ffi._lock:
|
||||
if self._has_source and file is None:
|
||||
raise VerificationError(
|
||||
"source code already written")
|
||||
self._write_source(file)
|
||||
|
||||
def compile_module(self):
|
||||
"""Write the C source code (if not done already) and compile it.
|
||||
This produces a dynamic link library in 'self.modulefilename'."""
|
||||
with self.ffi._lock:
|
||||
if self._has_module:
|
||||
raise VerificationError("module already compiled")
|
||||
if not self._has_source:
|
||||
self._write_source()
|
||||
self._compile_module()
|
||||
|
||||
def load_library(self):
|
||||
"""Get a C module from this Verifier instance.
|
||||
Returns an instance of a FFILibrary class that behaves like the
|
||||
objects returned by ffi.dlopen(), but that delegates all
|
||||
operations to the C module. If necessary, the C code is written
|
||||
and compiled first.
|
||||
"""
|
||||
with self.ffi._lock:
|
||||
if not self._has_module:
|
||||
self._locate_module()
|
||||
if not self._has_module:
|
||||
if not self._has_source:
|
||||
self._write_source()
|
||||
self._compile_module()
|
||||
return self._load_library()
|
||||
|
||||
def get_module_name(self):
|
||||
basename = os.path.basename(self.modulefilename)
|
||||
# kill both the .so extension and the other .'s, as introduced
|
||||
# by Python 3: 'basename.cpython-33m.so'
|
||||
basename = basename.split('.', 1)[0]
|
||||
# and the _d added in Python 2 debug builds --- but try to be
|
||||
# conservative and not kill a legitimate _d
|
||||
if basename.endswith('_d') and hasattr(sys, 'gettotalrefcount'):
|
||||
basename = basename[:-2]
|
||||
return basename
|
||||
|
||||
def get_extension(self):
|
||||
if not self._has_source:
|
||||
with self.ffi._lock:
|
||||
if not self._has_source:
|
||||
self._write_source()
|
||||
sourcename = ffiplatform.maybe_relative_path(self.sourcefilename)
|
||||
modname = self.get_module_name()
|
||||
return ffiplatform.get_extension(sourcename, modname, **self.kwds)
|
||||
|
||||
def generates_python_module(self):
|
||||
return self._vengine._gen_python_module
|
||||
|
||||
def make_relative_to(self, kwds, relative_to):
|
||||
if relative_to and os.path.dirname(relative_to):
|
||||
dirname = os.path.dirname(relative_to)
|
||||
kwds = kwds.copy()
|
||||
for key in ffiplatform.LIST_OF_FILE_NAMES:
|
||||
if key in kwds:
|
||||
lst = kwds[key]
|
||||
if not isinstance(lst, (list, tuple)):
|
||||
raise TypeError("keyword '%s' should be a list or tuple"
|
||||
% (key,))
|
||||
lst = [os.path.join(dirname, fn) for fn in lst]
|
||||
kwds[key] = lst
|
||||
return kwds
|
||||
|
||||
# ----------
|
||||
|
||||
def _locate_module(self):
|
||||
if not os.path.isfile(self.modulefilename):
|
||||
if self.ext_package:
|
||||
try:
|
||||
pkg = __import__(self.ext_package, None, None, ['__doc__'])
|
||||
except ImportError:
|
||||
return # cannot import the package itself, give up
|
||||
# (e.g. it might be called differently before installation)
|
||||
path = pkg.__path__
|
||||
else:
|
||||
path = None
|
||||
filename = self._vengine.find_module(self.get_module_name(), path,
|
||||
_get_so_suffixes())
|
||||
if filename is None:
|
||||
return
|
||||
self.modulefilename = filename
|
||||
self._vengine.collect_types()
|
||||
self._has_module = True
|
||||
|
||||
def _write_source_to(self, file):
|
||||
self._vengine._f = file
|
||||
try:
|
||||
self._vengine.write_source_to_f()
|
||||
finally:
|
||||
del self._vengine._f
|
||||
|
||||
def _write_source(self, file=None):
|
||||
if file is not None:
|
||||
self._write_source_to(file)
|
||||
else:
|
||||
# Write our source file to an in memory file.
|
||||
f = NativeIO()
|
||||
self._write_source_to(f)
|
||||
source_data = f.getvalue()
|
||||
|
||||
# Determine if this matches the current file
|
||||
if os.path.exists(self.sourcefilename):
|
||||
with open(self.sourcefilename, "r") as fp:
|
||||
needs_written = not (fp.read() == source_data)
|
||||
else:
|
||||
needs_written = True
|
||||
|
||||
# Actually write the file out if it doesn't match
|
||||
if needs_written:
|
||||
_ensure_dir(self.sourcefilename)
|
||||
with open(self.sourcefilename, "w") as fp:
|
||||
fp.write(source_data)
|
||||
|
||||
# Set this flag
|
||||
self._has_source = True
|
||||
|
||||
def _compile_module(self):
|
||||
# compile this C source
|
||||
tmpdir = os.path.dirname(self.sourcefilename)
|
||||
outputfilename = ffiplatform.compile(tmpdir, self.get_extension())
|
||||
try:
|
||||
same = ffiplatform.samefile(outputfilename, self.modulefilename)
|
||||
except OSError:
|
||||
same = False
|
||||
if not same:
|
||||
_ensure_dir(self.modulefilename)
|
||||
shutil.move(outputfilename, self.modulefilename)
|
||||
self._has_module = True
|
||||
|
||||
def _load_library(self):
|
||||
assert self._has_module
|
||||
if self.flags is not None:
|
||||
return self._vengine.load_library(self.flags)
|
||||
else:
|
||||
return self._vengine.load_library()
|
||||
|
||||
# ____________________________________________________________
|
||||
|
||||
_FORCE_GENERIC_ENGINE = False # for tests
|
||||
|
||||
def _locate_engine_class(ffi, force_generic_engine):
|
||||
if _FORCE_GENERIC_ENGINE:
|
||||
force_generic_engine = True
|
||||
if not force_generic_engine:
|
||||
if '__pypy__' in sys.builtin_module_names:
|
||||
force_generic_engine = True
|
||||
else:
|
||||
try:
|
||||
import _cffi_backend
|
||||
except ImportError:
|
||||
_cffi_backend = '?'
|
||||
if ffi._backend is not _cffi_backend:
|
||||
force_generic_engine = True
|
||||
if force_generic_engine:
|
||||
from . import vengine_gen
|
||||
return vengine_gen.VGenericEngine
|
||||
else:
|
||||
from . import vengine_cpy
|
||||
return vengine_cpy.VCPythonEngine
|
||||
|
||||
# ____________________________________________________________
|
||||
|
||||
_TMPDIR = None
|
||||
|
||||
def _caller_dir_pycache():
|
||||
if _TMPDIR:
|
||||
return _TMPDIR
|
||||
result = os.environ.get('CFFI_TMPDIR')
|
||||
if result:
|
||||
return result
|
||||
filename = sys._getframe(2).f_code.co_filename
|
||||
return os.path.abspath(os.path.join(os.path.dirname(filename),
|
||||
'__pycache__'))
|
||||
|
||||
def set_tmpdir(dirname):
|
||||
"""Set the temporary directory to use instead of __pycache__."""
|
||||
global _TMPDIR
|
||||
_TMPDIR = dirname
|
||||
|
||||
def cleanup_tmpdir(tmpdir=None, keep_so=False):
|
||||
"""Clean up the temporary directory by removing all files in it
|
||||
called `_cffi_*.{c,so}` as well as the `build` subdirectory."""
|
||||
tmpdir = tmpdir or _caller_dir_pycache()
|
||||
try:
|
||||
filelist = os.listdir(tmpdir)
|
||||
except OSError:
|
||||
return
|
||||
if keep_so:
|
||||
suffix = '.c' # only remove .c files
|
||||
else:
|
||||
suffix = _get_so_suffixes()[0].lower()
|
||||
for fn in filelist:
|
||||
if fn.lower().startswith('_cffi_') and (
|
||||
fn.lower().endswith(suffix) or fn.lower().endswith('.c')):
|
||||
try:
|
||||
os.unlink(os.path.join(tmpdir, fn))
|
||||
except OSError:
|
||||
pass
|
||||
clean_dir = [os.path.join(tmpdir, 'build')]
|
||||
for dir in clean_dir:
|
||||
try:
|
||||
for fn in os.listdir(dir):
|
||||
fn = os.path.join(dir, fn)
|
||||
if os.path.isdir(fn):
|
||||
clean_dir.append(fn)
|
||||
else:
|
||||
os.unlink(fn)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
def _get_so_suffixes():
|
||||
suffixes = _extension_suffixes()
|
||||
if not suffixes:
|
||||
# bah, no C_EXTENSION available. Occurs on pypy without cpyext
|
||||
if sys.platform == 'win32':
|
||||
suffixes = [".pyd"]
|
||||
else:
|
||||
suffixes = [".so"]
|
||||
|
||||
return suffixes
|
||||
|
||||
def _ensure_dir(filename):
|
||||
dirname = os.path.dirname(filename)
|
||||
if dirname and not os.path.isdir(dirname):
|
||||
os.makedirs(dirname)
|
||||
+1
@@ -0,0 +1 @@
|
||||
pip
|
||||
+764
@@ -0,0 +1,764 @@
|
||||
Metadata-Version: 2.4
|
||||
Name: charset-normalizer
|
||||
Version: 3.4.4
|
||||
Summary: The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.
|
||||
Author-email: "Ahmed R. TAHRI" <tahri.ahmed@proton.me>
|
||||
Maintainer-email: "Ahmed R. TAHRI" <tahri.ahmed@proton.me>
|
||||
License: MIT
|
||||
Project-URL: Changelog, https://github.com/jawah/charset_normalizer/blob/master/CHANGELOG.md
|
||||
Project-URL: Documentation, https://charset-normalizer.readthedocs.io/
|
||||
Project-URL: Code, https://github.com/jawah/charset_normalizer
|
||||
Project-URL: Issue tracker, https://github.com/jawah/charset_normalizer/issues
|
||||
Keywords: encoding,charset,charset-detector,detector,normalization,unicode,chardet,detect
|
||||
Classifier: Development Status :: 5 - Production/Stable
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: Operating System :: OS Independent
|
||||
Classifier: Programming Language :: Python
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3.7
|
||||
Classifier: Programming Language :: Python :: 3.8
|
||||
Classifier: Programming Language :: Python :: 3.9
|
||||
Classifier: Programming Language :: Python :: 3.10
|
||||
Classifier: Programming Language :: Python :: 3.11
|
||||
Classifier: Programming Language :: Python :: 3.12
|
||||
Classifier: Programming Language :: Python :: 3.13
|
||||
Classifier: Programming Language :: Python :: 3.14
|
||||
Classifier: Programming Language :: Python :: 3 :: Only
|
||||
Classifier: Programming Language :: Python :: Implementation :: CPython
|
||||
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
||||
Classifier: Topic :: Text Processing :: Linguistic
|
||||
Classifier: Topic :: Utilities
|
||||
Classifier: Typing :: Typed
|
||||
Requires-Python: >=3.7
|
||||
Description-Content-Type: text/markdown
|
||||
License-File: LICENSE
|
||||
Provides-Extra: unicode-backport
|
||||
Dynamic: license-file
|
||||
|
||||
<h1 align="center">Charset Detection, for Everyone 👋</h1>
|
||||
|
||||
<p align="center">
|
||||
<sup>The Real First Universal Charset Detector</sup><br>
|
||||
<a href="https://pypi.org/project/charset-normalizer">
|
||||
<img src="https://img.shields.io/pypi/pyversions/charset_normalizer.svg?orange=blue" />
|
||||
</a>
|
||||
<a href="https://pepy.tech/project/charset-normalizer/">
|
||||
<img alt="Download Count Total" src="https://static.pepy.tech/badge/charset-normalizer/month" />
|
||||
</a>
|
||||
<a href="https://bestpractices.coreinfrastructure.org/projects/7297">
|
||||
<img src="https://bestpractices.coreinfrastructure.org/projects/7297/badge">
|
||||
</a>
|
||||
</p>
|
||||
<p align="center">
|
||||
<sup><i>Featured Packages</i></sup><br>
|
||||
<a href="https://github.com/jawah/niquests">
|
||||
<img alt="Static Badge" src="https://img.shields.io/badge/Niquests-Most_Advanced_HTTP_Client-cyan">
|
||||
</a>
|
||||
<a href="https://github.com/jawah/wassima">
|
||||
<img alt="Static Badge" src="https://img.shields.io/badge/Wassima-Certifi_Replacement-cyan">
|
||||
</a>
|
||||
</p>
|
||||
<p align="center">
|
||||
<sup><i>In other language (unofficial port - by the community)</i></sup><br>
|
||||
<a href="https://github.com/nickspring/charset-normalizer-rs">
|
||||
<img alt="Static Badge" src="https://img.shields.io/badge/Rust-red">
|
||||
</a>
|
||||
</p>
|
||||
|
||||
> A library that helps you read text from an unknown charset encoding.<br /> Motivated by `chardet`,
|
||||
> I'm trying to resolve the issue by taking a new approach.
|
||||
> All IANA character set names for which the Python core library provides codecs are supported.
|
||||
|
||||
<p align="center">
|
||||
>>>>> <a href="https://charsetnormalizerweb.ousret.now.sh" target="_blank">👉 Try Me Online Now, Then Adopt Me 👈 </a> <<<<<
|
||||
</p>
|
||||
|
||||
This project offers you an alternative to **Universal Charset Encoding Detector**, also known as **Chardet**.
|
||||
|
||||
| Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
|
||||
|--------------------------------------------------|:---------------------------------------------:|:--------------------------------------------------------------------------------------------------:|:-----------------------------------------------:|
|
||||
| `Fast` | ❌ | ✅ | ✅ |
|
||||
| `Universal**` | ❌ | ✅ | ❌ |
|
||||
| `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ |
|
||||
| `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
|
||||
| `License` | LGPL-2.1<br>_restrictive_ | MIT | MPL-1.1<br>_restrictive_ |
|
||||
| `Native Python` | ✅ | ✅ | ❌ |
|
||||
| `Detect spoken language` | ❌ | ✅ | N/A |
|
||||
| `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ |
|
||||
| `Whl Size (min)` | 193.6 kB | 42 kB | ~200 kB |
|
||||
| `Supported Encoding` | 33 | 🎉 [99](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 |
|
||||
|
||||
<p align="center">
|
||||
<img src="https://i.imgflip.com/373iay.gif" alt="Reading Normalized Text" width="226"/><img src="https://media.tenor.com/images/c0180f70732a18b4965448d33adba3d0/tenor.gif" alt="Cat Reading Text" width="200"/>
|
||||
</p>
|
||||
|
||||
*\*\* : They are clearly using specific code for a specific encoding even if covering most of used one*<br>
|
||||
|
||||
## ⚡ Performance
|
||||
|
||||
This package offer better performance than its counterpart Chardet. Here are some numbers.
|
||||
|
||||
| Package | Accuracy | Mean per file (ms) | File per sec (est) |
|
||||
|-----------------------------------------------|:--------:|:------------------:|:------------------:|
|
||||
| [chardet](https://github.com/chardet/chardet) | 86 % | 63 ms | 16 file/sec |
|
||||
| charset-normalizer | **98 %** | **10 ms** | 100 file/sec |
|
||||
|
||||
| Package | 99th percentile | 95th percentile | 50th percentile |
|
||||
|-----------------------------------------------|:---------------:|:---------------:|:---------------:|
|
||||
| [chardet](https://github.com/chardet/chardet) | 265 ms | 71 ms | 7 ms |
|
||||
| charset-normalizer | 100 ms | 50 ms | 5 ms |
|
||||
|
||||
_updated as of december 2024 using CPython 3.12_
|
||||
|
||||
Chardet's performance on larger file (1MB+) are very poor. Expect huge difference on large payload.
|
||||
|
||||
> Stats are generated using 400+ files using default parameters. More details on used files, see GHA workflows.
|
||||
> And yes, these results might change at any time. The dataset can be updated to include more files.
|
||||
> The actual delays heavily depends on your CPU capabilities. The factors should remain the same.
|
||||
> Keep in mind that the stats are generous and that Chardet accuracy vs our is measured using Chardet initial capability
|
||||
> (e.g. Supported Encoding) Challenge-them if you want.
|
||||
|
||||
## ✨ Installation
|
||||
|
||||
Using pip:
|
||||
|
||||
```sh
|
||||
pip install charset-normalizer -U
|
||||
```
|
||||
|
||||
## 🚀 Basic Usage
|
||||
|
||||
### CLI
|
||||
This package comes with a CLI.
|
||||
|
||||
```
|
||||
usage: normalizer [-h] [-v] [-a] [-n] [-m] [-r] [-f] [-t THRESHOLD]
|
||||
file [file ...]
|
||||
|
||||
The Real First Universal Charset Detector. Discover originating encoding used
|
||||
on text file. Normalize text to unicode.
|
||||
|
||||
positional arguments:
|
||||
files File(s) to be analysed
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
-v, --verbose Display complementary information about file if any.
|
||||
Stdout will contain logs about the detection process.
|
||||
-a, --with-alternative
|
||||
Output complementary possibilities if any. Top-level
|
||||
JSON WILL be a list.
|
||||
-n, --normalize Permit to normalize input file. If not set, program
|
||||
does not write anything.
|
||||
-m, --minimal Only output the charset detected to STDOUT. Disabling
|
||||
JSON output.
|
||||
-r, --replace Replace file when trying to normalize it instead of
|
||||
creating a new one.
|
||||
-f, --force Replace file without asking if you are sure, use this
|
||||
flag with caution.
|
||||
-t THRESHOLD, --threshold THRESHOLD
|
||||
Define a custom maximum amount of chaos allowed in
|
||||
decoded content. 0. <= chaos <= 1.
|
||||
--version Show version information and exit.
|
||||
```
|
||||
|
||||
```bash
|
||||
normalizer ./data/sample.1.fr.srt
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```bash
|
||||
python -m charset_normalizer ./data/sample.1.fr.srt
|
||||
```
|
||||
|
||||
🎉 Since version 1.4.0 the CLI produce easily usable stdout result in JSON format.
|
||||
|
||||
```json
|
||||
{
|
||||
"path": "/home/default/projects/charset_normalizer/data/sample.1.fr.srt",
|
||||
"encoding": "cp1252",
|
||||
"encoding_aliases": [
|
||||
"1252",
|
||||
"windows_1252"
|
||||
],
|
||||
"alternative_encodings": [
|
||||
"cp1254",
|
||||
"cp1256",
|
||||
"cp1258",
|
||||
"iso8859_14",
|
||||
"iso8859_15",
|
||||
"iso8859_16",
|
||||
"iso8859_3",
|
||||
"iso8859_9",
|
||||
"latin_1",
|
||||
"mbcs"
|
||||
],
|
||||
"language": "French",
|
||||
"alphabets": [
|
||||
"Basic Latin",
|
||||
"Latin-1 Supplement"
|
||||
],
|
||||
"has_sig_or_bom": false,
|
||||
"chaos": 0.149,
|
||||
"coherence": 97.152,
|
||||
"unicode_path": null,
|
||||
"is_preferred": true
|
||||
}
|
||||
```
|
||||
|
||||
### Python
|
||||
*Just print out normalized text*
|
||||
```python
|
||||
from charset_normalizer import from_path
|
||||
|
||||
results = from_path('./my_subtitle.srt')
|
||||
|
||||
print(str(results.best()))
|
||||
```
|
||||
|
||||
*Upgrade your code without effort*
|
||||
```python
|
||||
from charset_normalizer import detect
|
||||
```
|
||||
|
||||
The above code will behave the same as **chardet**. We ensure that we offer the best (reasonable) BC result possible.
|
||||
|
||||
See the docs for advanced usage : [readthedocs.io](https://charset-normalizer.readthedocs.io/en/latest/)
|
||||
|
||||
## 😇 Why
|
||||
|
||||
When I started using Chardet, I noticed that it was not suited to my expectations, and I wanted to propose a
|
||||
reliable alternative using a completely different method. Also! I never back down on a good challenge!
|
||||
|
||||
I **don't care** about the **originating charset** encoding, because **two different tables** can
|
||||
produce **two identical rendered string.**
|
||||
What I want is to get readable text, the best I can.
|
||||
|
||||
In a way, **I'm brute forcing text decoding.** How cool is that ? 😎
|
||||
|
||||
Don't confuse package **ftfy** with charset-normalizer or chardet. ftfy goal is to repair Unicode string whereas charset-normalizer to convert raw file in unknown encoding to unicode.
|
||||
|
||||
## 🍰 How
|
||||
|
||||
- Discard all charset encoding table that could not fit the binary content.
|
||||
- Measure noise, or the mess once opened (by chunks) with a corresponding charset encoding.
|
||||
- Extract matches with the lowest mess detected.
|
||||
- Additionally, we measure coherence / probe for a language.
|
||||
|
||||
**Wait a minute**, what is noise/mess and coherence according to **YOU ?**
|
||||
|
||||
*Noise :* I opened hundred of text files, **written by humans**, with the wrong encoding table. **I observed**, then
|
||||
**I established** some ground rules about **what is obvious** when **it seems like** a mess (aka. defining noise in rendered text).
|
||||
I know that my interpretation of what is noise is probably incomplete, feel free to contribute in order to
|
||||
improve or rewrite it.
|
||||
|
||||
*Coherence :* For each language there is on earth, we have computed ranked letter appearance occurrences (the best we can). So I thought
|
||||
that intel is worth something here. So I use those records against decoded text to check if I can detect intelligent design.
|
||||
|
||||
## ⚡ Known limitations
|
||||
|
||||
- Language detection is unreliable when text contains two or more languages sharing identical letters. (eg. HTML (english tags) + Turkish content (Sharing Latin characters))
|
||||
- Every charset detector heavily depends on sufficient content. In common cases, do not bother run detection on very tiny content.
|
||||
|
||||
## ⚠️ About Python EOLs
|
||||
|
||||
**If you are running:**
|
||||
|
||||
- Python >=2.7,<3.5: Unsupported
|
||||
- Python 3.5: charset-normalizer < 2.1
|
||||
- Python 3.6: charset-normalizer < 3.1
|
||||
- Python 3.7: charset-normalizer < 4.0
|
||||
|
||||
Upgrade your Python interpreter as soon as possible.
|
||||
|
||||
## 👤 Contributing
|
||||
|
||||
Contributions, issues and feature requests are very much welcome.<br />
|
||||
Feel free to check [issues page](https://github.com/ousret/charset_normalizer/issues) if you want to contribute.
|
||||
|
||||
## 📝 License
|
||||
|
||||
Copyright © [Ahmed TAHRI @Ousret](https://github.com/Ousret).<br />
|
||||
This project is [MIT](https://github.com/Ousret/charset_normalizer/blob/master/LICENSE) licensed.
|
||||
|
||||
Characters frequencies used in this project © 2012 [Denny Vrandečić](http://simia.net/letters/)
|
||||
|
||||
## 💼 For Enterprise
|
||||
|
||||
Professional support for charset-normalizer is available as part of the [Tidelift
|
||||
Subscription][1]. Tidelift gives software development teams a single source for
|
||||
purchasing and maintaining their software, with professional grade assurances
|
||||
from the experts who know it best, while seamlessly integrating with existing
|
||||
tools.
|
||||
|
||||
[1]: https://tidelift.com/subscription/pkg/pypi-charset-normalizer?utm_source=pypi-charset-normalizer&utm_medium=readme
|
||||
|
||||
[](https://www.bestpractices.dev/projects/7297)
|
||||
|
||||
# Changelog
|
||||
All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
|
||||
|
||||
## [3.4.4](https://github.com/Ousret/charset_normalizer/compare/3.4.2...3.4.4) (2025-10-13)
|
||||
|
||||
### Changed
|
||||
- Bound `setuptools` to a specific constraint `setuptools>=68,<=81`.
|
||||
- Raised upper bound of mypyc for the optional pre-built extension to v1.18.2
|
||||
|
||||
### Removed
|
||||
- `setuptools-scm` as a build dependency.
|
||||
|
||||
### Misc
|
||||
- Enforced hashes in `dev-requirements.txt` and created `ci-requirements.txt` for security purposes.
|
||||
- Additional pre-built wheels for riscv64, s390x, and armv7l architectures.
|
||||
- Restore ` multiple.intoto.jsonl` in GitHub releases in addition to individual attestation file per wheel.
|
||||
|
||||
## [3.4.3](https://github.com/Ousret/charset_normalizer/compare/3.4.2...3.4.3) (2025-08-09)
|
||||
|
||||
### Changed
|
||||
- mypy(c) is no longer a required dependency at build time if `CHARSET_NORMALIZER_USE_MYPYC` isn't set to `1`. (#595) (#583)
|
||||
- automatically lower confidence on small bytes samples that are not Unicode in `detect` output legacy function. (#391)
|
||||
|
||||
### Added
|
||||
- Custom build backend to overcome inability to mark mypy as an optional dependency in the build phase.
|
||||
- Support for Python 3.14
|
||||
|
||||
### Fixed
|
||||
- sdist archive contained useless directories.
|
||||
- automatically fallback on valid UTF-16 or UTF-32 even if the md says it's noisy. (#633)
|
||||
|
||||
### Misc
|
||||
- SBOM are automatically published to the relevant GitHub release to comply with regulatory changes.
|
||||
Each published wheel comes with its SBOM. We choose CycloneDX as the format.
|
||||
- Prebuilt optimized wheel are no longer distributed by default for CPython 3.7 due to a change in cibuildwheel.
|
||||
|
||||
## [3.4.2](https://github.com/Ousret/charset_normalizer/compare/3.4.1...3.4.2) (2025-05-02)
|
||||
|
||||
### Fixed
|
||||
- Addressed the DeprecationWarning in our CLI regarding `argparse.FileType` by backporting the target class into the package. (#591)
|
||||
- Improved the overall reliability of the detector with CJK Ideographs. (#605) (#587)
|
||||
|
||||
### Changed
|
||||
- Optional mypyc compilation upgraded to version 1.15 for Python >= 3.8
|
||||
|
||||
## [3.4.1](https://github.com/Ousret/charset_normalizer/compare/3.4.0...3.4.1) (2024-12-24)
|
||||
|
||||
### Changed
|
||||
- Project metadata are now stored using `pyproject.toml` instead of `setup.cfg` using setuptools as the build backend.
|
||||
- Enforce annotation delayed loading for a simpler and consistent types in the project.
|
||||
- Optional mypyc compilation upgraded to version 1.14 for Python >= 3.8
|
||||
|
||||
### Added
|
||||
- pre-commit configuration.
|
||||
- noxfile.
|
||||
|
||||
### Removed
|
||||
- `build-requirements.txt` as per using `pyproject.toml` native build configuration.
|
||||
- `bin/integration.py` and `bin/serve.py` in favor of downstream integration test (see noxfile).
|
||||
- `setup.cfg` in favor of `pyproject.toml` metadata configuration.
|
||||
- Unused `utils.range_scan` function.
|
||||
|
||||
### Fixed
|
||||
- Converting content to Unicode bytes may insert `utf_8` instead of preferred `utf-8`. (#572)
|
||||
- Deprecation warning "'count' is passed as positional argument" when converting to Unicode bytes on Python 3.13+
|
||||
|
||||
## [3.4.0](https://github.com/Ousret/charset_normalizer/compare/3.3.2...3.4.0) (2024-10-08)
|
||||
|
||||
### Added
|
||||
- Argument `--no-preemptive` in the CLI to prevent the detector to search for hints.
|
||||
- Support for Python 3.13 (#512)
|
||||
|
||||
### Fixed
|
||||
- Relax the TypeError exception thrown when trying to compare a CharsetMatch with anything else than a CharsetMatch.
|
||||
- Improved the general reliability of the detector based on user feedbacks. (#520) (#509) (#498) (#407) (#537)
|
||||
- Declared charset in content (preemptive detection) not changed when converting to utf-8 bytes. (#381)
|
||||
|
||||
## [3.3.2](https://github.com/Ousret/charset_normalizer/compare/3.3.1...3.3.2) (2023-10-31)
|
||||
|
||||
### Fixed
|
||||
- Unintentional memory usage regression when using large payload that match several encoding (#376)
|
||||
- Regression on some detection case showcased in the documentation (#371)
|
||||
|
||||
### Added
|
||||
- Noise (md) probe that identify malformed arabic representation due to the presence of letters in isolated form (credit to my wife)
|
||||
|
||||
## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-22)
|
||||
|
||||
### Changed
|
||||
- Optional mypyc compilation upgraded to version 1.6.1 for Python >= 3.8
|
||||
- Improved the general detection reliability based on reports from the community
|
||||
|
||||
## [3.3.0](https://github.com/Ousret/charset_normalizer/compare/3.2.0...3.3.0) (2023-09-30)
|
||||
|
||||
### Added
|
||||
- Allow to execute the CLI (e.g. normalizer) through `python -m charset_normalizer.cli` or `python -m charset_normalizer`
|
||||
- Support for 9 forgotten encoding that are supported by Python but unlisted in `encoding.aliases` as they have no alias (#323)
|
||||
|
||||
### Removed
|
||||
- (internal) Redundant utils.is_ascii function and unused function is_private_use_only
|
||||
- (internal) charset_normalizer.assets is moved inside charset_normalizer.constant
|
||||
|
||||
### Changed
|
||||
- (internal) Unicode code blocks in constants are updated using the latest v15.0.0 definition to improve detection
|
||||
- Optional mypyc compilation upgraded to version 1.5.1 for Python >= 3.8
|
||||
|
||||
### Fixed
|
||||
- Unable to properly sort CharsetMatch when both chaos/noise and coherence were close due to an unreachable condition in \_\_lt\_\_ (#350)
|
||||
|
||||
## [3.2.0](https://github.com/Ousret/charset_normalizer/compare/3.1.0...3.2.0) (2023-06-07)
|
||||
|
||||
### Changed
|
||||
- Typehint for function `from_path` no longer enforce `PathLike` as its first argument
|
||||
- Minor improvement over the global detection reliability
|
||||
|
||||
### Added
|
||||
- Introduce function `is_binary` that relies on main capabilities, and optimized to detect binaries
|
||||
- Propagate `enable_fallback` argument throughout `from_bytes`, `from_path`, and `from_fp` that allow a deeper control over the detection (default True)
|
||||
- Explicit support for Python 3.12
|
||||
|
||||
### Fixed
|
||||
- Edge case detection failure where a file would contain 'very-long' camel cased word (Issue #289)
|
||||
|
||||
## [3.1.0](https://github.com/Ousret/charset_normalizer/compare/3.0.1...3.1.0) (2023-03-06)
|
||||
|
||||
### Added
|
||||
- Argument `should_rename_legacy` for legacy function `detect` and disregard any new arguments without errors (PR #262)
|
||||
|
||||
### Removed
|
||||
- Support for Python 3.6 (PR #260)
|
||||
|
||||
### Changed
|
||||
- Optional speedup provided by mypy/c 1.0.1
|
||||
|
||||
## [3.0.1](https://github.com/Ousret/charset_normalizer/compare/3.0.0...3.0.1) (2022-11-18)
|
||||
|
||||
### Fixed
|
||||
- Multi-bytes cutter/chunk generator did not always cut correctly (PR #233)
|
||||
|
||||
### Changed
|
||||
- Speedup provided by mypy/c 0.990 on Python >= 3.7
|
||||
|
||||
## [3.0.0](https://github.com/Ousret/charset_normalizer/compare/2.1.1...3.0.0) (2022-10-20)
|
||||
|
||||
### Added
|
||||
- Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
|
||||
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
|
||||
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
|
||||
- `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
|
||||
|
||||
### Changed
|
||||
- Build with static metadata using 'build' frontend
|
||||
- Make the language detection stricter
|
||||
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
|
||||
|
||||
### Fixed
|
||||
- CLI with opt --normalize fail when using full path for files
|
||||
- TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
|
||||
- Sphinx warnings when generating the documentation
|
||||
|
||||
### Removed
|
||||
- Coherence detector no longer return 'Simple English' instead return 'English'
|
||||
- Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
|
||||
- Breaking: Method `first()` and `best()` from CharsetMatch
|
||||
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
|
||||
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
|
||||
- Breaking: Top-level function `normalize`
|
||||
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
|
||||
- Support for the backport `unicodedata2`
|
||||
|
||||
## [3.0.0rc1](https://github.com/Ousret/charset_normalizer/compare/3.0.0b2...3.0.0rc1) (2022-10-18)
|
||||
|
||||
### Added
|
||||
- Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results
|
||||
- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES
|
||||
- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio
|
||||
|
||||
### Changed
|
||||
- Build with static metadata using 'build' frontend
|
||||
- Make the language detection stricter
|
||||
|
||||
### Fixed
|
||||
- CLI with opt --normalize fail when using full path for files
|
||||
- TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it
|
||||
|
||||
### Removed
|
||||
- Coherence detector no longer return 'Simple English' instead return 'English'
|
||||
- Coherence detector no longer return 'Classical Chinese' instead return 'Chinese'
|
||||
|
||||
## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-21)
|
||||
|
||||
### Added
|
||||
- `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
|
||||
|
||||
### Removed
|
||||
- Breaking: Method `first()` and `best()` from CharsetMatch
|
||||
- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
|
||||
|
||||
### Fixed
|
||||
- Sphinx warnings when generating the documentation
|
||||
|
||||
## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
|
||||
|
||||
### Changed
|
||||
- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
|
||||
|
||||
### Removed
|
||||
- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
|
||||
- Breaking: Top-level function `normalize`
|
||||
- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
|
||||
- Support for the backport `unicodedata2`
|
||||
|
||||
## [2.1.1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...2.1.1) (2022-08-19)
|
||||
|
||||
### Deprecated
|
||||
- Function `normalize` scheduled for removal in 3.0
|
||||
|
||||
### Changed
|
||||
- Removed useless call to decode in fn is_unprintable (#206)
|
||||
|
||||
### Fixed
|
||||
- Third-party library (i18n xgettext) crashing not recognizing utf_8 (PEP 263) with underscore from [@aleksandernovikov](https://github.com/aleksandernovikov) (#204)
|
||||
|
||||
## [2.1.0](https://github.com/Ousret/charset_normalizer/compare/2.0.12...2.1.0) (2022-06-19)
|
||||
|
||||
### Added
|
||||
- Output the Unicode table version when running the CLI with `--version` (PR #194)
|
||||
|
||||
### Changed
|
||||
- Re-use decoded buffer for single byte character sets from [@nijel](https://github.com/nijel) (PR #175)
|
||||
- Fixing some performance bottlenecks from [@deedy5](https://github.com/deedy5) (PR #183)
|
||||
|
||||
### Fixed
|
||||
- Workaround potential bug in cpython with Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space (PR #175)
|
||||
- CLI default threshold aligned with the API threshold from [@oleksandr-kuzmenko](https://github.com/oleksandr-kuzmenko) (PR #181)
|
||||
|
||||
### Removed
|
||||
- Support for Python 3.5 (PR #192)
|
||||
|
||||
### Deprecated
|
||||
- Use of backport unicodedata from `unicodedata2` as Python is quickly catching up, scheduled for removal in 3.0 (PR #194)
|
||||
|
||||
## [2.0.12](https://github.com/Ousret/charset_normalizer/compare/2.0.11...2.0.12) (2022-02-12)
|
||||
|
||||
### Fixed
|
||||
- ASCII miss-detection on rare cases (PR #170)
|
||||
|
||||
## [2.0.11](https://github.com/Ousret/charset_normalizer/compare/2.0.10...2.0.11) (2022-01-30)
|
||||
|
||||
### Added
|
||||
- Explicit support for Python 3.11 (PR #164)
|
||||
|
||||
### Changed
|
||||
- The logging behavior have been completely reviewed, now using only TRACE and DEBUG levels (PR #163 #165)
|
||||
|
||||
## [2.0.10](https://github.com/Ousret/charset_normalizer/compare/2.0.9...2.0.10) (2022-01-04)
|
||||
|
||||
### Fixed
|
||||
- Fallback match entries might lead to UnicodeDecodeError for large bytes sequence (PR #154)
|
||||
|
||||
### Changed
|
||||
- Skipping the language-detection (CD) on ASCII (PR #155)
|
||||
|
||||
## [2.0.9](https://github.com/Ousret/charset_normalizer/compare/2.0.8...2.0.9) (2021-12-03)
|
||||
|
||||
### Changed
|
||||
- Moderating the logging impact (since 2.0.8) for specific environments (PR #147)
|
||||
|
||||
### Fixed
|
||||
- Wrong logging level applied when setting kwarg `explain` to True (PR #146)
|
||||
|
||||
## [2.0.8](https://github.com/Ousret/charset_normalizer/compare/2.0.7...2.0.8) (2021-11-24)
|
||||
### Changed
|
||||
- Improvement over Vietnamese detection (PR #126)
|
||||
- MD improvement on trailing data and long foreign (non-pure latin) data (PR #124)
|
||||
- Efficiency improvements in cd/alphabet_languages from [@adbar](https://github.com/adbar) (PR #122)
|
||||
- call sum() without an intermediary list following PEP 289 recommendations from [@adbar](https://github.com/adbar) (PR #129)
|
||||
- Code style as refactored by Sourcery-AI (PR #131)
|
||||
- Minor adjustment on the MD around european words (PR #133)
|
||||
- Remove and replace SRTs from assets / tests (PR #139)
|
||||
- Initialize the library logger with a `NullHandler` by default from [@nmaynes](https://github.com/nmaynes) (PR #135)
|
||||
- Setting kwarg `explain` to True will add provisionally (bounded to function lifespan) a specific stream handler (PR #135)
|
||||
|
||||
### Fixed
|
||||
- Fix large (misleading) sequence giving UnicodeDecodeError (PR #137)
|
||||
- Avoid using too insignificant chunk (PR #137)
|
||||
|
||||
### Added
|
||||
- Add and expose function `set_logging_handler` to configure a specific StreamHandler from [@nmaynes](https://github.com/nmaynes) (PR #135)
|
||||
- Add `CHANGELOG.md` entries, format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) (PR #141)
|
||||
|
||||
## [2.0.7](https://github.com/Ousret/charset_normalizer/compare/2.0.6...2.0.7) (2021-10-11)
|
||||
### Added
|
||||
- Add support for Kazakh (Cyrillic) language detection (PR #109)
|
||||
|
||||
### Changed
|
||||
- Further, improve inferring the language from a given single-byte code page (PR #112)
|
||||
- Vainly trying to leverage PEP263 when PEP3120 is not supported (PR #116)
|
||||
- Refactoring for potential performance improvements in loops from [@adbar](https://github.com/adbar) (PR #113)
|
||||
- Various detection improvement (MD+CD) (PR #117)
|
||||
|
||||
### Removed
|
||||
- Remove redundant logging entry about detected language(s) (PR #115)
|
||||
|
||||
### Fixed
|
||||
- Fix a minor inconsistency between Python 3.5 and other versions regarding language detection (PR #117 #102)
|
||||
|
||||
## [2.0.6](https://github.com/Ousret/charset_normalizer/compare/2.0.5...2.0.6) (2021-09-18)
|
||||
### Fixed
|
||||
- Unforeseen regression with the loss of the backward-compatibility with some older minor of Python 3.5.x (PR #100)
|
||||
- Fix CLI crash when using --minimal output in certain cases (PR #103)
|
||||
|
||||
### Changed
|
||||
- Minor improvement to the detection efficiency (less than 1%) (PR #106 #101)
|
||||
|
||||
## [2.0.5](https://github.com/Ousret/charset_normalizer/compare/2.0.4...2.0.5) (2021-09-14)
|
||||
### Changed
|
||||
- The project now comply with: flake8, mypy, isort and black to ensure a better overall quality (PR #81)
|
||||
- The BC-support with v1.x was improved, the old staticmethods are restored (PR #82)
|
||||
- The Unicode detection is slightly improved (PR #93)
|
||||
- Add syntax sugar \_\_bool\_\_ for results CharsetMatches list-container (PR #91)
|
||||
|
||||
### Removed
|
||||
- The project no longer raise warning on tiny content given for detection, will be simply logged as warning instead (PR #92)
|
||||
|
||||
### Fixed
|
||||
- In some rare case, the chunks extractor could cut in the middle of a multi-byte character and could mislead the mess detection (PR #95)
|
||||
- Some rare 'space' characters could trip up the UnprintablePlugin/Mess detection (PR #96)
|
||||
- The MANIFEST.in was not exhaustive (PR #78)
|
||||
|
||||
## [2.0.4](https://github.com/Ousret/charset_normalizer/compare/2.0.3...2.0.4) (2021-07-30)
|
||||
### Fixed
|
||||
- The CLI no longer raise an unexpected exception when no encoding has been found (PR #70)
|
||||
- Fix accessing the 'alphabets' property when the payload contains surrogate characters (PR #68)
|
||||
- The logger could mislead (explain=True) on detected languages and the impact of one MBCS match (PR #72)
|
||||
- Submatch factoring could be wrong in rare edge cases (PR #72)
|
||||
- Multiple files given to the CLI were ignored when publishing results to STDOUT. (After the first path) (PR #72)
|
||||
- Fix line endings from CRLF to LF for certain project files (PR #67)
|
||||
|
||||
### Changed
|
||||
- Adjust the MD to lower the sensitivity, thus improving the global detection reliability (PR #69 #76)
|
||||
- Allow fallback on specified encoding if any (PR #71)
|
||||
|
||||
## [2.0.3](https://github.com/Ousret/charset_normalizer/compare/2.0.2...2.0.3) (2021-07-16)
|
||||
### Changed
|
||||
- Part of the detection mechanism has been improved to be less sensitive, resulting in more accurate detection results. Especially ASCII. (PR #63)
|
||||
- According to the community wishes, the detection will fall back on ASCII or UTF-8 in a last-resort case. (PR #64)
|
||||
|
||||
## [2.0.2](https://github.com/Ousret/charset_normalizer/compare/2.0.1...2.0.2) (2021-07-15)
|
||||
### Fixed
|
||||
- Empty/Too small JSON payload miss-detection fixed. Report from [@tseaver](https://github.com/tseaver) (PR #59)
|
||||
|
||||
### Changed
|
||||
- Don't inject unicodedata2 into sys.modules from [@akx](https://github.com/akx) (PR #57)
|
||||
|
||||
## [2.0.1](https://github.com/Ousret/charset_normalizer/compare/2.0.0...2.0.1) (2021-07-13)
|
||||
### Fixed
|
||||
- Make it work where there isn't a filesystem available, dropping assets frequencies.json. Report from [@sethmlarson](https://github.com/sethmlarson). (PR #55)
|
||||
- Using explain=False permanently disable the verbose output in the current runtime (PR #47)
|
||||
- One log entry (language target preemptive) was not show in logs when using explain=True (PR #47)
|
||||
- Fix undesired exception (ValueError) on getitem of instance CharsetMatches (PR #52)
|
||||
|
||||
### Changed
|
||||
- Public function normalize default args values were not aligned with from_bytes (PR #53)
|
||||
|
||||
### Added
|
||||
- You may now use charset aliases in cp_isolation and cp_exclusion arguments (PR #47)
|
||||
|
||||
## [2.0.0](https://github.com/Ousret/charset_normalizer/compare/1.4.1...2.0.0) (2021-07-02)
|
||||
### Changed
|
||||
- 4x to 5 times faster than the previous 1.4.0 release. At least 2x faster than Chardet.
|
||||
- Accent has been made on UTF-8 detection, should perform rather instantaneous.
|
||||
- The backward compatibility with Chardet has been greatly improved. The legacy detect function returns an identical charset name whenever possible.
|
||||
- The detection mechanism has been slightly improved, now Turkish content is detected correctly (most of the time)
|
||||
- The program has been rewritten to ease the readability and maintainability. (+Using static typing)+
|
||||
- utf_7 detection has been reinstated.
|
||||
|
||||
### Removed
|
||||
- This package no longer require anything when used with Python 3.5 (Dropped cached_property)
|
||||
- Removed support for these languages: Catalan, Esperanto, Kazakh, Baque, Volapük, Azeri, Galician, Nynorsk, Macedonian, and Serbocroatian.
|
||||
- The exception hook on UnicodeDecodeError has been removed.
|
||||
|
||||
### Deprecated
|
||||
- Methods coherence_non_latin, w_counter, chaos_secondary_pass of the class CharsetMatch are now deprecated and scheduled for removal in v3.0
|
||||
|
||||
### Fixed
|
||||
- The CLI output used the relative path of the file(s). Should be absolute.
|
||||
|
||||
## [1.4.1](https://github.com/Ousret/charset_normalizer/compare/1.4.0...1.4.1) (2021-05-28)
|
||||
### Fixed
|
||||
- Logger configuration/usage no longer conflict with others (PR #44)
|
||||
|
||||
## [1.4.0](https://github.com/Ousret/charset_normalizer/compare/1.3.9...1.4.0) (2021-05-21)
|
||||
### Removed
|
||||
- Using standard logging instead of using the package loguru.
|
||||
- Dropping nose test framework in favor of the maintained pytest.
|
||||
- Choose to not use dragonmapper package to help with gibberish Chinese/CJK text.
|
||||
- Require cached_property only for Python 3.5 due to constraint. Dropping for every other interpreter version.
|
||||
- Stop support for UTF-7 that does not contain a SIG.
|
||||
- Dropping PrettyTable, replaced with pure JSON output in CLI.
|
||||
|
||||
### Fixed
|
||||
- BOM marker in a CharsetNormalizerMatch instance could be False in rare cases even if obviously present. Due to the sub-match factoring process.
|
||||
- Not searching properly for the BOM when trying utf32/16 parent codec.
|
||||
|
||||
### Changed
|
||||
- Improving the package final size by compressing frequencies.json.
|
||||
- Huge improvement over the larges payload.
|
||||
|
||||
### Added
|
||||
- CLI now produces JSON consumable output.
|
||||
- Return ASCII if given sequences fit. Given reasonable confidence.
|
||||
|
||||
## [1.3.9](https://github.com/Ousret/charset_normalizer/compare/1.3.8...1.3.9) (2021-05-13)
|
||||
|
||||
### Fixed
|
||||
- In some very rare cases, you may end up getting encode/decode errors due to a bad bytes payload (PR #40)
|
||||
|
||||
## [1.3.8](https://github.com/Ousret/charset_normalizer/compare/1.3.7...1.3.8) (2021-05-12)
|
||||
|
||||
### Fixed
|
||||
- Empty given payload for detection may cause an exception if trying to access the `alphabets` property. (PR #39)
|
||||
|
||||
## [1.3.7](https://github.com/Ousret/charset_normalizer/compare/1.3.6...1.3.7) (2021-05-12)
|
||||
|
||||
### Fixed
|
||||
- The legacy detect function should return UTF-8-SIG if sig is present in the payload. (PR #38)
|
||||
|
||||
## [1.3.6](https://github.com/Ousret/charset_normalizer/compare/1.3.5...1.3.6) (2021-02-09)
|
||||
|
||||
### Changed
|
||||
- Amend the previous release to allow prettytable 2.0 (PR #35)
|
||||
|
||||
## [1.3.5](https://github.com/Ousret/charset_normalizer/compare/1.3.4...1.3.5) (2021-02-08)
|
||||
|
||||
### Fixed
|
||||
- Fix error while using the package with a python pre-release interpreter (PR #33)
|
||||
|
||||
### Changed
|
||||
- Dependencies refactoring, constraints revised.
|
||||
|
||||
### Added
|
||||
- Add python 3.9 and 3.10 to the supported interpreters
|
||||
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2025 TAHRI Ahmed R.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
+35
@@ -0,0 +1,35 @@
|
||||
../../../bin/normalizer,sha256=v_2ayJwejBCYYvh_5WvQ2fwQToLdvVoSW0bGZLaa6NA,291
|
||||
charset_normalizer-3.4.4.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||
charset_normalizer-3.4.4.dist-info/METADATA,sha256=jVuUFBti8dav19YLvWissTihVdF2ozUY4KKMw7jdkBQ,37303
|
||||
charset_normalizer-3.4.4.dist-info/RECORD,,
|
||||
charset_normalizer-3.4.4.dist-info/WHEEL,sha256=BvA_i88wcFUl5ehXLgmhwyDL4XPGrCKn6CTUA9axFDE,190
|
||||
charset_normalizer-3.4.4.dist-info/entry_points.txt,sha256=ADSTKrkXZ3hhdOVFi6DcUEHQRS0xfxDIE_pEz4wLIXA,65
|
||||
charset_normalizer-3.4.4.dist-info/licenses/LICENSE,sha256=bQ1Bv-FwrGx9wkjJpj4lTQ-0WmDVCoJX0K-SxuJJuIc,1071
|
||||
charset_normalizer-3.4.4.dist-info/top_level.txt,sha256=7ASyzePr8_xuZWJsnqJjIBtyV8vhEo0wBCv1MPRRi3Q,19
|
||||
charset_normalizer/__init__.py,sha256=OKRxRv2Zhnqk00tqkN0c1BtJjm165fWXLydE52IKuHc,1590
|
||||
charset_normalizer/__main__.py,sha256=yzYxMR-IhKRHYwcSlavEv8oGdwxsR89mr2X09qXGdps,109
|
||||
charset_normalizer/__pycache__/__init__.cpython-311.pyc,,
|
||||
charset_normalizer/__pycache__/__main__.cpython-311.pyc,,
|
||||
charset_normalizer/__pycache__/api.cpython-311.pyc,,
|
||||
charset_normalizer/__pycache__/cd.cpython-311.pyc,,
|
||||
charset_normalizer/__pycache__/constant.cpython-311.pyc,,
|
||||
charset_normalizer/__pycache__/legacy.cpython-311.pyc,,
|
||||
charset_normalizer/__pycache__/md.cpython-311.pyc,,
|
||||
charset_normalizer/__pycache__/models.cpython-311.pyc,,
|
||||
charset_normalizer/__pycache__/utils.cpython-311.pyc,,
|
||||
charset_normalizer/__pycache__/version.cpython-311.pyc,,
|
||||
charset_normalizer/api.py,sha256=V07i8aVeCD8T2fSia3C-fn0i9t8qQguEBhsqszg32Ns,22668
|
||||
charset_normalizer/cd.py,sha256=WKTo1HDb-H9HfCDc3Bfwq5jzS25Ziy9SE2a74SgTq88,12522
|
||||
charset_normalizer/cli/__init__.py,sha256=D8I86lFk2-py45JvqxniTirSj_sFyE6sjaY_0-G1shc,136
|
||||
charset_normalizer/cli/__main__.py,sha256=dMaXG6IJXRvqq8z2tig7Qb83-BpWTln55ooiku5_uvg,12646
|
||||
charset_normalizer/cli/__pycache__/__init__.cpython-311.pyc,,
|
||||
charset_normalizer/cli/__pycache__/__main__.cpython-311.pyc,,
|
||||
charset_normalizer/constant.py,sha256=7UVY4ldYhmQMHUdgQ_sgZmzcQ0xxYxpBunqSZ-XJZ8U,42713
|
||||
charset_normalizer/legacy.py,sha256=sYBzSpzsRrg_wF4LP536pG64BItw7Tqtc3SMQAHvFLM,2731
|
||||
charset_normalizer/md.cpython-311-x86_64-linux-gnu.so,sha256=vmQPHNPc6Z0rbXxiiMeoCQxfjjh5z8umtRgKpBckk7E,15912
|
||||
charset_normalizer/md.py,sha256=-_oN3h3_X99nkFfqamD3yu45DC_wfk5odH0Tr_CQiXs,20145
|
||||
charset_normalizer/md__mypyc.cpython-311-x86_64-linux-gnu.so,sha256=83X9F2ayDPRojhPPi7Dpm2LS0plQBG4B12R1GSri2lw,282232
|
||||
charset_normalizer/models.py,sha256=lKXhOnIPtiakbK3i__J9wpOfzx3JDTKj7Dn3Rg0VaRI,12394
|
||||
charset_normalizer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
||||
charset_normalizer/utils.py,sha256=sTejPgrdlNsKNucZfJCxJ95lMTLA0ShHLLE3n5wpT9Q,12170
|
||||
charset_normalizer/version.py,sha256=nKE4qBNk5WA4LIJ_yIH_aSDfvtsyizkWMg-PUG-UZVk,115
|
||||
+7
@@ -0,0 +1,7 @@
|
||||
Wheel-Version: 1.0
|
||||
Generator: setuptools (80.9.0)
|
||||
Root-Is-Purelib: false
|
||||
Tag: cp311-cp311-manylinux_2_17_x86_64
|
||||
Tag: cp311-cp311-manylinux2014_x86_64
|
||||
Tag: cp311-cp311-manylinux_2_28_x86_64
|
||||
|
||||
+2
@@ -0,0 +1,2 @@
|
||||
[console_scripts]
|
||||
normalizer = charset_normalizer.cli:cli_detect
|
||||
+21
@@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2025 TAHRI Ahmed R.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
+1
@@ -0,0 +1 @@
|
||||
charset_normalizer
|
||||
@@ -0,0 +1,48 @@
|
||||
"""
|
||||
Charset-Normalizer
|
||||
~~~~~~~~~~~~~~
|
||||
The Real First Universal Charset Detector.
|
||||
A library that helps you read text from an unknown charset encoding.
|
||||
Motivated by chardet, This package is trying to resolve the issue by taking a new approach.
|
||||
All IANA character set names for which the Python core library provides codecs are supported.
|
||||
|
||||
Basic usage:
|
||||
>>> from charset_normalizer import from_bytes
|
||||
>>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
|
||||
>>> best_guess = results.best()
|
||||
>>> str(best_guess)
|
||||
'Bсеки човек има право на образование. Oбразованието!'
|
||||
|
||||
Others methods and usages are available - see the full documentation
|
||||
at <https://github.com/Ousret/charset_normalizer>.
|
||||
:copyright: (c) 2021 by Ahmed TAHRI
|
||||
:license: MIT, see LICENSE for more details.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
|
||||
from .api import from_bytes, from_fp, from_path, is_binary
|
||||
from .legacy import detect
|
||||
from .models import CharsetMatch, CharsetMatches
|
||||
from .utils import set_logging_handler
|
||||
from .version import VERSION, __version__
|
||||
|
||||
__all__ = (
|
||||
"from_fp",
|
||||
"from_path",
|
||||
"from_bytes",
|
||||
"is_binary",
|
||||
"detect",
|
||||
"CharsetMatch",
|
||||
"CharsetMatches",
|
||||
"__version__",
|
||||
"VERSION",
|
||||
"set_logging_handler",
|
||||
)
|
||||
|
||||
# Attach a NullHandler to the top level logger by default
|
||||
# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
|
||||
|
||||
logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())
|
||||
@@ -0,0 +1,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from .cli import cli_detect
|
||||
|
||||
if __name__ == "__main__":
|
||||
cli_detect()
|
||||
@@ -0,0 +1,669 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from os import PathLike
|
||||
from typing import BinaryIO
|
||||
|
||||
from .cd import (
|
||||
coherence_ratio,
|
||||
encoding_languages,
|
||||
mb_encoding_languages,
|
||||
merge_coherence_ratios,
|
||||
)
|
||||
from .constant import IANA_SUPPORTED, TOO_BIG_SEQUENCE, TOO_SMALL_SEQUENCE, TRACE
|
||||
from .md import mess_ratio
|
||||
from .models import CharsetMatch, CharsetMatches
|
||||
from .utils import (
|
||||
any_specified_encoding,
|
||||
cut_sequence_chunks,
|
||||
iana_name,
|
||||
identify_sig_or_bom,
|
||||
is_cp_similar,
|
||||
is_multi_byte_encoding,
|
||||
should_strip_sig_or_bom,
|
||||
)
|
||||
|
||||
logger = logging.getLogger("charset_normalizer")
|
||||
explain_handler = logging.StreamHandler()
|
||||
explain_handler.setFormatter(
|
||||
logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
|
||||
)
|
||||
|
||||
|
||||
def from_bytes(
|
||||
sequences: bytes | bytearray,
|
||||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.2,
|
||||
cp_isolation: list[str] | None = None,
|
||||
cp_exclusion: list[str] | None = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
explain: bool = False,
|
||||
language_threshold: float = 0.1,
|
||||
enable_fallback: bool = True,
|
||||
) -> CharsetMatches:
|
||||
"""
|
||||
Given a raw bytes sequence, return the best possibles charset usable to render str objects.
|
||||
If there is no results, it is a strong indicator that the source is binary/not text.
|
||||
By default, the process will extract 5 blocks of 512o each to assess the mess and coherence of a given sequence.
|
||||
And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.
|
||||
|
||||
The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
|
||||
but never take it for granted. Can improve the performance.
|
||||
|
||||
You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
|
||||
purpose.
|
||||
|
||||
This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
|
||||
By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
|
||||
toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
|
||||
Custom logging format and handler can be set manually.
|
||||
"""
|
||||
|
||||
if not isinstance(sequences, (bytearray, bytes)):
|
||||
raise TypeError(
|
||||
"Expected object of type bytes or bytearray, got: {}".format(
|
||||
type(sequences)
|
||||
)
|
||||
)
|
||||
|
||||
if explain:
|
||||
previous_logger_level: int = logger.level
|
||||
logger.addHandler(explain_handler)
|
||||
logger.setLevel(TRACE)
|
||||
|
||||
length: int = len(sequences)
|
||||
|
||||
if length == 0:
|
||||
logger.debug("Encoding detection on empty bytes, assuming utf_8 intention.")
|
||||
if explain: # Defensive: ensure exit path clean handler
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level or logging.WARNING)
|
||||
return CharsetMatches([CharsetMatch(sequences, "utf_8", 0.0, False, [], "")])
|
||||
|
||||
if cp_isolation is not None:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"cp_isolation is set. use this flag for debugging purpose. "
|
||||
"limited list of encoding allowed : %s.",
|
||||
", ".join(cp_isolation),
|
||||
)
|
||||
cp_isolation = [iana_name(cp, False) for cp in cp_isolation]
|
||||
else:
|
||||
cp_isolation = []
|
||||
|
||||
if cp_exclusion is not None:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"cp_exclusion is set. use this flag for debugging purpose. "
|
||||
"limited list of encoding excluded : %s.",
|
||||
", ".join(cp_exclusion),
|
||||
)
|
||||
cp_exclusion = [iana_name(cp, False) for cp in cp_exclusion]
|
||||
else:
|
||||
cp_exclusion = []
|
||||
|
||||
if length <= (chunk_size * steps):
|
||||
logger.log(
|
||||
TRACE,
|
||||
"override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.",
|
||||
steps,
|
||||
chunk_size,
|
||||
length,
|
||||
)
|
||||
steps = 1
|
||||
chunk_size = length
|
||||
|
||||
if steps > 1 and length / steps < chunk_size:
|
||||
chunk_size = int(length / steps)
|
||||
|
||||
is_too_small_sequence: bool = len(sequences) < TOO_SMALL_SEQUENCE
|
||||
is_too_large_sequence: bool = len(sequences) >= TOO_BIG_SEQUENCE
|
||||
|
||||
if is_too_small_sequence:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Trying to detect encoding from a tiny portion of ({}) byte(s).".format(
|
||||
length
|
||||
),
|
||||
)
|
||||
elif is_too_large_sequence:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Using lazy str decoding because the payload is quite large, ({}) byte(s).".format(
|
||||
length
|
||||
),
|
||||
)
|
||||
|
||||
prioritized_encodings: list[str] = []
|
||||
|
||||
specified_encoding: str | None = (
|
||||
any_specified_encoding(sequences) if preemptive_behaviour else None
|
||||
)
|
||||
|
||||
if specified_encoding is not None:
|
||||
prioritized_encodings.append(specified_encoding)
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Detected declarative mark in sequence. Priority +1 given for %s.",
|
||||
specified_encoding,
|
||||
)
|
||||
|
||||
tested: set[str] = set()
|
||||
tested_but_hard_failure: list[str] = []
|
||||
tested_but_soft_failure: list[str] = []
|
||||
|
||||
fallback_ascii: CharsetMatch | None = None
|
||||
fallback_u8: CharsetMatch | None = None
|
||||
fallback_specified: CharsetMatch | None = None
|
||||
|
||||
results: CharsetMatches = CharsetMatches()
|
||||
|
||||
early_stop_results: CharsetMatches = CharsetMatches()
|
||||
|
||||
sig_encoding, sig_payload = identify_sig_or_bom(sequences)
|
||||
|
||||
if sig_encoding is not None:
|
||||
prioritized_encodings.append(sig_encoding)
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Detected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.",
|
||||
len(sig_payload),
|
||||
sig_encoding,
|
||||
)
|
||||
|
||||
prioritized_encodings.append("ascii")
|
||||
|
||||
if "utf_8" not in prioritized_encodings:
|
||||
prioritized_encodings.append("utf_8")
|
||||
|
||||
for encoding_iana in prioritized_encodings + IANA_SUPPORTED:
|
||||
if cp_isolation and encoding_iana not in cp_isolation:
|
||||
continue
|
||||
|
||||
if cp_exclusion and encoding_iana in cp_exclusion:
|
||||
continue
|
||||
|
||||
if encoding_iana in tested:
|
||||
continue
|
||||
|
||||
tested.add(encoding_iana)
|
||||
|
||||
decoded_payload: str | None = None
|
||||
bom_or_sig_available: bool = sig_encoding == encoding_iana
|
||||
strip_sig_or_bom: bool = bom_or_sig_available and should_strip_sig_or_bom(
|
||||
encoding_iana
|
||||
)
|
||||
|
||||
if encoding_iana in {"utf_16", "utf_32"} and not bom_or_sig_available:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.",
|
||||
encoding_iana,
|
||||
)
|
||||
continue
|
||||
if encoding_iana in {"utf_7"} and not bom_or_sig_available:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
|
||||
encoding_iana,
|
||||
)
|
||||
continue
|
||||
|
||||
try:
|
||||
is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Encoding %s does not provide an IncrementalDecoder",
|
||||
encoding_iana,
|
||||
)
|
||||
continue
|
||||
|
||||
try:
|
||||
if is_too_large_sequence and is_multi_byte_decoder is False:
|
||||
str(
|
||||
(
|
||||
sequences[: int(50e4)]
|
||||
if strip_sig_or_bom is False
|
||||
else sequences[len(sig_payload) : int(50e4)]
|
||||
),
|
||||
encoding=encoding_iana,
|
||||
)
|
||||
else:
|
||||
decoded_payload = str(
|
||||
(
|
||||
sequences
|
||||
if strip_sig_or_bom is False
|
||||
else sequences[len(sig_payload) :]
|
||||
),
|
||||
encoding=encoding_iana,
|
||||
)
|
||||
except (UnicodeDecodeError, LookupError) as e:
|
||||
if not isinstance(e, LookupError):
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Code page %s does not fit given bytes sequence at ALL. %s",
|
||||
encoding_iana,
|
||||
str(e),
|
||||
)
|
||||
tested_but_hard_failure.append(encoding_iana)
|
||||
continue
|
||||
|
||||
similar_soft_failure_test: bool = False
|
||||
|
||||
for encoding_soft_failed in tested_but_soft_failure:
|
||||
if is_cp_similar(encoding_iana, encoding_soft_failed):
|
||||
similar_soft_failure_test = True
|
||||
break
|
||||
|
||||
if similar_soft_failure_test:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"%s is deemed too similar to code page %s and was consider unsuited already. Continuing!",
|
||||
encoding_iana,
|
||||
encoding_soft_failed,
|
||||
)
|
||||
continue
|
||||
|
||||
r_ = range(
|
||||
0 if not bom_or_sig_available else len(sig_payload),
|
||||
length,
|
||||
int(length / steps),
|
||||
)
|
||||
|
||||
multi_byte_bonus: bool = (
|
||||
is_multi_byte_decoder
|
||||
and decoded_payload is not None
|
||||
and len(decoded_payload) < length
|
||||
)
|
||||
|
||||
if multi_byte_bonus:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Code page %s is a multi byte encoding table and it appear that at least one character "
|
||||
"was encoded using n-bytes.",
|
||||
encoding_iana,
|
||||
)
|
||||
|
||||
max_chunk_gave_up: int = int(len(r_) / 4)
|
||||
|
||||
max_chunk_gave_up = max(max_chunk_gave_up, 2)
|
||||
early_stop_count: int = 0
|
||||
lazy_str_hard_failure = False
|
||||
|
||||
md_chunks: list[str] = []
|
||||
md_ratios = []
|
||||
|
||||
try:
|
||||
for chunk in cut_sequence_chunks(
|
||||
sequences,
|
||||
encoding_iana,
|
||||
r_,
|
||||
chunk_size,
|
||||
bom_or_sig_available,
|
||||
strip_sig_or_bom,
|
||||
sig_payload,
|
||||
is_multi_byte_decoder,
|
||||
decoded_payload,
|
||||
):
|
||||
md_chunks.append(chunk)
|
||||
|
||||
md_ratios.append(
|
||||
mess_ratio(
|
||||
chunk,
|
||||
threshold,
|
||||
explain is True and 1 <= len(cp_isolation) <= 2,
|
||||
)
|
||||
)
|
||||
|
||||
if md_ratios[-1] >= threshold:
|
||||
early_stop_count += 1
|
||||
|
||||
if (early_stop_count >= max_chunk_gave_up) or (
|
||||
bom_or_sig_available and strip_sig_or_bom is False
|
||||
):
|
||||
break
|
||||
except (
|
||||
UnicodeDecodeError
|
||||
) as e: # Lazy str loading may have missed something there
|
||||
logger.log(
|
||||
TRACE,
|
||||
"LazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %s",
|
||||
encoding_iana,
|
||||
str(e),
|
||||
)
|
||||
early_stop_count = max_chunk_gave_up
|
||||
lazy_str_hard_failure = True
|
||||
|
||||
# We might want to check the sequence again with the whole content
|
||||
# Only if initial MD tests passes
|
||||
if (
|
||||
not lazy_str_hard_failure
|
||||
and is_too_large_sequence
|
||||
and not is_multi_byte_decoder
|
||||
):
|
||||
try:
|
||||
sequences[int(50e3) :].decode(encoding_iana, errors="strict")
|
||||
except UnicodeDecodeError as e:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %s",
|
||||
encoding_iana,
|
||||
str(e),
|
||||
)
|
||||
tested_but_hard_failure.append(encoding_iana)
|
||||
continue
|
||||
|
||||
mean_mess_ratio: float = sum(md_ratios) / len(md_ratios) if md_ratios else 0.0
|
||||
if mean_mess_ratio >= threshold or early_stop_count >= max_chunk_gave_up:
|
||||
tested_but_soft_failure.append(encoding_iana)
|
||||
logger.log(
|
||||
TRACE,
|
||||
"%s was excluded because of initial chaos probing. Gave up %i time(s). "
|
||||
"Computed mean chaos is %f %%.",
|
||||
encoding_iana,
|
||||
early_stop_count,
|
||||
round(mean_mess_ratio * 100, ndigits=3),
|
||||
)
|
||||
# Preparing those fallbacks in case we got nothing.
|
||||
if (
|
||||
enable_fallback
|
||||
and encoding_iana
|
||||
in ["ascii", "utf_8", specified_encoding, "utf_16", "utf_32"]
|
||||
and not lazy_str_hard_failure
|
||||
):
|
||||
fallback_entry = CharsetMatch(
|
||||
sequences,
|
||||
encoding_iana,
|
||||
threshold,
|
||||
bom_or_sig_available,
|
||||
[],
|
||||
decoded_payload,
|
||||
preemptive_declaration=specified_encoding,
|
||||
)
|
||||
if encoding_iana == specified_encoding:
|
||||
fallback_specified = fallback_entry
|
||||
elif encoding_iana == "ascii":
|
||||
fallback_ascii = fallback_entry
|
||||
else:
|
||||
fallback_u8 = fallback_entry
|
||||
continue
|
||||
|
||||
logger.log(
|
||||
TRACE,
|
||||
"%s passed initial chaos probing. Mean measured chaos is %f %%",
|
||||
encoding_iana,
|
||||
round(mean_mess_ratio * 100, ndigits=3),
|
||||
)
|
||||
|
||||
if not is_multi_byte_decoder:
|
||||
target_languages: list[str] = encoding_languages(encoding_iana)
|
||||
else:
|
||||
target_languages = mb_encoding_languages(encoding_iana)
|
||||
|
||||
if target_languages:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"{} should target any language(s) of {}".format(
|
||||
encoding_iana, str(target_languages)
|
||||
),
|
||||
)
|
||||
|
||||
cd_ratios = []
|
||||
|
||||
# We shall skip the CD when its about ASCII
|
||||
# Most of the time its not relevant to run "language-detection" on it.
|
||||
if encoding_iana != "ascii":
|
||||
for chunk in md_chunks:
|
||||
chunk_languages = coherence_ratio(
|
||||
chunk,
|
||||
language_threshold,
|
||||
",".join(target_languages) if target_languages else None,
|
||||
)
|
||||
|
||||
cd_ratios.append(chunk_languages)
|
||||
|
||||
cd_ratios_merged = merge_coherence_ratios(cd_ratios)
|
||||
|
||||
if cd_ratios_merged:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"We detected language {} using {}".format(
|
||||
cd_ratios_merged, encoding_iana
|
||||
),
|
||||
)
|
||||
|
||||
current_match = CharsetMatch(
|
||||
sequences,
|
||||
encoding_iana,
|
||||
mean_mess_ratio,
|
||||
bom_or_sig_available,
|
||||
cd_ratios_merged,
|
||||
(
|
||||
decoded_payload
|
||||
if (
|
||||
is_too_large_sequence is False
|
||||
or encoding_iana in [specified_encoding, "ascii", "utf_8"]
|
||||
)
|
||||
else None
|
||||
),
|
||||
preemptive_declaration=specified_encoding,
|
||||
)
|
||||
|
||||
results.append(current_match)
|
||||
|
||||
if (
|
||||
encoding_iana in [specified_encoding, "ascii", "utf_8"]
|
||||
and mean_mess_ratio < 0.1
|
||||
):
|
||||
# If md says nothing to worry about, then... stop immediately!
|
||||
if mean_mess_ratio == 0.0:
|
||||
logger.debug(
|
||||
"Encoding detection: %s is most likely the one.",
|
||||
current_match.encoding,
|
||||
)
|
||||
if explain: # Defensive: ensure exit path clean handler
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level)
|
||||
return CharsetMatches([current_match])
|
||||
|
||||
early_stop_results.append(current_match)
|
||||
|
||||
if (
|
||||
len(early_stop_results)
|
||||
and (specified_encoding is None or specified_encoding in tested)
|
||||
and "ascii" in tested
|
||||
and "utf_8" in tested
|
||||
):
|
||||
probable_result: CharsetMatch = early_stop_results.best() # type: ignore[assignment]
|
||||
logger.debug(
|
||||
"Encoding detection: %s is most likely the one.",
|
||||
probable_result.encoding,
|
||||
)
|
||||
if explain: # Defensive: ensure exit path clean handler
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level)
|
||||
|
||||
return CharsetMatches([probable_result])
|
||||
|
||||
if encoding_iana == sig_encoding:
|
||||
logger.debug(
|
||||
"Encoding detection: %s is most likely the one as we detected a BOM or SIG within "
|
||||
"the beginning of the sequence.",
|
||||
encoding_iana,
|
||||
)
|
||||
if explain: # Defensive: ensure exit path clean handler
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level)
|
||||
return CharsetMatches([results[encoding_iana]])
|
||||
|
||||
if len(results) == 0:
|
||||
if fallback_u8 or fallback_ascii or fallback_specified:
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Nothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.",
|
||||
)
|
||||
|
||||
if fallback_specified:
|
||||
logger.debug(
|
||||
"Encoding detection: %s will be used as a fallback match",
|
||||
fallback_specified.encoding,
|
||||
)
|
||||
results.append(fallback_specified)
|
||||
elif (
|
||||
(fallback_u8 and fallback_ascii is None)
|
||||
or (
|
||||
fallback_u8
|
||||
and fallback_ascii
|
||||
and fallback_u8.fingerprint != fallback_ascii.fingerprint
|
||||
)
|
||||
or (fallback_u8 is not None)
|
||||
):
|
||||
logger.debug("Encoding detection: utf_8 will be used as a fallback match")
|
||||
results.append(fallback_u8)
|
||||
elif fallback_ascii:
|
||||
logger.debug("Encoding detection: ascii will be used as a fallback match")
|
||||
results.append(fallback_ascii)
|
||||
|
||||
if results:
|
||||
logger.debug(
|
||||
"Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.",
|
||||
results.best().encoding, # type: ignore
|
||||
len(results) - 1,
|
||||
)
|
||||
else:
|
||||
logger.debug("Encoding detection: Unable to determine any suitable charset.")
|
||||
|
||||
if explain:
|
||||
logger.removeHandler(explain_handler)
|
||||
logger.setLevel(previous_logger_level)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def from_fp(
|
||||
fp: BinaryIO,
|
||||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.20,
|
||||
cp_isolation: list[str] | None = None,
|
||||
cp_exclusion: list[str] | None = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
explain: bool = False,
|
||||
language_threshold: float = 0.1,
|
||||
enable_fallback: bool = True,
|
||||
) -> CharsetMatches:
|
||||
"""
|
||||
Same thing than the function from_bytes but using a file pointer that is already ready.
|
||||
Will not close the file pointer.
|
||||
"""
|
||||
return from_bytes(
|
||||
fp.read(),
|
||||
steps,
|
||||
chunk_size,
|
||||
threshold,
|
||||
cp_isolation,
|
||||
cp_exclusion,
|
||||
preemptive_behaviour,
|
||||
explain,
|
||||
language_threshold,
|
||||
enable_fallback,
|
||||
)
|
||||
|
||||
|
||||
def from_path(
|
||||
path: str | bytes | PathLike, # type: ignore[type-arg]
|
||||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.20,
|
||||
cp_isolation: list[str] | None = None,
|
||||
cp_exclusion: list[str] | None = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
explain: bool = False,
|
||||
language_threshold: float = 0.1,
|
||||
enable_fallback: bool = True,
|
||||
) -> CharsetMatches:
|
||||
"""
|
||||
Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
|
||||
Can raise IOError.
|
||||
"""
|
||||
with open(path, "rb") as fp:
|
||||
return from_fp(
|
||||
fp,
|
||||
steps,
|
||||
chunk_size,
|
||||
threshold,
|
||||
cp_isolation,
|
||||
cp_exclusion,
|
||||
preemptive_behaviour,
|
||||
explain,
|
||||
language_threshold,
|
||||
enable_fallback,
|
||||
)
|
||||
|
||||
|
||||
def is_binary(
|
||||
fp_or_path_or_payload: PathLike | str | BinaryIO | bytes, # type: ignore[type-arg]
|
||||
steps: int = 5,
|
||||
chunk_size: int = 512,
|
||||
threshold: float = 0.20,
|
||||
cp_isolation: list[str] | None = None,
|
||||
cp_exclusion: list[str] | None = None,
|
||||
preemptive_behaviour: bool = True,
|
||||
explain: bool = False,
|
||||
language_threshold: float = 0.1,
|
||||
enable_fallback: bool = False,
|
||||
) -> bool:
|
||||
"""
|
||||
Detect if the given input (file, bytes, or path) points to a binary file. aka. not a string.
|
||||
Based on the same main heuristic algorithms and default kwargs at the sole exception that fallbacks match
|
||||
are disabled to be stricter around ASCII-compatible but unlikely to be a string.
|
||||
"""
|
||||
if isinstance(fp_or_path_or_payload, (str, PathLike)):
|
||||
guesses = from_path(
|
||||
fp_or_path_or_payload,
|
||||
steps=steps,
|
||||
chunk_size=chunk_size,
|
||||
threshold=threshold,
|
||||
cp_isolation=cp_isolation,
|
||||
cp_exclusion=cp_exclusion,
|
||||
preemptive_behaviour=preemptive_behaviour,
|
||||
explain=explain,
|
||||
language_threshold=language_threshold,
|
||||
enable_fallback=enable_fallback,
|
||||
)
|
||||
elif isinstance(
|
||||
fp_or_path_or_payload,
|
||||
(
|
||||
bytes,
|
||||
bytearray,
|
||||
),
|
||||
):
|
||||
guesses = from_bytes(
|
||||
fp_or_path_or_payload,
|
||||
steps=steps,
|
||||
chunk_size=chunk_size,
|
||||
threshold=threshold,
|
||||
cp_isolation=cp_isolation,
|
||||
cp_exclusion=cp_exclusion,
|
||||
preemptive_behaviour=preemptive_behaviour,
|
||||
explain=explain,
|
||||
language_threshold=language_threshold,
|
||||
enable_fallback=enable_fallback,
|
||||
)
|
||||
else:
|
||||
guesses = from_fp(
|
||||
fp_or_path_or_payload,
|
||||
steps=steps,
|
||||
chunk_size=chunk_size,
|
||||
threshold=threshold,
|
||||
cp_isolation=cp_isolation,
|
||||
cp_exclusion=cp_exclusion,
|
||||
preemptive_behaviour=preemptive_behaviour,
|
||||
explain=explain,
|
||||
language_threshold=language_threshold,
|
||||
enable_fallback=enable_fallback,
|
||||
)
|
||||
|
||||
return not guesses
|
||||
@@ -0,0 +1,395 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
from codecs import IncrementalDecoder
|
||||
from collections import Counter
|
||||
from functools import lru_cache
|
||||
from typing import Counter as TypeCounter
|
||||
|
||||
from .constant import (
|
||||
FREQUENCIES,
|
||||
KO_NAMES,
|
||||
LANGUAGE_SUPPORTED_COUNT,
|
||||
TOO_SMALL_SEQUENCE,
|
||||
ZH_NAMES,
|
||||
)
|
||||
from .md import is_suspiciously_successive_range
|
||||
from .models import CoherenceMatches
|
||||
from .utils import (
|
||||
is_accentuated,
|
||||
is_latin,
|
||||
is_multi_byte_encoding,
|
||||
is_unicode_range_secondary,
|
||||
unicode_range,
|
||||
)
|
||||
|
||||
|
||||
def encoding_unicode_range(iana_name: str) -> list[str]:
|
||||
"""
|
||||
Return associated unicode ranges in a single byte code page.
|
||||
"""
|
||||
if is_multi_byte_encoding(iana_name):
|
||||
raise OSError("Function not supported on multi-byte code page")
|
||||
|
||||
decoder = importlib.import_module(f"encodings.{iana_name}").IncrementalDecoder
|
||||
|
||||
p: IncrementalDecoder = decoder(errors="ignore")
|
||||
seen_ranges: dict[str, int] = {}
|
||||
character_count: int = 0
|
||||
|
||||
for i in range(0x40, 0xFF):
|
||||
chunk: str = p.decode(bytes([i]))
|
||||
|
||||
if chunk:
|
||||
character_range: str | None = unicode_range(chunk)
|
||||
|
||||
if character_range is None:
|
||||
continue
|
||||
|
||||
if is_unicode_range_secondary(character_range) is False:
|
||||
if character_range not in seen_ranges:
|
||||
seen_ranges[character_range] = 0
|
||||
seen_ranges[character_range] += 1
|
||||
character_count += 1
|
||||
|
||||
return sorted(
|
||||
[
|
||||
character_range
|
||||
for character_range in seen_ranges
|
||||
if seen_ranges[character_range] / character_count >= 0.15
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def unicode_range_languages(primary_range: str) -> list[str]:
|
||||
"""
|
||||
Return inferred languages used with a unicode range.
|
||||
"""
|
||||
languages: list[str] = []
|
||||
|
||||
for language, characters in FREQUENCIES.items():
|
||||
for character in characters:
|
||||
if unicode_range(character) == primary_range:
|
||||
languages.append(language)
|
||||
break
|
||||
|
||||
return languages
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def encoding_languages(iana_name: str) -> list[str]:
|
||||
"""
|
||||
Single-byte encoding language association. Some code page are heavily linked to particular language(s).
|
||||
This function does the correspondence.
|
||||
"""
|
||||
unicode_ranges: list[str] = encoding_unicode_range(iana_name)
|
||||
primary_range: str | None = None
|
||||
|
||||
for specified_range in unicode_ranges:
|
||||
if "Latin" not in specified_range:
|
||||
primary_range = specified_range
|
||||
break
|
||||
|
||||
if primary_range is None:
|
||||
return ["Latin Based"]
|
||||
|
||||
return unicode_range_languages(primary_range)
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def mb_encoding_languages(iana_name: str) -> list[str]:
|
||||
"""
|
||||
Multi-byte encoding language association. Some code page are heavily linked to particular language(s).
|
||||
This function does the correspondence.
|
||||
"""
|
||||
if (
|
||||
iana_name.startswith("shift_")
|
||||
or iana_name.startswith("iso2022_jp")
|
||||
or iana_name.startswith("euc_j")
|
||||
or iana_name == "cp932"
|
||||
):
|
||||
return ["Japanese"]
|
||||
if iana_name.startswith("gb") or iana_name in ZH_NAMES:
|
||||
return ["Chinese"]
|
||||
if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES:
|
||||
return ["Korean"]
|
||||
|
||||
return []
|
||||
|
||||
|
||||
@lru_cache(maxsize=LANGUAGE_SUPPORTED_COUNT)
|
||||
def get_target_features(language: str) -> tuple[bool, bool]:
|
||||
"""
|
||||
Determine main aspects from a supported language if it contains accents and if is pure Latin.
|
||||
"""
|
||||
target_have_accents: bool = False
|
||||
target_pure_latin: bool = True
|
||||
|
||||
for character in FREQUENCIES[language]:
|
||||
if not target_have_accents and is_accentuated(character):
|
||||
target_have_accents = True
|
||||
if target_pure_latin and is_latin(character) is False:
|
||||
target_pure_latin = False
|
||||
|
||||
return target_have_accents, target_pure_latin
|
||||
|
||||
|
||||
def alphabet_languages(
|
||||
characters: list[str], ignore_non_latin: bool = False
|
||||
) -> list[str]:
|
||||
"""
|
||||
Return associated languages associated to given characters.
|
||||
"""
|
||||
languages: list[tuple[str, float]] = []
|
||||
|
||||
source_have_accents = any(is_accentuated(character) for character in characters)
|
||||
|
||||
for language, language_characters in FREQUENCIES.items():
|
||||
target_have_accents, target_pure_latin = get_target_features(language)
|
||||
|
||||
if ignore_non_latin and target_pure_latin is False:
|
||||
continue
|
||||
|
||||
if target_have_accents is False and source_have_accents:
|
||||
continue
|
||||
|
||||
character_count: int = len(language_characters)
|
||||
|
||||
character_match_count: int = len(
|
||||
[c for c in language_characters if c in characters]
|
||||
)
|
||||
|
||||
ratio: float = character_match_count / character_count
|
||||
|
||||
if ratio >= 0.2:
|
||||
languages.append((language, ratio))
|
||||
|
||||
languages = sorted(languages, key=lambda x: x[1], reverse=True)
|
||||
|
||||
return [compatible_language[0] for compatible_language in languages]
|
||||
|
||||
|
||||
def characters_popularity_compare(
|
||||
language: str, ordered_characters: list[str]
|
||||
) -> float:
|
||||
"""
|
||||
Determine if a ordered characters list (by occurrence from most appearance to rarest) match a particular language.
|
||||
The result is a ratio between 0. (absolutely no correspondence) and 1. (near perfect fit).
|
||||
Beware that is function is not strict on the match in order to ease the detection. (Meaning close match is 1.)
|
||||
"""
|
||||
if language not in FREQUENCIES:
|
||||
raise ValueError(f"{language} not available")
|
||||
|
||||
character_approved_count: int = 0
|
||||
FREQUENCIES_language_set = set(FREQUENCIES[language])
|
||||
|
||||
ordered_characters_count: int = len(ordered_characters)
|
||||
target_language_characters_count: int = len(FREQUENCIES[language])
|
||||
|
||||
large_alphabet: bool = target_language_characters_count > 26
|
||||
|
||||
for character, character_rank in zip(
|
||||
ordered_characters, range(0, ordered_characters_count)
|
||||
):
|
||||
if character not in FREQUENCIES_language_set:
|
||||
continue
|
||||
|
||||
character_rank_in_language: int = FREQUENCIES[language].index(character)
|
||||
expected_projection_ratio: float = (
|
||||
target_language_characters_count / ordered_characters_count
|
||||
)
|
||||
character_rank_projection: int = int(character_rank * expected_projection_ratio)
|
||||
|
||||
if (
|
||||
large_alphabet is False
|
||||
and abs(character_rank_projection - character_rank_in_language) > 4
|
||||
):
|
||||
continue
|
||||
|
||||
if (
|
||||
large_alphabet is True
|
||||
and abs(character_rank_projection - character_rank_in_language)
|
||||
< target_language_characters_count / 3
|
||||
):
|
||||
character_approved_count += 1
|
||||
continue
|
||||
|
||||
characters_before_source: list[str] = FREQUENCIES[language][
|
||||
0:character_rank_in_language
|
||||
]
|
||||
characters_after_source: list[str] = FREQUENCIES[language][
|
||||
character_rank_in_language:
|
||||
]
|
||||
characters_before: list[str] = ordered_characters[0:character_rank]
|
||||
characters_after: list[str] = ordered_characters[character_rank:]
|
||||
|
||||
before_match_count: int = len(
|
||||
set(characters_before) & set(characters_before_source)
|
||||
)
|
||||
|
||||
after_match_count: int = len(
|
||||
set(characters_after) & set(characters_after_source)
|
||||
)
|
||||
|
||||
if len(characters_before_source) == 0 and before_match_count <= 4:
|
||||
character_approved_count += 1
|
||||
continue
|
||||
|
||||
if len(characters_after_source) == 0 and after_match_count <= 4:
|
||||
character_approved_count += 1
|
||||
continue
|
||||
|
||||
if (
|
||||
before_match_count / len(characters_before_source) >= 0.4
|
||||
or after_match_count / len(characters_after_source) >= 0.4
|
||||
):
|
||||
character_approved_count += 1
|
||||
continue
|
||||
|
||||
return character_approved_count / len(ordered_characters)
|
||||
|
||||
|
||||
def alpha_unicode_split(decoded_sequence: str) -> list[str]:
|
||||
"""
|
||||
Given a decoded text sequence, return a list of str. Unicode range / alphabet separation.
|
||||
Ex. a text containing English/Latin with a bit a Hebrew will return two items in the resulting list;
|
||||
One containing the latin letters and the other hebrew.
|
||||
"""
|
||||
layers: dict[str, str] = {}
|
||||
|
||||
for character in decoded_sequence:
|
||||
if character.isalpha() is False:
|
||||
continue
|
||||
|
||||
character_range: str | None = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
continue
|
||||
|
||||
layer_target_range: str | None = None
|
||||
|
||||
for discovered_range in layers:
|
||||
if (
|
||||
is_suspiciously_successive_range(discovered_range, character_range)
|
||||
is False
|
||||
):
|
||||
layer_target_range = discovered_range
|
||||
break
|
||||
|
||||
if layer_target_range is None:
|
||||
layer_target_range = character_range
|
||||
|
||||
if layer_target_range not in layers:
|
||||
layers[layer_target_range] = character.lower()
|
||||
continue
|
||||
|
||||
layers[layer_target_range] += character.lower()
|
||||
|
||||
return list(layers.values())
|
||||
|
||||
|
||||
def merge_coherence_ratios(results: list[CoherenceMatches]) -> CoherenceMatches:
|
||||
"""
|
||||
This function merge results previously given by the function coherence_ratio.
|
||||
The return type is the same as coherence_ratio.
|
||||
"""
|
||||
per_language_ratios: dict[str, list[float]] = {}
|
||||
for result in results:
|
||||
for sub_result in result:
|
||||
language, ratio = sub_result
|
||||
if language not in per_language_ratios:
|
||||
per_language_ratios[language] = [ratio]
|
||||
continue
|
||||
per_language_ratios[language].append(ratio)
|
||||
|
||||
merge = [
|
||||
(
|
||||
language,
|
||||
round(
|
||||
sum(per_language_ratios[language]) / len(per_language_ratios[language]),
|
||||
4,
|
||||
),
|
||||
)
|
||||
for language in per_language_ratios
|
||||
]
|
||||
|
||||
return sorted(merge, key=lambda x: x[1], reverse=True)
|
||||
|
||||
|
||||
def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches:
|
||||
"""
|
||||
We shall NOT return "English—" in CoherenceMatches because it is an alternative
|
||||
of "English". This function only keeps the best match and remove the em-dash in it.
|
||||
"""
|
||||
index_results: dict[str, list[float]] = dict()
|
||||
|
||||
for result in results:
|
||||
language, ratio = result
|
||||
no_em_name: str = language.replace("—", "")
|
||||
|
||||
if no_em_name not in index_results:
|
||||
index_results[no_em_name] = []
|
||||
|
||||
index_results[no_em_name].append(ratio)
|
||||
|
||||
if any(len(index_results[e]) > 1 for e in index_results):
|
||||
filtered_results: CoherenceMatches = []
|
||||
|
||||
for language in index_results:
|
||||
filtered_results.append((language, max(index_results[language])))
|
||||
|
||||
return filtered_results
|
||||
|
||||
return results
|
||||
|
||||
|
||||
@lru_cache(maxsize=2048)
|
||||
def coherence_ratio(
|
||||
decoded_sequence: str, threshold: float = 0.1, lg_inclusion: str | None = None
|
||||
) -> CoherenceMatches:
|
||||
"""
|
||||
Detect ANY language that can be identified in given sequence. The sequence will be analysed by layers.
|
||||
A layer = Character extraction by alphabets/ranges.
|
||||
"""
|
||||
|
||||
results: list[tuple[str, float]] = []
|
||||
ignore_non_latin: bool = False
|
||||
|
||||
sufficient_match_count: int = 0
|
||||
|
||||
lg_inclusion_list = lg_inclusion.split(",") if lg_inclusion is not None else []
|
||||
if "Latin Based" in lg_inclusion_list:
|
||||
ignore_non_latin = True
|
||||
lg_inclusion_list.remove("Latin Based")
|
||||
|
||||
for layer in alpha_unicode_split(decoded_sequence):
|
||||
sequence_frequencies: TypeCounter[str] = Counter(layer)
|
||||
most_common = sequence_frequencies.most_common()
|
||||
|
||||
character_count: int = sum(o for c, o in most_common)
|
||||
|
||||
if character_count <= TOO_SMALL_SEQUENCE:
|
||||
continue
|
||||
|
||||
popular_character_ordered: list[str] = [c for c, o in most_common]
|
||||
|
||||
for language in lg_inclusion_list or alphabet_languages(
|
||||
popular_character_ordered, ignore_non_latin
|
||||
):
|
||||
ratio: float = characters_popularity_compare(
|
||||
language, popular_character_ordered
|
||||
)
|
||||
|
||||
if ratio < threshold:
|
||||
continue
|
||||
elif ratio >= 0.8:
|
||||
sufficient_match_count += 1
|
||||
|
||||
results.append((language, round(ratio, 4)))
|
||||
|
||||
if sufficient_match_count >= 3:
|
||||
break
|
||||
|
||||
return sorted(
|
||||
filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True
|
||||
)
|
||||
+8
@@ -0,0 +1,8 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from .__main__ import cli_detect, query_yes_no
|
||||
|
||||
__all__ = (
|
||||
"cli_detect",
|
||||
"query_yes_no",
|
||||
)
|
||||
+381
@@ -0,0 +1,381 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
import typing
|
||||
from json import dumps
|
||||
from os.path import abspath, basename, dirname, join, realpath
|
||||
from platform import python_version
|
||||
from unicodedata import unidata_version
|
||||
|
||||
import charset_normalizer.md as md_module
|
||||
from charset_normalizer import from_fp
|
||||
from charset_normalizer.models import CliDetectionResult
|
||||
from charset_normalizer.version import __version__
|
||||
|
||||
|
||||
def query_yes_no(question: str, default: str = "yes") -> bool:
|
||||
"""Ask a yes/no question via input() and return their answer.
|
||||
|
||||
"question" is a string that is presented to the user.
|
||||
"default" is the presumed answer if the user just hits <Enter>.
|
||||
It must be "yes" (the default), "no" or None (meaning
|
||||
an answer is required of the user).
|
||||
|
||||
The "answer" return value is True for "yes" or False for "no".
|
||||
|
||||
Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
|
||||
"""
|
||||
valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
|
||||
if default is None:
|
||||
prompt = " [y/n] "
|
||||
elif default == "yes":
|
||||
prompt = " [Y/n] "
|
||||
elif default == "no":
|
||||
prompt = " [y/N] "
|
||||
else:
|
||||
raise ValueError("invalid default answer: '%s'" % default)
|
||||
|
||||
while True:
|
||||
sys.stdout.write(question + prompt)
|
||||
choice = input().lower()
|
||||
if default is not None and choice == "":
|
||||
return valid[default]
|
||||
elif choice in valid:
|
||||
return valid[choice]
|
||||
else:
|
||||
sys.stdout.write("Please respond with 'yes' or 'no' (or 'y' or 'n').\n")
|
||||
|
||||
|
||||
class FileType:
|
||||
"""Factory for creating file object types
|
||||
|
||||
Instances of FileType are typically passed as type= arguments to the
|
||||
ArgumentParser add_argument() method.
|
||||
|
||||
Keyword Arguments:
|
||||
- mode -- A string indicating how the file is to be opened. Accepts the
|
||||
same values as the builtin open() function.
|
||||
- bufsize -- The file's desired buffer size. Accepts the same values as
|
||||
the builtin open() function.
|
||||
- encoding -- The file's encoding. Accepts the same values as the
|
||||
builtin open() function.
|
||||
- errors -- A string indicating how encoding and decoding errors are to
|
||||
be handled. Accepts the same value as the builtin open() function.
|
||||
|
||||
Backported from CPython 3.12
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
mode: str = "r",
|
||||
bufsize: int = -1,
|
||||
encoding: str | None = None,
|
||||
errors: str | None = None,
|
||||
):
|
||||
self._mode = mode
|
||||
self._bufsize = bufsize
|
||||
self._encoding = encoding
|
||||
self._errors = errors
|
||||
|
||||
def __call__(self, string: str) -> typing.IO: # type: ignore[type-arg]
|
||||
# the special argument "-" means sys.std{in,out}
|
||||
if string == "-":
|
||||
if "r" in self._mode:
|
||||
return sys.stdin.buffer if "b" in self._mode else sys.stdin
|
||||
elif any(c in self._mode for c in "wax"):
|
||||
return sys.stdout.buffer if "b" in self._mode else sys.stdout
|
||||
else:
|
||||
msg = f'argument "-" with mode {self._mode}'
|
||||
raise ValueError(msg)
|
||||
|
||||
# all other arguments are used as file names
|
||||
try:
|
||||
return open(string, self._mode, self._bufsize, self._encoding, self._errors)
|
||||
except OSError as e:
|
||||
message = f"can't open '{string}': {e}"
|
||||
raise argparse.ArgumentTypeError(message)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
args = self._mode, self._bufsize
|
||||
kwargs = [("encoding", self._encoding), ("errors", self._errors)]
|
||||
args_str = ", ".join(
|
||||
[repr(arg) for arg in args if arg != -1]
|
||||
+ [f"{kw}={arg!r}" for kw, arg in kwargs if arg is not None]
|
||||
)
|
||||
return f"{type(self).__name__}({args_str})"
|
||||
|
||||
|
||||
def cli_detect(argv: list[str] | None = None) -> int:
|
||||
"""
|
||||
CLI assistant using ARGV and ArgumentParser
|
||||
:param argv:
|
||||
:return: 0 if everything is fine, anything else equal trouble
|
||||
"""
|
||||
parser = argparse.ArgumentParser(
|
||||
description="The Real First Universal Charset Detector. "
|
||||
"Discover originating encoding used on text file. "
|
||||
"Normalize text to unicode."
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"files", type=FileType("rb"), nargs="+", help="File(s) to be analysed"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-v",
|
||||
"--verbose",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="verbose",
|
||||
help="Display complementary information about file if any. "
|
||||
"Stdout will contain logs about the detection process.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-a",
|
||||
"--with-alternative",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="alternatives",
|
||||
help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-n",
|
||||
"--normalize",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="normalize",
|
||||
help="Permit to normalize input file. If not set, program does not write anything.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-m",
|
||||
"--minimal",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="minimal",
|
||||
help="Only output the charset detected to STDOUT. Disabling JSON output.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-r",
|
||||
"--replace",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="replace",
|
||||
help="Replace file when trying to normalize it instead of creating a new one.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-f",
|
||||
"--force",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="force",
|
||||
help="Replace file without asking if you are sure, use this flag with caution.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-i",
|
||||
"--no-preemptive",
|
||||
action="store_true",
|
||||
default=False,
|
||||
dest="no_preemptive",
|
||||
help="Disable looking at a charset declaration to hint the detector.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-t",
|
||||
"--threshold",
|
||||
action="store",
|
||||
default=0.2,
|
||||
type=float,
|
||||
dest="threshold",
|
||||
help="Define a custom maximum amount of noise allowed in decoded content. 0. <= noise <= 1.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--version",
|
||||
action="version",
|
||||
version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
|
||||
__version__,
|
||||
python_version(),
|
||||
unidata_version,
|
||||
"OFF" if md_module.__file__.lower().endswith(".py") else "ON",
|
||||
),
|
||||
help="Show version information and exit.",
|
||||
)
|
||||
|
||||
args = parser.parse_args(argv)
|
||||
|
||||
if args.replace is True and args.normalize is False:
|
||||
if args.files:
|
||||
for my_file in args.files:
|
||||
my_file.close()
|
||||
print("Use --replace in addition of --normalize only.", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
if args.force is True and args.replace is False:
|
||||
if args.files:
|
||||
for my_file in args.files:
|
||||
my_file.close()
|
||||
print("Use --force in addition of --replace only.", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
if args.threshold < 0.0 or args.threshold > 1.0:
|
||||
if args.files:
|
||||
for my_file in args.files:
|
||||
my_file.close()
|
||||
print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
|
||||
return 1
|
||||
|
||||
x_ = []
|
||||
|
||||
for my_file in args.files:
|
||||
matches = from_fp(
|
||||
my_file,
|
||||
threshold=args.threshold,
|
||||
explain=args.verbose,
|
||||
preemptive_behaviour=args.no_preemptive is False,
|
||||
)
|
||||
|
||||
best_guess = matches.best()
|
||||
|
||||
if best_guess is None:
|
||||
print(
|
||||
'Unable to identify originating encoding for "{}". {}'.format(
|
||||
my_file.name,
|
||||
(
|
||||
"Maybe try increasing maximum amount of chaos."
|
||||
if args.threshold < 1.0
|
||||
else ""
|
||||
),
|
||||
),
|
||||
file=sys.stderr,
|
||||
)
|
||||
x_.append(
|
||||
CliDetectionResult(
|
||||
abspath(my_file.name),
|
||||
None,
|
||||
[],
|
||||
[],
|
||||
"Unknown",
|
||||
[],
|
||||
False,
|
||||
1.0,
|
||||
0.0,
|
||||
None,
|
||||
True,
|
||||
)
|
||||
)
|
||||
else:
|
||||
x_.append(
|
||||
CliDetectionResult(
|
||||
abspath(my_file.name),
|
||||
best_guess.encoding,
|
||||
best_guess.encoding_aliases,
|
||||
[
|
||||
cp
|
||||
for cp in best_guess.could_be_from_charset
|
||||
if cp != best_guess.encoding
|
||||
],
|
||||
best_guess.language,
|
||||
best_guess.alphabets,
|
||||
best_guess.bom,
|
||||
best_guess.percent_chaos,
|
||||
best_guess.percent_coherence,
|
||||
None,
|
||||
True,
|
||||
)
|
||||
)
|
||||
|
||||
if len(matches) > 1 and args.alternatives:
|
||||
for el in matches:
|
||||
if el != best_guess:
|
||||
x_.append(
|
||||
CliDetectionResult(
|
||||
abspath(my_file.name),
|
||||
el.encoding,
|
||||
el.encoding_aliases,
|
||||
[
|
||||
cp
|
||||
for cp in el.could_be_from_charset
|
||||
if cp != el.encoding
|
||||
],
|
||||
el.language,
|
||||
el.alphabets,
|
||||
el.bom,
|
||||
el.percent_chaos,
|
||||
el.percent_coherence,
|
||||
None,
|
||||
False,
|
||||
)
|
||||
)
|
||||
|
||||
if args.normalize is True:
|
||||
if best_guess.encoding.startswith("utf") is True:
|
||||
print(
|
||||
'"{}" file does not need to be normalized, as it already came from unicode.'.format(
|
||||
my_file.name
|
||||
),
|
||||
file=sys.stderr,
|
||||
)
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
continue
|
||||
|
||||
dir_path = dirname(realpath(my_file.name))
|
||||
file_name = basename(realpath(my_file.name))
|
||||
|
||||
o_: list[str] = file_name.split(".")
|
||||
|
||||
if args.replace is False:
|
||||
o_.insert(-1, best_guess.encoding)
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
elif (
|
||||
args.force is False
|
||||
and query_yes_no(
|
||||
'Are you sure to normalize "{}" by replacing it ?'.format(
|
||||
my_file.name
|
||||
),
|
||||
"no",
|
||||
)
|
||||
is False
|
||||
):
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
continue
|
||||
|
||||
try:
|
||||
x_[0].unicode_path = join(dir_path, ".".join(o_))
|
||||
|
||||
with open(x_[0].unicode_path, "wb") as fp:
|
||||
fp.write(best_guess.output())
|
||||
except OSError as e:
|
||||
print(str(e), file=sys.stderr)
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
return 2
|
||||
|
||||
if my_file.closed is False:
|
||||
my_file.close()
|
||||
|
||||
if args.minimal is False:
|
||||
print(
|
||||
dumps(
|
||||
[el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
|
||||
ensure_ascii=True,
|
||||
indent=4,
|
||||
)
|
||||
)
|
||||
else:
|
||||
for my_file in args.files:
|
||||
print(
|
||||
", ".join(
|
||||
[
|
||||
el.encoding or "undefined"
|
||||
for el in x_
|
||||
if el.path == abspath(my_file.name)
|
||||
]
|
||||
)
|
||||
)
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
cli_detect()
|
||||
+2015
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,80 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import TYPE_CHECKING, Any
|
||||
from warnings import warn
|
||||
|
||||
from .api import from_bytes
|
||||
from .constant import CHARDET_CORRESPONDENCE, TOO_SMALL_SEQUENCE
|
||||
|
||||
# TODO: remove this check when dropping Python 3.7 support
|
||||
if TYPE_CHECKING:
|
||||
from typing_extensions import TypedDict
|
||||
|
||||
class ResultDict(TypedDict):
|
||||
encoding: str | None
|
||||
language: str
|
||||
confidence: float | None
|
||||
|
||||
|
||||
def detect(
|
||||
byte_str: bytes, should_rename_legacy: bool = False, **kwargs: Any
|
||||
) -> ResultDict:
|
||||
"""
|
||||
chardet legacy method
|
||||
Detect the encoding of the given byte string. It should be mostly backward-compatible.
|
||||
Encoding name will match Chardet own writing whenever possible. (Not on encoding name unsupported by it)
|
||||
This function is deprecated and should be used to migrate your project easily, consult the documentation for
|
||||
further information. Not planned for removal.
|
||||
|
||||
:param byte_str: The byte sequence to examine.
|
||||
:param should_rename_legacy: Should we rename legacy encodings
|
||||
to their more modern equivalents?
|
||||
"""
|
||||
if len(kwargs):
|
||||
warn(
|
||||
f"charset-normalizer disregard arguments '{','.join(list(kwargs.keys()))}' in legacy function detect()"
|
||||
)
|
||||
|
||||
if not isinstance(byte_str, (bytearray, bytes)):
|
||||
raise TypeError( # pragma: nocover
|
||||
f"Expected object of type bytes or bytearray, got: {type(byte_str)}"
|
||||
)
|
||||
|
||||
if isinstance(byte_str, bytearray):
|
||||
byte_str = bytes(byte_str)
|
||||
|
||||
r = from_bytes(byte_str).best()
|
||||
|
||||
encoding = r.encoding if r is not None else None
|
||||
language = r.language if r is not None and r.language != "Unknown" else ""
|
||||
confidence = 1.0 - r.chaos if r is not None else None
|
||||
|
||||
# automatically lower confidence
|
||||
# on small bytes samples.
|
||||
# https://github.com/jawah/charset_normalizer/issues/391
|
||||
if (
|
||||
confidence is not None
|
||||
and confidence >= 0.9
|
||||
and encoding
|
||||
not in {
|
||||
"utf_8",
|
||||
"ascii",
|
||||
}
|
||||
and r.bom is False # type: ignore[union-attr]
|
||||
and len(byte_str) < TOO_SMALL_SEQUENCE
|
||||
):
|
||||
confidence -= 0.2
|
||||
|
||||
# Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
|
||||
# but chardet does return 'utf-8-sig' and it is a valid codec name.
|
||||
if r is not None and encoding == "utf_8" and r.bom:
|
||||
encoding += "_sig"
|
||||
|
||||
if should_rename_legacy is False and encoding in CHARDET_CORRESPONDENCE:
|
||||
encoding = CHARDET_CORRESPONDENCE[encoding]
|
||||
|
||||
return {
|
||||
"encoding": encoding,
|
||||
"language": language,
|
||||
"confidence": confidence,
|
||||
}
|
||||
BIN
Binary file not shown.
@@ -0,0 +1,635 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from functools import lru_cache
|
||||
from logging import getLogger
|
||||
|
||||
from .constant import (
|
||||
COMMON_SAFE_ASCII_CHARACTERS,
|
||||
TRACE,
|
||||
UNICODE_SECONDARY_RANGE_KEYWORD,
|
||||
)
|
||||
from .utils import (
|
||||
is_accentuated,
|
||||
is_arabic,
|
||||
is_arabic_isolated_form,
|
||||
is_case_variable,
|
||||
is_cjk,
|
||||
is_emoticon,
|
||||
is_hangul,
|
||||
is_hiragana,
|
||||
is_katakana,
|
||||
is_latin,
|
||||
is_punctuation,
|
||||
is_separator,
|
||||
is_symbol,
|
||||
is_thai,
|
||||
is_unprintable,
|
||||
remove_accent,
|
||||
unicode_range,
|
||||
is_cjk_uncommon,
|
||||
)
|
||||
|
||||
|
||||
class MessDetectorPlugin:
|
||||
"""
|
||||
Base abstract class used for mess detection plugins.
|
||||
All detectors MUST extend and implement given methods.
|
||||
"""
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
"""
|
||||
Determine if given character should be fed in.
|
||||
"""
|
||||
raise NotImplementedError # pragma: nocover
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
"""
|
||||
The main routine to be executed upon character.
|
||||
Insert the logic in witch the text would be considered chaotic.
|
||||
"""
|
||||
raise NotImplementedError # pragma: nocover
|
||||
|
||||
def reset(self) -> None: # pragma: no cover
|
||||
"""
|
||||
Permit to reset the plugin to the initial state.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
"""
|
||||
Compute the chaos ratio based on what your feed() has seen.
|
||||
Must NOT be lower than 0.; No restriction gt 0.
|
||||
"""
|
||||
raise NotImplementedError # pragma: nocover
|
||||
|
||||
|
||||
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._punctuation_count: int = 0
|
||||
self._symbol_count: int = 0
|
||||
self._character_count: int = 0
|
||||
|
||||
self._last_printable_char: str | None = None
|
||||
self._frenzy_symbol_in_word: bool = False
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return character.isprintable()
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
self._character_count += 1
|
||||
|
||||
if (
|
||||
character != self._last_printable_char
|
||||
and character not in COMMON_SAFE_ASCII_CHARACTERS
|
||||
):
|
||||
if is_punctuation(character):
|
||||
self._punctuation_count += 1
|
||||
elif (
|
||||
character.isdigit() is False
|
||||
and is_symbol(character)
|
||||
and is_emoticon(character) is False
|
||||
):
|
||||
self._symbol_count += 2
|
||||
|
||||
self._last_printable_char = character
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._punctuation_count = 0
|
||||
self._character_count = 0
|
||||
self._symbol_count = 0
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count == 0:
|
||||
return 0.0
|
||||
|
||||
ratio_of_punctuation: float = (
|
||||
self._punctuation_count + self._symbol_count
|
||||
) / self._character_count
|
||||
|
||||
return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
|
||||
|
||||
|
||||
class TooManyAccentuatedPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._character_count: int = 0
|
||||
self._accentuated_count: int = 0
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return character.isalpha()
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
self._character_count += 1
|
||||
|
||||
if is_accentuated(character):
|
||||
self._accentuated_count += 1
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._character_count = 0
|
||||
self._accentuated_count = 0
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count < 8:
|
||||
return 0.0
|
||||
|
||||
ratio_of_accentuation: float = self._accentuated_count / self._character_count
|
||||
return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
|
||||
|
||||
|
||||
class UnprintablePlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._unprintable_count: int = 0
|
||||
self._character_count: int = 0
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return True
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
if is_unprintable(character):
|
||||
self._unprintable_count += 1
|
||||
self._character_count += 1
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._unprintable_count = 0
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count == 0:
|
||||
return 0.0
|
||||
|
||||
return (self._unprintable_count * 8) / self._character_count
|
||||
|
||||
|
||||
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._successive_count: int = 0
|
||||
self._character_count: int = 0
|
||||
|
||||
self._last_latin_character: str | None = None
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return character.isalpha() and is_latin(character)
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
self._character_count += 1
|
||||
if (
|
||||
self._last_latin_character is not None
|
||||
and is_accentuated(character)
|
||||
and is_accentuated(self._last_latin_character)
|
||||
):
|
||||
if character.isupper() and self._last_latin_character.isupper():
|
||||
self._successive_count += 1
|
||||
# Worse if its the same char duplicated with different accent.
|
||||
if remove_accent(character) == remove_accent(self._last_latin_character):
|
||||
self._successive_count += 1
|
||||
self._last_latin_character = character
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._successive_count = 0
|
||||
self._character_count = 0
|
||||
self._last_latin_character = None
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count == 0:
|
||||
return 0.0
|
||||
|
||||
return (self._successive_count * 2) / self._character_count
|
||||
|
||||
|
||||
class SuspiciousRange(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._suspicious_successive_range_count: int = 0
|
||||
self._character_count: int = 0
|
||||
self._last_printable_seen: str | None = None
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return character.isprintable()
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
self._character_count += 1
|
||||
|
||||
if (
|
||||
character.isspace()
|
||||
or is_punctuation(character)
|
||||
or character in COMMON_SAFE_ASCII_CHARACTERS
|
||||
):
|
||||
self._last_printable_seen = None
|
||||
return
|
||||
|
||||
if self._last_printable_seen is None:
|
||||
self._last_printable_seen = character
|
||||
return
|
||||
|
||||
unicode_range_a: str | None = unicode_range(self._last_printable_seen)
|
||||
unicode_range_b: str | None = unicode_range(character)
|
||||
|
||||
if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
|
||||
self._suspicious_successive_range_count += 1
|
||||
|
||||
self._last_printable_seen = character
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._character_count = 0
|
||||
self._suspicious_successive_range_count = 0
|
||||
self._last_printable_seen = None
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count <= 13:
|
||||
return 0.0
|
||||
|
||||
ratio_of_suspicious_range_usage: float = (
|
||||
self._suspicious_successive_range_count * 2
|
||||
) / self._character_count
|
||||
|
||||
return ratio_of_suspicious_range_usage
|
||||
|
||||
|
||||
class SuperWeirdWordPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._word_count: int = 0
|
||||
self._bad_word_count: int = 0
|
||||
self._foreign_long_count: int = 0
|
||||
|
||||
self._is_current_word_bad: bool = False
|
||||
self._foreign_long_watch: bool = False
|
||||
|
||||
self._character_count: int = 0
|
||||
self._bad_character_count: int = 0
|
||||
|
||||
self._buffer: str = ""
|
||||
self._buffer_accent_count: int = 0
|
||||
self._buffer_glyph_count: int = 0
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return True
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
if character.isalpha():
|
||||
self._buffer += character
|
||||
if is_accentuated(character):
|
||||
self._buffer_accent_count += 1
|
||||
if (
|
||||
self._foreign_long_watch is False
|
||||
and (is_latin(character) is False or is_accentuated(character))
|
||||
and is_cjk(character) is False
|
||||
and is_hangul(character) is False
|
||||
and is_katakana(character) is False
|
||||
and is_hiragana(character) is False
|
||||
and is_thai(character) is False
|
||||
):
|
||||
self._foreign_long_watch = True
|
||||
if (
|
||||
is_cjk(character)
|
||||
or is_hangul(character)
|
||||
or is_katakana(character)
|
||||
or is_hiragana(character)
|
||||
or is_thai(character)
|
||||
):
|
||||
self._buffer_glyph_count += 1
|
||||
return
|
||||
if not self._buffer:
|
||||
return
|
||||
if (
|
||||
character.isspace() or is_punctuation(character) or is_separator(character)
|
||||
) and self._buffer:
|
||||
self._word_count += 1
|
||||
buffer_length: int = len(self._buffer)
|
||||
|
||||
self._character_count += buffer_length
|
||||
|
||||
if buffer_length >= 4:
|
||||
if self._buffer_accent_count / buffer_length >= 0.5:
|
||||
self._is_current_word_bad = True
|
||||
# Word/Buffer ending with an upper case accentuated letter are so rare,
|
||||
# that we will consider them all as suspicious. Same weight as foreign_long suspicious.
|
||||
elif (
|
||||
is_accentuated(self._buffer[-1])
|
||||
and self._buffer[-1].isupper()
|
||||
and all(_.isupper() for _ in self._buffer) is False
|
||||
):
|
||||
self._foreign_long_count += 1
|
||||
self._is_current_word_bad = True
|
||||
elif self._buffer_glyph_count == 1:
|
||||
self._is_current_word_bad = True
|
||||
self._foreign_long_count += 1
|
||||
if buffer_length >= 24 and self._foreign_long_watch:
|
||||
camel_case_dst = [
|
||||
i
|
||||
for c, i in zip(self._buffer, range(0, buffer_length))
|
||||
if c.isupper()
|
||||
]
|
||||
probable_camel_cased: bool = False
|
||||
|
||||
if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
|
||||
probable_camel_cased = True
|
||||
|
||||
if not probable_camel_cased:
|
||||
self._foreign_long_count += 1
|
||||
self._is_current_word_bad = True
|
||||
|
||||
if self._is_current_word_bad:
|
||||
self._bad_word_count += 1
|
||||
self._bad_character_count += len(self._buffer)
|
||||
self._is_current_word_bad = False
|
||||
|
||||
self._foreign_long_watch = False
|
||||
self._buffer = ""
|
||||
self._buffer_accent_count = 0
|
||||
self._buffer_glyph_count = 0
|
||||
elif (
|
||||
character not in {"<", ">", "-", "=", "~", "|", "_"}
|
||||
and character.isdigit() is False
|
||||
and is_symbol(character)
|
||||
):
|
||||
self._is_current_word_bad = True
|
||||
self._buffer += character
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._buffer = ""
|
||||
self._is_current_word_bad = False
|
||||
self._foreign_long_watch = False
|
||||
self._bad_word_count = 0
|
||||
self._word_count = 0
|
||||
self._character_count = 0
|
||||
self._bad_character_count = 0
|
||||
self._foreign_long_count = 0
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._word_count <= 10 and self._foreign_long_count == 0:
|
||||
return 0.0
|
||||
|
||||
return self._bad_character_count / self._character_count
|
||||
|
||||
|
||||
class CjkUncommonPlugin(MessDetectorPlugin):
|
||||
"""
|
||||
Detect messy CJK text that probably means nothing.
|
||||
"""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self._character_count: int = 0
|
||||
self._uncommon_count: int = 0
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return is_cjk(character)
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
self._character_count += 1
|
||||
|
||||
if is_cjk_uncommon(character):
|
||||
self._uncommon_count += 1
|
||||
return
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._character_count = 0
|
||||
self._uncommon_count = 0
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count < 8:
|
||||
return 0.0
|
||||
|
||||
uncommon_form_usage: float = self._uncommon_count / self._character_count
|
||||
|
||||
# we can be pretty sure it's garbage when uncommon characters are widely
|
||||
# used. otherwise it could just be traditional chinese for example.
|
||||
return uncommon_form_usage / 10 if uncommon_form_usage > 0.5 else 0.0
|
||||
|
||||
|
||||
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._buf: bool = False
|
||||
|
||||
self._character_count_since_last_sep: int = 0
|
||||
|
||||
self._successive_upper_lower_count: int = 0
|
||||
self._successive_upper_lower_count_final: int = 0
|
||||
|
||||
self._character_count: int = 0
|
||||
|
||||
self._last_alpha_seen: str | None = None
|
||||
self._current_ascii_only: bool = True
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return True
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
is_concerned = character.isalpha() and is_case_variable(character)
|
||||
chunk_sep = is_concerned is False
|
||||
|
||||
if chunk_sep and self._character_count_since_last_sep > 0:
|
||||
if (
|
||||
self._character_count_since_last_sep <= 64
|
||||
and character.isdigit() is False
|
||||
and self._current_ascii_only is False
|
||||
):
|
||||
self._successive_upper_lower_count_final += (
|
||||
self._successive_upper_lower_count
|
||||
)
|
||||
|
||||
self._successive_upper_lower_count = 0
|
||||
self._character_count_since_last_sep = 0
|
||||
self._last_alpha_seen = None
|
||||
self._buf = False
|
||||
self._character_count += 1
|
||||
self._current_ascii_only = True
|
||||
|
||||
return
|
||||
|
||||
if self._current_ascii_only is True and character.isascii() is False:
|
||||
self._current_ascii_only = False
|
||||
|
||||
if self._last_alpha_seen is not None:
|
||||
if (character.isupper() and self._last_alpha_seen.islower()) or (
|
||||
character.islower() and self._last_alpha_seen.isupper()
|
||||
):
|
||||
if self._buf is True:
|
||||
self._successive_upper_lower_count += 2
|
||||
self._buf = False
|
||||
else:
|
||||
self._buf = True
|
||||
else:
|
||||
self._buf = False
|
||||
|
||||
self._character_count += 1
|
||||
self._character_count_since_last_sep += 1
|
||||
self._last_alpha_seen = character
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._character_count = 0
|
||||
self._character_count_since_last_sep = 0
|
||||
self._successive_upper_lower_count = 0
|
||||
self._successive_upper_lower_count_final = 0
|
||||
self._last_alpha_seen = None
|
||||
self._buf = False
|
||||
self._current_ascii_only = True
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count == 0:
|
||||
return 0.0
|
||||
|
||||
return self._successive_upper_lower_count_final / self._character_count
|
||||
|
||||
|
||||
class ArabicIsolatedFormPlugin(MessDetectorPlugin):
|
||||
def __init__(self) -> None:
|
||||
self._character_count: int = 0
|
||||
self._isolated_form_count: int = 0
|
||||
|
||||
def reset(self) -> None: # Abstract
|
||||
self._character_count = 0
|
||||
self._isolated_form_count = 0
|
||||
|
||||
def eligible(self, character: str) -> bool:
|
||||
return is_arabic(character)
|
||||
|
||||
def feed(self, character: str) -> None:
|
||||
self._character_count += 1
|
||||
|
||||
if is_arabic_isolated_form(character):
|
||||
self._isolated_form_count += 1
|
||||
|
||||
@property
|
||||
def ratio(self) -> float:
|
||||
if self._character_count < 8:
|
||||
return 0.0
|
||||
|
||||
isolated_form_usage: float = self._isolated_form_count / self._character_count
|
||||
|
||||
return isolated_form_usage
|
||||
|
||||
|
||||
@lru_cache(maxsize=1024)
|
||||
def is_suspiciously_successive_range(
|
||||
unicode_range_a: str | None, unicode_range_b: str | None
|
||||
) -> bool:
|
||||
"""
|
||||
Determine if two Unicode range seen next to each other can be considered as suspicious.
|
||||
"""
|
||||
if unicode_range_a is None or unicode_range_b is None:
|
||||
return True
|
||||
|
||||
if unicode_range_a == unicode_range_b:
|
||||
return False
|
||||
|
||||
if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
|
||||
return False
|
||||
|
||||
if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
|
||||
return False
|
||||
|
||||
# Latin characters can be accompanied with a combining diacritical mark
|
||||
# eg. Vietnamese.
|
||||
if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
|
||||
"Combining" in unicode_range_a or "Combining" in unicode_range_b
|
||||
):
|
||||
return False
|
||||
|
||||
keywords_range_a, keywords_range_b = (
|
||||
unicode_range_a.split(" "),
|
||||
unicode_range_b.split(" "),
|
||||
)
|
||||
|
||||
for el in keywords_range_a:
|
||||
if el in UNICODE_SECONDARY_RANGE_KEYWORD:
|
||||
continue
|
||||
if el in keywords_range_b:
|
||||
return False
|
||||
|
||||
# Japanese Exception
|
||||
range_a_jp_chars, range_b_jp_chars = (
|
||||
unicode_range_a
|
||||
in (
|
||||
"Hiragana",
|
||||
"Katakana",
|
||||
),
|
||||
unicode_range_b in ("Hiragana", "Katakana"),
|
||||
)
|
||||
if (range_a_jp_chars or range_b_jp_chars) and (
|
||||
"CJK" in unicode_range_a or "CJK" in unicode_range_b
|
||||
):
|
||||
return False
|
||||
if range_a_jp_chars and range_b_jp_chars:
|
||||
return False
|
||||
|
||||
if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
|
||||
if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
|
||||
return False
|
||||
if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
|
||||
return False
|
||||
|
||||
# Chinese/Japanese use dedicated range for punctuation and/or separators.
|
||||
if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
|
||||
unicode_range_a in ["Katakana", "Hiragana"]
|
||||
and unicode_range_b in ["Katakana", "Hiragana"]
|
||||
):
|
||||
if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
|
||||
return False
|
||||
if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
|
||||
return False
|
||||
if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
|
||||
@lru_cache(maxsize=2048)
|
||||
def mess_ratio(
|
||||
decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
|
||||
) -> float:
|
||||
"""
|
||||
Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
|
||||
"""
|
||||
|
||||
detectors: list[MessDetectorPlugin] = [
|
||||
md_class() for md_class in MessDetectorPlugin.__subclasses__()
|
||||
]
|
||||
|
||||
length: int = len(decoded_sequence) + 1
|
||||
|
||||
mean_mess_ratio: float = 0.0
|
||||
|
||||
if length < 512:
|
||||
intermediary_mean_mess_ratio_calc: int = 32
|
||||
elif length <= 1024:
|
||||
intermediary_mean_mess_ratio_calc = 64
|
||||
else:
|
||||
intermediary_mean_mess_ratio_calc = 128
|
||||
|
||||
for character, index in zip(decoded_sequence + "\n", range(length)):
|
||||
for detector in detectors:
|
||||
if detector.eligible(character):
|
||||
detector.feed(character)
|
||||
|
||||
if (
|
||||
index > 0 and index % intermediary_mean_mess_ratio_calc == 0
|
||||
) or index == length - 1:
|
||||
mean_mess_ratio = sum(dt.ratio for dt in detectors)
|
||||
|
||||
if mean_mess_ratio >= maximum_threshold:
|
||||
break
|
||||
|
||||
if debug:
|
||||
logger = getLogger("charset_normalizer")
|
||||
|
||||
logger.log(
|
||||
TRACE,
|
||||
"Mess-detector extended-analysis start. "
|
||||
f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
|
||||
f"maximum_threshold={maximum_threshold}",
|
||||
)
|
||||
|
||||
if len(decoded_sequence) > 16:
|
||||
logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
|
||||
logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")
|
||||
|
||||
for dt in detectors:
|
||||
logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")
|
||||
|
||||
return round(mean_mess_ratio, 3)
|
||||
BIN
Binary file not shown.
@@ -0,0 +1,360 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from encodings.aliases import aliases
|
||||
from hashlib import sha256
|
||||
from json import dumps
|
||||
from re import sub
|
||||
from typing import Any, Iterator, List, Tuple
|
||||
|
||||
from .constant import RE_POSSIBLE_ENCODING_INDICATION, TOO_BIG_SEQUENCE
|
||||
from .utils import iana_name, is_multi_byte_encoding, unicode_range
|
||||
|
||||
|
||||
class CharsetMatch:
|
||||
def __init__(
|
||||
self,
|
||||
payload: bytes,
|
||||
guessed_encoding: str,
|
||||
mean_mess_ratio: float,
|
||||
has_sig_or_bom: bool,
|
||||
languages: CoherenceMatches,
|
||||
decoded_payload: str | None = None,
|
||||
preemptive_declaration: str | None = None,
|
||||
):
|
||||
self._payload: bytes = payload
|
||||
|
||||
self._encoding: str = guessed_encoding
|
||||
self._mean_mess_ratio: float = mean_mess_ratio
|
||||
self._languages: CoherenceMatches = languages
|
||||
self._has_sig_or_bom: bool = has_sig_or_bom
|
||||
self._unicode_ranges: list[str] | None = None
|
||||
|
||||
self._leaves: list[CharsetMatch] = []
|
||||
self._mean_coherence_ratio: float = 0.0
|
||||
|
||||
self._output_payload: bytes | None = None
|
||||
self._output_encoding: str | None = None
|
||||
|
||||
self._string: str | None = decoded_payload
|
||||
|
||||
self._preemptive_declaration: str | None = preemptive_declaration
|
||||
|
||||
def __eq__(self, other: object) -> bool:
|
||||
if not isinstance(other, CharsetMatch):
|
||||
if isinstance(other, str):
|
||||
return iana_name(other) == self.encoding
|
||||
return False
|
||||
return self.encoding == other.encoding and self.fingerprint == other.fingerprint
|
||||
|
||||
def __lt__(self, other: object) -> bool:
|
||||
"""
|
||||
Implemented to make sorted available upon CharsetMatches items.
|
||||
"""
|
||||
if not isinstance(other, CharsetMatch):
|
||||
raise ValueError
|
||||
|
||||
chaos_difference: float = abs(self.chaos - other.chaos)
|
||||
coherence_difference: float = abs(self.coherence - other.coherence)
|
||||
|
||||
# Below 1% difference --> Use Coherence
|
||||
if chaos_difference < 0.01 and coherence_difference > 0.02:
|
||||
return self.coherence > other.coherence
|
||||
elif chaos_difference < 0.01 and coherence_difference <= 0.02:
|
||||
# When having a difficult decision, use the result that decoded as many multi-byte as possible.
|
||||
# preserve RAM usage!
|
||||
if len(self._payload) >= TOO_BIG_SEQUENCE:
|
||||
return self.chaos < other.chaos
|
||||
return self.multi_byte_usage > other.multi_byte_usage
|
||||
|
||||
return self.chaos < other.chaos
|
||||
|
||||
@property
|
||||
def multi_byte_usage(self) -> float:
|
||||
return 1.0 - (len(str(self)) / len(self.raw))
|
||||
|
||||
def __str__(self) -> str:
|
||||
# Lazy Str Loading
|
||||
if self._string is None:
|
||||
self._string = str(self._payload, self._encoding, "strict")
|
||||
return self._string
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"<CharsetMatch '{self.encoding}' bytes({self.fingerprint})>"
|
||||
|
||||
def add_submatch(self, other: CharsetMatch) -> None:
|
||||
if not isinstance(other, CharsetMatch) or other == self:
|
||||
raise ValueError(
|
||||
"Unable to add instance <{}> as a submatch of a CharsetMatch".format(
|
||||
other.__class__
|
||||
)
|
||||
)
|
||||
|
||||
other._string = None # Unload RAM usage; dirty trick.
|
||||
self._leaves.append(other)
|
||||
|
||||
@property
|
||||
def encoding(self) -> str:
|
||||
return self._encoding
|
||||
|
||||
@property
|
||||
def encoding_aliases(self) -> list[str]:
|
||||
"""
|
||||
Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
|
||||
"""
|
||||
also_known_as: list[str] = []
|
||||
for u, p in aliases.items():
|
||||
if self.encoding == u:
|
||||
also_known_as.append(p)
|
||||
elif self.encoding == p:
|
||||
also_known_as.append(u)
|
||||
return also_known_as
|
||||
|
||||
@property
|
||||
def bom(self) -> bool:
|
||||
return self._has_sig_or_bom
|
||||
|
||||
@property
|
||||
def byte_order_mark(self) -> bool:
|
||||
return self._has_sig_or_bom
|
||||
|
||||
@property
|
||||
def languages(self) -> list[str]:
|
||||
"""
|
||||
Return the complete list of possible languages found in decoded sequence.
|
||||
Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
|
||||
"""
|
||||
return [e[0] for e in self._languages]
|
||||
|
||||
@property
|
||||
def language(self) -> str:
|
||||
"""
|
||||
Most probable language found in decoded sequence. If none were detected or inferred, the property will return
|
||||
"Unknown".
|
||||
"""
|
||||
if not self._languages:
|
||||
# Trying to infer the language based on the given encoding
|
||||
# Its either English or we should not pronounce ourselves in certain cases.
|
||||
if "ascii" in self.could_be_from_charset:
|
||||
return "English"
|
||||
|
||||
# doing it there to avoid circular import
|
||||
from charset_normalizer.cd import encoding_languages, mb_encoding_languages
|
||||
|
||||
languages = (
|
||||
mb_encoding_languages(self.encoding)
|
||||
if is_multi_byte_encoding(self.encoding)
|
||||
else encoding_languages(self.encoding)
|
||||
)
|
||||
|
||||
if len(languages) == 0 or "Latin Based" in languages:
|
||||
return "Unknown"
|
||||
|
||||
return languages[0]
|
||||
|
||||
return self._languages[0][0]
|
||||
|
||||
@property
|
||||
def chaos(self) -> float:
|
||||
return self._mean_mess_ratio
|
||||
|
||||
@property
|
||||
def coherence(self) -> float:
|
||||
if not self._languages:
|
||||
return 0.0
|
||||
return self._languages[0][1]
|
||||
|
||||
@property
|
||||
def percent_chaos(self) -> float:
|
||||
return round(self.chaos * 100, ndigits=3)
|
||||
|
||||
@property
|
||||
def percent_coherence(self) -> float:
|
||||
return round(self.coherence * 100, ndigits=3)
|
||||
|
||||
@property
|
||||
def raw(self) -> bytes:
|
||||
"""
|
||||
Original untouched bytes.
|
||||
"""
|
||||
return self._payload
|
||||
|
||||
@property
|
||||
def submatch(self) -> list[CharsetMatch]:
|
||||
return self._leaves
|
||||
|
||||
@property
|
||||
def has_submatch(self) -> bool:
|
||||
return len(self._leaves) > 0
|
||||
|
||||
@property
|
||||
def alphabets(self) -> list[str]:
|
||||
if self._unicode_ranges is not None:
|
||||
return self._unicode_ranges
|
||||
# list detected ranges
|
||||
detected_ranges: list[str | None] = [unicode_range(char) for char in str(self)]
|
||||
# filter and sort
|
||||
self._unicode_ranges = sorted(list({r for r in detected_ranges if r}))
|
||||
return self._unicode_ranges
|
||||
|
||||
@property
|
||||
def could_be_from_charset(self) -> list[str]:
|
||||
"""
|
||||
The complete list of encoding that output the exact SAME str result and therefore could be the originating
|
||||
encoding.
|
||||
This list does include the encoding available in property 'encoding'.
|
||||
"""
|
||||
return [self._encoding] + [m.encoding for m in self._leaves]
|
||||
|
||||
def output(self, encoding: str = "utf_8") -> bytes:
|
||||
"""
|
||||
Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
|
||||
Any errors will be simply ignored by the encoder NOT replaced.
|
||||
"""
|
||||
if self._output_encoding is None or self._output_encoding != encoding:
|
||||
self._output_encoding = encoding
|
||||
decoded_string = str(self)
|
||||
if (
|
||||
self._preemptive_declaration is not None
|
||||
and self._preemptive_declaration.lower()
|
||||
not in ["utf-8", "utf8", "utf_8"]
|
||||
):
|
||||
patched_header = sub(
|
||||
RE_POSSIBLE_ENCODING_INDICATION,
|
||||
lambda m: m.string[m.span()[0] : m.span()[1]].replace(
|
||||
m.groups()[0],
|
||||
iana_name(self._output_encoding).replace("_", "-"), # type: ignore[arg-type]
|
||||
),
|
||||
decoded_string[:8192],
|
||||
count=1,
|
||||
)
|
||||
|
||||
decoded_string = patched_header + decoded_string[8192:]
|
||||
|
||||
self._output_payload = decoded_string.encode(encoding, "replace")
|
||||
|
||||
return self._output_payload # type: ignore
|
||||
|
||||
@property
|
||||
def fingerprint(self) -> str:
|
||||
"""
|
||||
Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
|
||||
"""
|
||||
return sha256(self.output()).hexdigest()
|
||||
|
||||
|
||||
class CharsetMatches:
|
||||
"""
|
||||
Container with every CharsetMatch items ordered by default from most probable to the less one.
|
||||
Act like a list(iterable) but does not implements all related methods.
|
||||
"""
|
||||
|
||||
def __init__(self, results: list[CharsetMatch] | None = None):
|
||||
self._results: list[CharsetMatch] = sorted(results) if results else []
|
||||
|
||||
def __iter__(self) -> Iterator[CharsetMatch]:
|
||||
yield from self._results
|
||||
|
||||
def __getitem__(self, item: int | str) -> CharsetMatch:
|
||||
"""
|
||||
Retrieve a single item either by its position or encoding name (alias may be used here).
|
||||
Raise KeyError upon invalid index or encoding not present in results.
|
||||
"""
|
||||
if isinstance(item, int):
|
||||
return self._results[item]
|
||||
if isinstance(item, str):
|
||||
item = iana_name(item, False)
|
||||
for result in self._results:
|
||||
if item in result.could_be_from_charset:
|
||||
return result
|
||||
raise KeyError
|
||||
|
||||
def __len__(self) -> int:
|
||||
return len(self._results)
|
||||
|
||||
def __bool__(self) -> bool:
|
||||
return len(self._results) > 0
|
||||
|
||||
def append(self, item: CharsetMatch) -> None:
|
||||
"""
|
||||
Insert a single match. Will be inserted accordingly to preserve sort.
|
||||
Can be inserted as a submatch.
|
||||
"""
|
||||
if not isinstance(item, CharsetMatch):
|
||||
raise ValueError(
|
||||
"Cannot append instance '{}' to CharsetMatches".format(
|
||||
str(item.__class__)
|
||||
)
|
||||
)
|
||||
# We should disable the submatch factoring when the input file is too heavy (conserve RAM usage)
|
||||
if len(item.raw) < TOO_BIG_SEQUENCE:
|
||||
for match in self._results:
|
||||
if match.fingerprint == item.fingerprint and match.chaos == item.chaos:
|
||||
match.add_submatch(item)
|
||||
return
|
||||
self._results.append(item)
|
||||
self._results = sorted(self._results)
|
||||
|
||||
def best(self) -> CharsetMatch | None:
|
||||
"""
|
||||
Simply return the first match. Strict equivalent to matches[0].
|
||||
"""
|
||||
if not self._results:
|
||||
return None
|
||||
return self._results[0]
|
||||
|
||||
def first(self) -> CharsetMatch | None:
|
||||
"""
|
||||
Redundant method, call the method best(). Kept for BC reasons.
|
||||
"""
|
||||
return self.best()
|
||||
|
||||
|
||||
CoherenceMatch = Tuple[str, float]
|
||||
CoherenceMatches = List[CoherenceMatch]
|
||||
|
||||
|
||||
class CliDetectionResult:
|
||||
def __init__(
|
||||
self,
|
||||
path: str,
|
||||
encoding: str | None,
|
||||
encoding_aliases: list[str],
|
||||
alternative_encodings: list[str],
|
||||
language: str,
|
||||
alphabets: list[str],
|
||||
has_sig_or_bom: bool,
|
||||
chaos: float,
|
||||
coherence: float,
|
||||
unicode_path: str | None,
|
||||
is_preferred: bool,
|
||||
):
|
||||
self.path: str = path
|
||||
self.unicode_path: str | None = unicode_path
|
||||
self.encoding: str | None = encoding
|
||||
self.encoding_aliases: list[str] = encoding_aliases
|
||||
self.alternative_encodings: list[str] = alternative_encodings
|
||||
self.language: str = language
|
||||
self.alphabets: list[str] = alphabets
|
||||
self.has_sig_or_bom: bool = has_sig_or_bom
|
||||
self.chaos: float = chaos
|
||||
self.coherence: float = coherence
|
||||
self.is_preferred: bool = is_preferred
|
||||
|
||||
@property
|
||||
def __dict__(self) -> dict[str, Any]: # type: ignore
|
||||
return {
|
||||
"path": self.path,
|
||||
"encoding": self.encoding,
|
||||
"encoding_aliases": self.encoding_aliases,
|
||||
"alternative_encodings": self.alternative_encodings,
|
||||
"language": self.language,
|
||||
"alphabets": self.alphabets,
|
||||
"has_sig_or_bom": self.has_sig_or_bom,
|
||||
"chaos": self.chaos,
|
||||
"coherence": self.coherence,
|
||||
"unicode_path": self.unicode_path,
|
||||
"is_preferred": self.is_preferred,
|
||||
}
|
||||
|
||||
def to_json(self) -> str:
|
||||
return dumps(self.__dict__, ensure_ascii=True, indent=4)
|
||||
@@ -0,0 +1,414 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
import logging
|
||||
import unicodedata
|
||||
from codecs import IncrementalDecoder
|
||||
from encodings.aliases import aliases
|
||||
from functools import lru_cache
|
||||
from re import findall
|
||||
from typing import Generator
|
||||
|
||||
from _multibytecodec import ( # type: ignore[import-not-found,import]
|
||||
MultibyteIncrementalDecoder,
|
||||
)
|
||||
|
||||
from .constant import (
|
||||
ENCODING_MARKS,
|
||||
IANA_SUPPORTED_SIMILAR,
|
||||
RE_POSSIBLE_ENCODING_INDICATION,
|
||||
UNICODE_RANGES_COMBINED,
|
||||
UNICODE_SECONDARY_RANGE_KEYWORD,
|
||||
UTF8_MAXIMAL_ALLOCATION,
|
||||
COMMON_CJK_CHARACTERS,
|
||||
)
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_accentuated(character: str) -> bool:
|
||||
try:
|
||||
description: str = unicodedata.name(character)
|
||||
except ValueError: # Defensive: unicode database outdated?
|
||||
return False
|
||||
return (
|
||||
"WITH GRAVE" in description
|
||||
or "WITH ACUTE" in description
|
||||
or "WITH CEDILLA" in description
|
||||
or "WITH DIAERESIS" in description
|
||||
or "WITH CIRCUMFLEX" in description
|
||||
or "WITH TILDE" in description
|
||||
or "WITH MACRON" in description
|
||||
or "WITH RING ABOVE" in description
|
||||
)
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def remove_accent(character: str) -> str:
|
||||
decomposed: str = unicodedata.decomposition(character)
|
||||
if not decomposed:
|
||||
return character
|
||||
|
||||
codes: list[str] = decomposed.split(" ")
|
||||
|
||||
return chr(int(codes[0], 16))
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def unicode_range(character: str) -> str | None:
|
||||
"""
|
||||
Retrieve the Unicode range official name from a single character.
|
||||
"""
|
||||
character_ord: int = ord(character)
|
||||
|
||||
for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
|
||||
if character_ord in ord_range:
|
||||
return range_name
|
||||
|
||||
return None
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_latin(character: str) -> bool:
|
||||
try:
|
||||
description: str = unicodedata.name(character)
|
||||
except ValueError: # Defensive: unicode database outdated?
|
||||
return False
|
||||
return "LATIN" in description
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_punctuation(character: str) -> bool:
|
||||
character_category: str = unicodedata.category(character)
|
||||
|
||||
if "P" in character_category:
|
||||
return True
|
||||
|
||||
character_range: str | None = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
return False
|
||||
|
||||
return "Punctuation" in character_range
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_symbol(character: str) -> bool:
|
||||
character_category: str = unicodedata.category(character)
|
||||
|
||||
if "S" in character_category or "N" in character_category:
|
||||
return True
|
||||
|
||||
character_range: str | None = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
return False
|
||||
|
||||
return "Forms" in character_range and character_category != "Lo"
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_emoticon(character: str) -> bool:
|
||||
character_range: str | None = unicode_range(character)
|
||||
|
||||
if character_range is None:
|
||||
return False
|
||||
|
||||
return "Emoticons" in character_range or "Pictographs" in character_range
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_separator(character: str) -> bool:
|
||||
if character.isspace() or character in {"|", "+", "<", ">"}:
|
||||
return True
|
||||
|
||||
character_category: str = unicodedata.category(character)
|
||||
|
||||
return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_case_variable(character: str) -> bool:
|
||||
return character.islower() != character.isupper()
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_cjk(character: str) -> bool:
|
||||
try:
|
||||
character_name = unicodedata.name(character)
|
||||
except ValueError: # Defensive: unicode database outdated?
|
||||
return False
|
||||
|
||||
return "CJK" in character_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_hiragana(character: str) -> bool:
|
||||
try:
|
||||
character_name = unicodedata.name(character)
|
||||
except ValueError: # Defensive: unicode database outdated?
|
||||
return False
|
||||
|
||||
return "HIRAGANA" in character_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_katakana(character: str) -> bool:
|
||||
try:
|
||||
character_name = unicodedata.name(character)
|
||||
except ValueError: # Defensive: unicode database outdated?
|
||||
return False
|
||||
|
||||
return "KATAKANA" in character_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_hangul(character: str) -> bool:
|
||||
try:
|
||||
character_name = unicodedata.name(character)
|
||||
except ValueError: # Defensive: unicode database outdated?
|
||||
return False
|
||||
|
||||
return "HANGUL" in character_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_thai(character: str) -> bool:
|
||||
try:
|
||||
character_name = unicodedata.name(character)
|
||||
except ValueError: # Defensive: unicode database outdated?
|
||||
return False
|
||||
|
||||
return "THAI" in character_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_arabic(character: str) -> bool:
|
||||
try:
|
||||
character_name = unicodedata.name(character)
|
||||
except ValueError: # Defensive: unicode database outdated?
|
||||
return False
|
||||
|
||||
return "ARABIC" in character_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_arabic_isolated_form(character: str) -> bool:
|
||||
try:
|
||||
character_name = unicodedata.name(character)
|
||||
except ValueError: # Defensive: unicode database outdated?
|
||||
return False
|
||||
|
||||
return "ARABIC" in character_name and "ISOLATED FORM" in character_name
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_cjk_uncommon(character: str) -> bool:
|
||||
return character not in COMMON_CJK_CHARACTERS
|
||||
|
||||
|
||||
@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
|
||||
def is_unicode_range_secondary(range_name: str) -> bool:
|
||||
return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
|
||||
|
||||
|
||||
@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
|
||||
def is_unprintable(character: str) -> bool:
|
||||
return (
|
||||
character.isspace() is False # includes \n \t \r \v
|
||||
and character.isprintable() is False
|
||||
and character != "\x1a" # Why? Its the ASCII substitute character.
|
||||
and character != "\ufeff" # bug discovered in Python,
|
||||
# Zero Width No-Break Space located in Arabic Presentation Forms-B, Unicode 1.1 not acknowledged as space.
|
||||
)
|
||||
|
||||
|
||||
def any_specified_encoding(sequence: bytes, search_zone: int = 8192) -> str | None:
|
||||
"""
|
||||
Extract using ASCII-only decoder any specified encoding in the first n-bytes.
|
||||
"""
|
||||
if not isinstance(sequence, bytes):
|
||||
raise TypeError
|
||||
|
||||
seq_len: int = len(sequence)
|
||||
|
||||
results: list[str] = findall(
|
||||
RE_POSSIBLE_ENCODING_INDICATION,
|
||||
sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
|
||||
)
|
||||
|
||||
if len(results) == 0:
|
||||
return None
|
||||
|
||||
for specified_encoding in results:
|
||||
specified_encoding = specified_encoding.lower().replace("-", "_")
|
||||
|
||||
encoding_alias: str
|
||||
encoding_iana: str
|
||||
|
||||
for encoding_alias, encoding_iana in aliases.items():
|
||||
if encoding_alias == specified_encoding:
|
||||
return encoding_iana
|
||||
if encoding_iana == specified_encoding:
|
||||
return encoding_iana
|
||||
|
||||
return None
|
||||
|
||||
|
||||
@lru_cache(maxsize=128)
|
||||
def is_multi_byte_encoding(name: str) -> bool:
|
||||
"""
|
||||
Verify is a specific encoding is a multi byte one based on it IANA name
|
||||
"""
|
||||
return name in {
|
||||
"utf_8",
|
||||
"utf_8_sig",
|
||||
"utf_16",
|
||||
"utf_16_be",
|
||||
"utf_16_le",
|
||||
"utf_32",
|
||||
"utf_32_le",
|
||||
"utf_32_be",
|
||||
"utf_7",
|
||||
} or issubclass(
|
||||
importlib.import_module(f"encodings.{name}").IncrementalDecoder,
|
||||
MultibyteIncrementalDecoder,
|
||||
)
|
||||
|
||||
|
||||
def identify_sig_or_bom(sequence: bytes) -> tuple[str | None, bytes]:
|
||||
"""
|
||||
Identify and extract SIG/BOM in given sequence.
|
||||
"""
|
||||
|
||||
for iana_encoding in ENCODING_MARKS:
|
||||
marks: bytes | list[bytes] = ENCODING_MARKS[iana_encoding]
|
||||
|
||||
if isinstance(marks, bytes):
|
||||
marks = [marks]
|
||||
|
||||
for mark in marks:
|
||||
if sequence.startswith(mark):
|
||||
return iana_encoding, mark
|
||||
|
||||
return None, b""
|
||||
|
||||
|
||||
def should_strip_sig_or_bom(iana_encoding: str) -> bool:
|
||||
return iana_encoding not in {"utf_16", "utf_32"}
|
||||
|
||||
|
||||
def iana_name(cp_name: str, strict: bool = True) -> str:
|
||||
"""Returns the Python normalized encoding name (Not the IANA official name)."""
|
||||
cp_name = cp_name.lower().replace("-", "_")
|
||||
|
||||
encoding_alias: str
|
||||
encoding_iana: str
|
||||
|
||||
for encoding_alias, encoding_iana in aliases.items():
|
||||
if cp_name in [encoding_alias, encoding_iana]:
|
||||
return encoding_iana
|
||||
|
||||
if strict:
|
||||
raise ValueError(f"Unable to retrieve IANA for '{cp_name}'")
|
||||
|
||||
return cp_name
|
||||
|
||||
|
||||
def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
|
||||
if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
|
||||
return 0.0
|
||||
|
||||
decoder_a = importlib.import_module(f"encodings.{iana_name_a}").IncrementalDecoder
|
||||
decoder_b = importlib.import_module(f"encodings.{iana_name_b}").IncrementalDecoder
|
||||
|
||||
id_a: IncrementalDecoder = decoder_a(errors="ignore")
|
||||
id_b: IncrementalDecoder = decoder_b(errors="ignore")
|
||||
|
||||
character_match_count: int = 0
|
||||
|
||||
for i in range(255):
|
||||
to_be_decoded: bytes = bytes([i])
|
||||
if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
|
||||
character_match_count += 1
|
||||
|
||||
return character_match_count / 254
|
||||
|
||||
|
||||
def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
|
||||
"""
|
||||
Determine if two code page are at least 80% similar. IANA_SUPPORTED_SIMILAR dict was generated using
|
||||
the function cp_similarity.
|
||||
"""
|
||||
return (
|
||||
iana_name_a in IANA_SUPPORTED_SIMILAR
|
||||
and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
|
||||
)
|
||||
|
||||
|
||||
def set_logging_handler(
|
||||
name: str = "charset_normalizer",
|
||||
level: int = logging.INFO,
|
||||
format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
|
||||
) -> None:
|
||||
logger = logging.getLogger(name)
|
||||
logger.setLevel(level)
|
||||
|
||||
handler = logging.StreamHandler()
|
||||
handler.setFormatter(logging.Formatter(format_string))
|
||||
logger.addHandler(handler)
|
||||
|
||||
|
||||
def cut_sequence_chunks(
|
||||
sequences: bytes,
|
||||
encoding_iana: str,
|
||||
offsets: range,
|
||||
chunk_size: int,
|
||||
bom_or_sig_available: bool,
|
||||
strip_sig_or_bom: bool,
|
||||
sig_payload: bytes,
|
||||
is_multi_byte_decoder: bool,
|
||||
decoded_payload: str | None = None,
|
||||
) -> Generator[str, None, None]:
|
||||
if decoded_payload and is_multi_byte_decoder is False:
|
||||
for i in offsets:
|
||||
chunk = decoded_payload[i : i + chunk_size]
|
||||
if not chunk:
|
||||
break
|
||||
yield chunk
|
||||
else:
|
||||
for i in offsets:
|
||||
chunk_end = i + chunk_size
|
||||
if chunk_end > len(sequences) + 8:
|
||||
continue
|
||||
|
||||
cut_sequence = sequences[i : i + chunk_size]
|
||||
|
||||
if bom_or_sig_available and strip_sig_or_bom is False:
|
||||
cut_sequence = sig_payload + cut_sequence
|
||||
|
||||
chunk = cut_sequence.decode(
|
||||
encoding_iana,
|
||||
errors="ignore" if is_multi_byte_decoder else "strict",
|
||||
)
|
||||
|
||||
# multi-byte bad cutting detector and adjustment
|
||||
# not the cleanest way to perform that fix but clever enough for now.
|
||||
if is_multi_byte_decoder and i > 0:
|
||||
chunk_partial_size_chk: int = min(chunk_size, 16)
|
||||
|
||||
if (
|
||||
decoded_payload
|
||||
and chunk[:chunk_partial_size_chk] not in decoded_payload
|
||||
):
|
||||
for j in range(i, i - 4, -1):
|
||||
cut_sequence = sequences[j:chunk_end]
|
||||
|
||||
if bom_or_sig_available and strip_sig_or_bom is False:
|
||||
cut_sequence = sig_payload + cut_sequence
|
||||
|
||||
chunk = cut_sequence.decode(encoding_iana, errors="ignore")
|
||||
|
||||
if chunk[:chunk_partial_size_chk] in decoded_payload:
|
||||
break
|
||||
|
||||
yield chunk
|
||||
@@ -0,0 +1,8 @@
|
||||
"""
|
||||
Expose version
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
__version__ = "3.4.4"
|
||||
VERSION = __version__.split(".")
|
||||
+1
@@ -0,0 +1 @@
|
||||
pip
|
||||
+156
@@ -0,0 +1,156 @@
|
||||
Metadata-Version: 2.4
|
||||
Name: cramjam
|
||||
Version: 2.11.0
|
||||
Requires-Dist: black==22.3.0 ; extra == 'dev'
|
||||
Requires-Dist: numpy ; extra == 'dev'
|
||||
Requires-Dist: pytest>=5.30 ; extra == 'dev'
|
||||
Requires-Dist: pytest-xdist ; extra == 'dev'
|
||||
Requires-Dist: pytest-benchmark ; extra == 'dev'
|
||||
Requires-Dist: hypothesis==6.60.0 ; extra == 'dev'
|
||||
Provides-Extra: dev
|
||||
License-File: LICENSE
|
||||
Summary: Thin Python bindings to de/compression algorithms in Rust
|
||||
Keywords: compression,decompression,snappy,zstd,bz2,gzip,lz4,brotli,deflate,blosc2
|
||||
Author: Miles Granger <miles59923@gmail.com>
|
||||
Author-email: Miles Granger <miles59923@gmail.com>
|
||||
License: MIT
|
||||
Requires-Python: >=3.8
|
||||
Description-Content-Type: text/markdown; charset=UTF-8; variant=GFM
|
||||
Project-URL: homepage, https://github.com/milesgranger/pyrus-cramjam
|
||||
Project-URL: documentation, https://docs.rs/cramjam/latest/cramjam
|
||||
Project-URL: repository, https://github.com/milesgranger/pyrus-cramjam
|
||||
|
||||
# cramjam
|
||||
|
||||
[](https://github.com/python/black)
|
||||
[](https://github.com/milesgranger/cramjam/actions/workflows/CI.yml)
|
||||
[](https://pypi.org/project/cramjam)
|
||||
[](https://anaconda.org/conda-forge/cramjam)
|
||||
[](https://pepy.tech/project/cramjam)
|
||||
[](https://www.npmjs.com/package/cramjam)
|
||||
|
||||
|
||||
[API Documentation](https://milesgranger.github.io/cramjam/cramjam.html)
|
||||
|
||||
### Install (Python)
|
||||
```commandline
|
||||
pip install --upgrade cramjam # Requires no Python or system dependencies!
|
||||
```
|
||||
|
||||
|
||||
### Install (JavaScript / TypeScript)
|
||||
```commandline
|
||||
npm install cramjam
|
||||
```
|
||||
|
||||
### CLI
|
||||
|
||||
A CLI interface is available as [`cramjam-cli`](https://github.com/cramjam/cramjam-cli)
|
||||
|
||||
### libcramjam
|
||||
|
||||
A Rust crate and C friendly library available at [libcramjam](https://github.com/cramjam/libcramjam)
|
||||
|
||||
---
|
||||
|
||||
Extremely thin and easy-to-install Python bindings to de/compression algorithms in Rust.
|
||||
Allows for using algorithms such as Snappy, without any system or other python dependencies.
|
||||
|
||||
---
|
||||
|
||||
##### Benchmarks
|
||||
|
||||
Some basic benchmarks are available [in the benchmarks directory](./benchmarks/README.md)
|
||||
|
||||
---
|
||||
|
||||
Available algorithms:
|
||||
|
||||
- [X] Snappy `cramjam.snappy`
|
||||
- [X] Brotli `cramjam.brotli`
|
||||
- [X] Bzip2 `cramjam.bzip2`
|
||||
- [X] Lz4 `cramjam.lz4`
|
||||
- [X] Gzip `cramjam.gzip`
|
||||
- [X] Zlib `cramjam.zlib`
|
||||
- [X] Deflate `cramjam.deflate`
|
||||
- [X] ZSTD `cramjam.zstd`
|
||||
- [X] XZ / LZMA `cramjam.xz`
|
||||
|
||||
|
||||
Experimental (Requires build from source enabling each feature):
|
||||
|
||||
- [X] Blosc2 `cramjam.experimental.blosc2`
|
||||
- [X] ISA-L backend _(only on 64-bit targets)_
|
||||
- [X] igzip `cramjam.experimental.igzip`
|
||||
- [X] ideflate `cramjam.experimental.ideflate`
|
||||
- [X] izlib `cramjam.experimental.izlib`
|
||||
|
||||
All available for use as:
|
||||
|
||||
```python
|
||||
>>> import cramjam
|
||||
>>> import numpy as np
|
||||
>>> compressed = cramjam.snappy.compress(b"bytes here")
|
||||
>>> decompressed = cramjam.snappy.decompress(compressed)
|
||||
>>> decompressed
|
||||
cramjam.Buffer(len=10) # an object which implements the buffer protocol
|
||||
>>> bytes(decompressed)
|
||||
b"bytes here"
|
||||
>>> np.frombuffer(decompressed, dtype=np.uint8)
|
||||
array([ 98, 121, 116, 101, 115, 32, 104, 101, 114, 101], dtype=uint8)
|
||||
```
|
||||
|
||||
Where the API is `cramjam.<compression-variant>.compress/decompress` and accepts
|
||||
`bytes`/`bytearray`/`numpy.array`/`cramjam.File`/`cramjam.Buffer` / `memoryview` objects.
|
||||
|
||||
**de/compress_into**
|
||||
Additionally, all variants support `decompress_into` and `compress_into`.
|
||||
Ex.
|
||||
```python
|
||||
>>> import numpy as np
|
||||
>>> from cramjam import snappy, Buffer
|
||||
>>>
|
||||
>>> data = np.frombuffer(b'some bytes here', dtype=np.uint8)
|
||||
>>> data
|
||||
array([115, 111, 109, 101, 32, 98, 121, 116, 101, 115, 32, 104, 101,
|
||||
114, 101], dtype=uint8)
|
||||
>>>
|
||||
>>> compressed = Buffer()
|
||||
>>> snappy.compress_into(data, compressed)
|
||||
33 # 33 bytes written to compressed buffer
|
||||
>>>
|
||||
>>> compressed.tell() # Where is the buffer position?
|
||||
33 # goodie!
|
||||
>>>
|
||||
>>> compressed.seek(0) # Go back to the start of the buffer so we can prepare to decompress
|
||||
>>> decompressed = b'0' * len(data) # let's write to `bytes` as output
|
||||
>>> decompressed
|
||||
b'000000000000000'
|
||||
>>>
|
||||
>>> snappy.decompress_into(compressed, decompressed)
|
||||
15 # 15 bytes written to decompressed
|
||||
>>> decompressed
|
||||
b'some bytes here'
|
||||
```
|
||||
|
||||
|
||||
[TypeScript](./cramjam-js/README.md):
|
||||
|
||||
|
||||
```typescript
|
||||
|
||||
import {Compress, Decompress} from 'cramjam';
|
||||
|
||||
const decoder = new TextDecoder();
|
||||
const encoder = new TextEncoder();
|
||||
|
||||
const str = 'hello, world';
|
||||
const encoded = encoder.encode(str);
|
||||
|
||||
const compressed = Compress.brotli(encoded);
|
||||
const decompressed = Decompress.brotli(compressed);
|
||||
|
||||
const decoded = decoder.decode(decompressed);
|
||||
|
||||
```
|
||||
|
||||
@@ -0,0 +1,8 @@
|
||||
cramjam-2.11.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||
cramjam-2.11.0.dist-info/METADATA,sha256=oSNVpO84VB1eD49Aq_h106e1P0mcosr8QttntPRM3Wc,5561
|
||||
cramjam-2.11.0.dist-info/RECORD,,
|
||||
cramjam-2.11.0.dist-info/WHEEL,sha256=3AYdCBKhqgyMewMRMJE9Rm8WU27aRCol69bTswjLWkc,129
|
||||
cramjam-2.11.0.dist-info/licenses/LICENSE,sha256=tPiCv8gNJ6NzJ4m9iDqMHTQuj_6n9iKnGz397YNCBf8,1070
|
||||
cramjam/__init__.py,sha256=VjCBHPElEPhUkzw5PBYIbA1ZmQIbYLoCrhyBx8LIaHs,111
|
||||
cramjam/__pycache__/__init__.cpython-311.pyc,,
|
||||
cramjam/cramjam.cpython-311-x86_64-linux-gnu.so,sha256=mu9rdrte8kCWFdFmK1QC65SuxuDYGdGPQRUv6KC0GGQ,4579016
|
||||
@@ -0,0 +1,4 @@
|
||||
Wheel-Version: 1.0
|
||||
Generator: maturin (1.9.1)
|
||||
Root-Is-Purelib: false
|
||||
Tag: cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64
|
||||
+21
@@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2020 Miles Granger
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
@@ -0,0 +1,5 @@
|
||||
from .cramjam import *
|
||||
|
||||
__doc__ = cramjam.__doc__
|
||||
if hasattr(cramjam, "__all__"):
|
||||
__all__ = cramjam.__all__
|
||||
BIN
Binary file not shown.
+1
@@ -0,0 +1 @@
|
||||
pip
|
||||
+317
@@ -0,0 +1,317 @@
|
||||
Metadata-Version: 2.4
|
||||
Name: curl_cffi
|
||||
Version: 0.13.0
|
||||
Summary: libcurl ffi bindings for Python, with impersonation support.
|
||||
Author-email: lexiforest <infinitesheldon@gmail.com>
|
||||
License: MIT License
|
||||
Project-URL: repository, https://github.com/lexiforest/curl_cffi
|
||||
Classifier: Development Status :: 4 - Beta
|
||||
Classifier: Intended Audience :: Developers
|
||||
Classifier: Programming Language :: Python :: 3
|
||||
Classifier: Programming Language :: Python :: 3.9
|
||||
Classifier: Programming Language :: Python :: 3.10
|
||||
Classifier: Programming Language :: Python :: 3.11
|
||||
Classifier: Programming Language :: Python :: 3.12
|
||||
Classifier: Programming Language :: Python :: 3.13
|
||||
Requires-Python: >=3.9
|
||||
Description-Content-Type: text/markdown
|
||||
License-File: LICENSE
|
||||
Requires-Dist: cffi>=1.12.0
|
||||
Requires-Dist: certifi>=2024.2.2
|
||||
Provides-Extra: extra
|
||||
Requires-Dist: readability-lxml>=0.8.1; extra == "extra"
|
||||
Requires-Dist: markdownify>=1.1.0; extra == "extra"
|
||||
Requires-Dist: lxml_html_clean; extra == "extra"
|
||||
Provides-Extra: dev
|
||||
Requires-Dist: charset_normalizer<4.0,>=3.3.2; extra == "dev"
|
||||
Requires-Dist: coverage<7.0,>=6.4.1; extra == "dev"
|
||||
Requires-Dist: cryptography<43.0,>=42.0.5; extra == "dev"
|
||||
Requires-Dist: httpx==0.23.1; extra == "dev"
|
||||
Requires-Dist: mypy<2.0,>=1.9.0; extra == "dev"
|
||||
Requires-Dist: pytest<9.0,>=8.1.1; extra == "dev"
|
||||
Requires-Dist: pytest-asyncio<1.0,>=0.23.6; extra == "dev"
|
||||
Requires-Dist: pytest-trio<1.0,>=0.8.0; extra == "dev"
|
||||
Requires-Dist: ruff<1.0,>=0.3.5; extra == "dev"
|
||||
Requires-Dist: trio<1.0,>=0.25.0; extra == "dev"
|
||||
Requires-Dist: trustme<2.0,>=1.1.0; extra == "dev"
|
||||
Requires-Dist: uvicorn<1.0,>=0.29.0; extra == "dev"
|
||||
Requires-Dist: websockets<13.0,>=12.0; extra == "dev"
|
||||
Requires-Dist: typing_extensions; extra == "dev"
|
||||
Provides-Extra: build
|
||||
Requires-Dist: cibuildwheel; extra == "build"
|
||||
Requires-Dist: wheel; extra == "build"
|
||||
Provides-Extra: test
|
||||
Requires-Dist: charset_normalizer<4.0,>=3.3.2; extra == "test"
|
||||
Requires-Dist: cryptography<43.0,>=42.0.5; extra == "test"
|
||||
Requires-Dist: fastapi<1.0,==0.110.0; extra == "test"
|
||||
Requires-Dist: httpx==0.23.1; extra == "test"
|
||||
Requires-Dist: proxy.py<3.0,>=2.4.3; extra == "test"
|
||||
Requires-Dist: pytest<9.0,>=8.1.1; extra == "test"
|
||||
Requires-Dist: pytest-asyncio<1.0,>=0.23.6; extra == "test"
|
||||
Requires-Dist: pytest-trio<1.0,>=0.8.0; extra == "test"
|
||||
Requires-Dist: python-multipart<1.0,>=0.0.9; extra == "test"
|
||||
Requires-Dist: trio<1.0,>=0.25.0; extra == "test"
|
||||
Requires-Dist: trustme<2.0,>=1.1.0; extra == "test"
|
||||
Requires-Dist: uvicorn<1.0,>=0.29.0; extra == "test"
|
||||
Requires-Dist: websockets<13.0,>=12.0; extra == "test"
|
||||
Requires-Dist: typing_extensions; extra == "test"
|
||||
Dynamic: license-file
|
||||
|
||||
# curl_cffi
|
||||
|
||||

|
||||

|
||||
[](https://badge.fury.io/py/curl-cffi)
|
||||
[](https://t.me/+lL9n33eZp480MGM1)
|
||||
[](https://discord.gg/kJqMHHgdn2)
|
||||
|
||||
[Documentation](https://curl-cffi.readthedocs.io)
|
||||
|
||||
Python binding for [curl-impersonate fork](https://github.com/lexiforest/curl-impersonate)
|
||||
via [cffi](https://cffi.readthedocs.io/en/latest/). For commercial support, visit [impersonate.pro](https://impersonate.pro).
|
||||
|
||||
`curl_cffi` is the most popular Python binding for `curl`. Unlike other pure
|
||||
python http clients like `httpx` or `requests`, `curl_cffi` can impersonate
|
||||
browsers' TLS/JA3 and HTTP/2 fingerprints. If you are blocked by some
|
||||
website for no obvious reason, you can give `curl_cffi` a try.
|
||||
|
||||
Python 3.9 is the minimum supported version since v0.10.
|
||||
|
||||
## Sponsors
|
||||
|
||||
Maintenance of this project is made possible by all the <a href="https://github.com/lexiforest/curl_cffi/graphs/contributors">contributors</a> and <a href="https://github.com/sponsors/lexiforest">sponsors</a>. If you'd like to sponsor this project and have your avatar or company logo appear below <a href="https://github.com/sponsors/lexiforest">click here</a>. 💖
|
||||
|
||||
------
|
||||
|
||||
### Bypass Cloudflare with API
|
||||
|
||||
<a href="https://yescaptcha.com/i/stfnIO" target="_blank"><img src="https://raw.githubusercontent.com/lexiforest/curl_cffi/main/assets/yescaptcha.png" alt="Yes Captcha!" height="47" width="149"></a>
|
||||
|
||||
Yescaptcha is a proxy service that bypasses Cloudflare and uses the API interface to
|
||||
obtain verified cookies (e.g. `cf_clearance`). Click [here](https://yescaptcha.com/i/stfnIO)
|
||||
to register: https://yescaptcha.com/i/stfnIO
|
||||
|
||||
------
|
||||
|
||||
## Features
|
||||
|
||||
- Supports JA3/TLS and http2 fingerprints impersonation, including recent browsers and custom fingerprints.
|
||||
- Much faster than requests/httpx, on par with aiohttp/pycurl, see [benchmarks](https://github.com/lexiforest/curl_cffi/tree/main/benchmark).
|
||||
- Mimics the requests API, no need to learn another one.
|
||||
- Pre-compiled, so you don't have to compile on your machine.
|
||||
- Supports `asyncio` with proxy rotation on each request.
|
||||
- Supports http 2.0 & 3.0, which requests does not.
|
||||
- Supports websocket.
|
||||
- MIT licensed.
|
||||
|
||||
||requests|aiohttp|httpx|pycurl|curl_cffi|
|
||||
|---|---|---|---|---|---|
|
||||
|http/2|❌|❌|✅|✅|✅|
|
||||
|http/3|❌|❌|❌|☑️<sup>1</sup>|✅<sup>2</sup>|
|
||||
|sync|✅|❌|✅|✅|✅|
|
||||
|async|❌|✅|✅|❌|✅|
|
||||
|websocket|❌|✅|❌|❌|✅|
|
||||
|fingerprints|❌|❌|❌|❌|✅|
|
||||
|speed|🐇|🐇🐇|🐇|🐇🐇|🐇🐇|
|
||||
|
||||
Notes:
|
||||
1. For pycurl, you need an http/3 enabled libcurl to make it work, while curl_cffi packages libcurl-impersonate inside Python wheels.
|
||||
2. Since v0.11.4.
|
||||
|
||||
## Install
|
||||
|
||||
pip install curl_cffi --upgrade
|
||||
|
||||
This should work on Linux, macOS and Windows out of the box.
|
||||
If it does not work on you platform, you may need to compile and install `curl-impersonate`
|
||||
first and set some environment variables like `LD_LIBRARY_PATH`.
|
||||
|
||||
To install beta releases:
|
||||
|
||||
pip install curl_cffi --upgrade --pre
|
||||
|
||||
To install unstable version from GitHub:
|
||||
|
||||
git clone https://github.com/lexiforest/curl_cffi/
|
||||
cd curl_cffi
|
||||
make preprocess
|
||||
pip install .
|
||||
|
||||
On macOS, you may need to install the following dependencies:
|
||||
|
||||
brew install zstd nghttp2
|
||||
|
||||
## Usage
|
||||
|
||||
`curl_cffi` comes with a low-level `curl` API and a high-level `requests`-like API.
|
||||
|
||||
### requests-like
|
||||
|
||||
|
||||
```python
|
||||
import curl_cffi
|
||||
|
||||
# Notice the impersonate parameter
|
||||
r = curl_cffi.get("https://tls.browserleaks.com/json", impersonate="chrome")
|
||||
|
||||
print(r.json())
|
||||
# output: {..., "ja3n_hash": "aa56c057ad164ec4fdcb7a5a283be9fc", ...}
|
||||
# the js3n fingerprint should be the same as target browser
|
||||
|
||||
# To keep using the latest browser version as `curl_cffi` updates,
|
||||
# simply set impersonate="chrome" without specifying a version.
|
||||
# Other similar values are: "safari" and "safari_ios"
|
||||
r = curl_cffi.get("https://tls.browserleaks.com/json", impersonate="chrome")
|
||||
|
||||
# Randomly choose a browser version based on current market share in real world
|
||||
# from: https://caniuse.com/usage-table
|
||||
# NOTE: this is a pro feature.
|
||||
r = curl_cffi.get("https://example.com", impersonate="realworld")
|
||||
|
||||
# To pin a specific version, use version numbers together.
|
||||
r = curl_cffi.get("https://tls.browserleaks.com/json", impersonate="chrome124")
|
||||
|
||||
# To impersonate other than browsers, bring your own ja3/akamai strings
|
||||
# See examples directory for details.
|
||||
r = curl_cffi.get("https://tls.browserleaks.com/json", ja3=..., akamai=...)
|
||||
|
||||
# http/socks proxies are supported
|
||||
proxies = {"https": "http://localhost:3128"}
|
||||
r = curl_cffi.get("https://tls.browserleaks.com/json", impersonate="chrome", proxies=proxies)
|
||||
|
||||
proxies = {"https": "socks://localhost:3128"}
|
||||
r = curl_cffi.get("https://tls.browserleaks.com/json", impersonate="chrome", proxies=proxies)
|
||||
```
|
||||
|
||||
### Sessions
|
||||
|
||||
```python
|
||||
s = curl_cffi.Session()
|
||||
|
||||
# httpbin is a http test website, this endpoint makes the server set cookies
|
||||
s.get("https://httpbin.org/cookies/set/foo/bar")
|
||||
print(s.cookies)
|
||||
# <Cookies[<Cookie foo=bar for httpbin.org />]>
|
||||
|
||||
# retrieve cookies again to verify
|
||||
r = s.get("https://httpbin.org/cookies")
|
||||
print(r.json())
|
||||
# {'cookies': {'foo': 'bar'}}
|
||||
```
|
||||
|
||||
### Supported impersonate browsers
|
||||
|
||||
`curl_cffi` supports the same browser versions as supported by my [fork](https://github.com/lexiforest/curl-impersonate) of [curl-impersonate](https://github.com/lwthiker/curl-impersonate):
|
||||
|
||||
Open source version of curl_cffi includes versions whose fingerprints differ from previous versions.
|
||||
If you see a version, e.g. `chrome135`, were skipped, you can simply impersonate it with your own headers and the previous version.
|
||||
|
||||
If you don't want to look up the headers etc, by yourself, consider buying commercial support from [impersonate.pro](https://impersonate.pro),
|
||||
we have comprehensive browser fingerprints database for almost all the browser versions on various platforms.
|
||||
|
||||
If you are trying to impersonate a target other than a browser, use `ja3=...` and `akamai=...`
|
||||
to specify your own customized fingerprints. See the [docs on impersonation](https://curl-cffi.readthedocs.io/en/latest/impersonate.html) for details.
|
||||
|
||||
|Browser|Open Source| Pro version|
|
||||
|---|---|---|
|
||||
|Chrome|chrome99, chrome100, chrome101, chrome104, chrome107, chrome110, chrome116<sup>[1]</sup>, chrome119<sup>[1]</sup>, chrome120<sup>[1]</sup>, chrome123<sup>[3]</sup>, chrome124<sup>[3]</sup>, chrome131<sup>[4]</sup>, chrome133a<sup>[5][6]</sup>, chrome136<sup>[6]</sup>|chrome132, chrome134, chrome135|
|
||||
|Chrome Android| chrome99_android, chrome131_android <sup>[4]</sup>|chrome132_android, chrome133_android, chrome134_android, chrome135_android|
|
||||
|Chrome iOS|N/A|coming soon|
|
||||
|Safari <sup>[7]</sup>|safari153 <sup>[2]</sup>, safari155 <sup>[2]</sup>, safari170 <sup>[1]</sup>, safari180 <sup>[4]</sup>, safari184 <sup>[6]</sup>, safari260 <sup>[8]</sup>|coming soon|
|
||||
|Safari iOS <sup>[7]</sup>| safari172_ios<sup>[1]</sup>, safari180_ios<sup>[4]</sup>, safari184_ios <sup>[6]</sup>, safari260_ios <sup>[8]</sup>|coming soon|
|
||||
|Firefox|firefox133<sup>[5]</sup>, firefox135<sup>[7]</sup>|coming soon|
|
||||
|Firefox Android|N/A|firefox135_android|
|
||||
|Tor|tor145 <sup>[7]</sup>|coming soon|
|
||||
|Edge|edge99, edge101|edge133, edge135|
|
||||
|Opera|N/A|coming soon|
|
||||
|Brave|N/A|coming soon|
|
||||
|
||||
|
||||
Notes:
|
||||
1. Added in version `0.6.0`.
|
||||
2. Fixed in version `0.6.0`, previous http2 fingerprints were [not correct](https://github.com/lwthiker/curl-impersonate/issues/215).
|
||||
3. Added in version `0.7.0`.
|
||||
4. Added in version `0.8.0`.
|
||||
5. Added in version `0.9.0`.
|
||||
6. The version postfix `-a`(e.g. `chrome133a`) means that this is an alternative version, i.e. the fingerprint has not been officially updated by browser, but has been observed because of A/B testing.
|
||||
5. Added in version `0.10.0`.
|
||||
6. Added in version `0.11.0`.
|
||||
7. Since `0.11.0`, the format `safari184_ios` is preferred over `safari18_4_ios`, both are supported, but the latter is quite confusing and hard to parse.
|
||||
8. Added in `0.12.0`.
|
||||
|
||||
### Asyncio
|
||||
|
||||
```python
|
||||
from curl_cffi import AsyncSession
|
||||
|
||||
async with AsyncSession() as s:
|
||||
r = await s.get("https://example.com")
|
||||
```
|
||||
|
||||
More concurrency:
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from curl_cffi import AsyncSession
|
||||
|
||||
urls = [
|
||||
"https://google.com/",
|
||||
"https://facebook.com/",
|
||||
"https://twitter.com/",
|
||||
]
|
||||
|
||||
async with AsyncSession() as s:
|
||||
tasks = []
|
||||
for url in urls:
|
||||
task = s.get(url)
|
||||
tasks.append(task)
|
||||
results = await asyncio.gather(*tasks)
|
||||
```
|
||||
|
||||
For low-level APIs, Scrapy integration and other advanced topics, see the
|
||||
[docs](https://curl-cffi.readthedocs.io) for more details.
|
||||
|
||||
|
||||
### WebSockets
|
||||
|
||||
```python
|
||||
from curl_cffi import WebSocket
|
||||
|
||||
def on_message(ws: WebSocket, message: str | bytes):
|
||||
print(message)
|
||||
|
||||
ws = WebSocket(on_message=on_message)
|
||||
ws.run_forever("wss://api.gemini.com/v1/marketdata/BTCUSD")
|
||||
```
|
||||
|
||||
### Asyncio WebSockets
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from curl_cffi import AsyncSession
|
||||
|
||||
async with AsyncSession() as s:
|
||||
ws = await s.ws_connect("wss://echo.websocket.org")
|
||||
await asyncio.gather(*[ws.send_str("Hello, World!") for _ in range(10)])
|
||||
async for message in ws:
|
||||
print(message)
|
||||
```
|
||||
|
||||
## Ecosystem
|
||||
|
||||
- Integrating with Scrapy: [divtiply/scrapy-curl-cffi](https://github.com/divtiply/scrapy-curl-cffi), [jxlil/scrapy-impersonate](https://github.com/jxlil/scrapy-impersonate) and [tieyongjie/scrapy-fingerprint](https://github.com/tieyongjie/scrapy-fingerprint).
|
||||
- Integrating with [requests](https://github.com/el1s7/curl-adapter), [httpx](https://github.com/vgavro/httpx-curl-cffi) as adapter.
|
||||
- Integrating with captcha resolvers: [CapSolver](https://docs.capsolver.com/en/api/), [YesCaptcha](https://yescaptcha.atlassian.net/wiki/spaces/YESCAPTCHA/overview). Please see the head area for promo code and link.
|
||||
|
||||
## Acknowledgement
|
||||
|
||||
- Originally forked from [multippt/python_curl_cffi](https://github.com/multippt/python_curl_cffi), which is under the MIT license.
|
||||
- Headers/Cookies files are copied from [httpx](https://github.com/encode/httpx/blob/master/httpx/_models.py), which is under the BSD license.
|
||||
- Asyncio support is inspired by Tornado's curl http client.
|
||||
- The synchronous WebSocket API is inspired by [websocket_client](https://github.com/websocket-client/websocket-client).
|
||||
- The asynchronous WebSocket API is inspired by [aiohttp](https://github.com/aio-libs/aiohttp).
|
||||
|
||||
## Contributing
|
||||
|
||||
When submitting an PR, please use a different branch other than `main` and check the
|
||||
"Allow edits by maintainers" box, so I can update your PR with lint or style fixes. Thanks!
|
||||
+42
@@ -0,0 +1,42 @@
|
||||
curl_cffi-0.13.0.dist-info/INSTALLER,sha256=zuuue4knoyJ-UwPPXg8fezS7VCrXJQrAP7zeNuwvFQg,4
|
||||
curl_cffi-0.13.0.dist-info/METADATA,sha256=_JmSyrZadXm2qQeCRaLqgihkWpbUtYk9MkWEwrTm3lQ,13424
|
||||
curl_cffi-0.13.0.dist-info/RECORD,,
|
||||
curl_cffi-0.13.0.dist-info/WHEEL,sha256=FUx1nSbn7F5iTJP3Igdf8PlJvn2cCxV7V9DsqzIsUog,147
|
||||
curl_cffi-0.13.0.dist-info/licenses/LICENSE,sha256=PoiwKbULav021rGGQs5Mi27uTJA_HPq-9bgR9h4HBQs,1106
|
||||
curl_cffi-0.13.0.dist-info/top_level.txt,sha256=b51YB50I_vu6XAbSERmqtgaYciYADCA_baVoZ_T5Lzs,10
|
||||
curl_cffi/__init__.py,sha256=_3js_vIETSp3e-rMyy2wPflnodRa9PCeP4txZgizsV0,1689
|
||||
curl_cffi/__pycache__/__init__.cpython-311.pyc,,
|
||||
curl_cffi/__pycache__/__version__.cpython-311.pyc,,
|
||||
curl_cffi/__pycache__/_asyncio_selector.cpython-311.pyc,,
|
||||
curl_cffi/__pycache__/aio.cpython-311.pyc,,
|
||||
curl_cffi/__pycache__/const.cpython-311.pyc,,
|
||||
curl_cffi/__pycache__/curl.cpython-311.pyc,,
|
||||
curl_cffi/__pycache__/utils.cpython-311.pyc,,
|
||||
curl_cffi/__version__.py,sha256=V5JrAqGGOx4CN0E7qt-KOSZ1KbxGTS7yp5654MhTrss,229
|
||||
curl_cffi/_asyncio_selector.py,sha256=XHNkdHeWDsPvLvSpg1wpL4gU3PgYVTV96o95vKBe80w,13020
|
||||
curl_cffi/_wrapper.abi3.so,sha256=KYDT8aCALfdn1TxCmSzREwvHiF3SaWM6I9YgV-z4K4c,22874672
|
||||
curl_cffi/aio.py,sha256=oTiffUFDNfxxSHmCVAsjX0CIOs4IuBQ6mhm4WuY5KZs,11925
|
||||
curl_cffi/const.py,sha256=d5YuvTCTPLYfXwKDz5PuROwg-qxfJ7nFMKMiJtXUfP8,18026
|
||||
curl_cffi/curl.py,sha256=PFWh4PFYoHQDSrH7FHToquadkxOkXe-l41AmNW1tP3c,21147
|
||||
curl_cffi/py.typed,sha256=dcrsqJrcYfTX-ckLFJMTaj6mD8aDe2u0tkQG-ZYxnEg,26
|
||||
curl_cffi/requests/__init__.py,sha256=UK77C7FgnKEcMrPzdHMw0XfDA8zyoirT4fAAPzwpB3E,5941
|
||||
curl_cffi/requests/__pycache__/__init__.cpython-311.pyc,,
|
||||
curl_cffi/requests/__pycache__/cookies.cpython-311.pyc,,
|
||||
curl_cffi/requests/__pycache__/errors.cpython-311.pyc,,
|
||||
curl_cffi/requests/__pycache__/exceptions.cpython-311.pyc,,
|
||||
curl_cffi/requests/__pycache__/headers.cpython-311.pyc,,
|
||||
curl_cffi/requests/__pycache__/impersonate.cpython-311.pyc,,
|
||||
curl_cffi/requests/__pycache__/models.cpython-311.pyc,,
|
||||
curl_cffi/requests/__pycache__/session.cpython-311.pyc,,
|
||||
curl_cffi/requests/__pycache__/utils.cpython-311.pyc,,
|
||||
curl_cffi/requests/__pycache__/websockets.cpython-311.pyc,,
|
||||
curl_cffi/requests/cookies.py,sha256=QDEuhtsSjh6iNAKmp5TnGyyAe8wdDJCADCHY_GWCeCc,11867
|
||||
curl_cffi/requests/errors.py,sha256=R6N5lmOTdRukThkNGUihDAQRu8HSh27M8E3zfUJJX74,250
|
||||
curl_cffi/requests/exceptions.py,sha256=ViyLx3XHii_s7kjrO3GhVOVXhq2_UsYfAQl8MPwDnEM,6187
|
||||
curl_cffi/requests/headers.py,sha256=A2w20i_JbmmIVQpq5BvWW9rLGm-zc8AgWIjSp5BN0vo,11496
|
||||
curl_cffi/requests/impersonate.py,sha256=ZNwOyA6JaoMfLpRZwr4UXI4BYlh2Hp8si0n3zDrFsew,12774
|
||||
curl_cffi/requests/models.py,sha256=0Hoq1VwlxyIA6bft2O-gEf-NT9qKnCoUhAJAipMMgYM,10359
|
||||
curl_cffi/requests/session.py,sha256=WYklgQ2wxNIWEThpvyZN1H8ZvOiFs7gCKqBDtg9uW48,42652
|
||||
curl_cffi/requests/utils.py,sha256=g2GCbYIX2ewe8npwBqvKWEklw2m0jq9biWqiqdH7hAY,24731
|
||||
curl_cffi/requests/websockets.py,sha256=siZ0G-URxuEEHRb_yHx9qSK_S7F5HH0fRwZ3IIy0giY,28596
|
||||
curl_cffi/utils.py,sha256=gRVzO-vhjf596V6kr_SjwHlwJfDIwTrPbRLJvvNlUNE,307
|
||||
+6
@@ -0,0 +1,6 @@
|
||||
Wheel-Version: 1.0
|
||||
Generator: setuptools (80.9.0)
|
||||
Root-Is-Purelib: false
|
||||
Tag: cp39-abi3-manylinux_2_17_x86_64
|
||||
Tag: cp39-abi3-manylinux2014_x86_64
|
||||
|
||||
+23
@@ -0,0 +1,23 @@
|
||||
MIT License
|
||||
|
||||
|
||||
Copyright (c) 2018 multippt
|
||||
Copyright (c) 2022 curl_cffi developers
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user