Import python venv for stability
This commit is contained in:
@@ -0,0 +1,544 @@
|
||||
import functools
|
||||
from functools import lru_cache
|
||||
import socket
|
||||
import time as _time
|
||||
|
||||
from curl_cffi import requests
|
||||
from urllib.parse import urlsplit, urljoin
|
||||
from bs4 import BeautifulSoup
|
||||
import datetime
|
||||
|
||||
from frozendict import frozendict
|
||||
|
||||
from . import utils, cache
|
||||
from .config import YfConfig
|
||||
import threading
|
||||
|
||||
from .exceptions import YFException, YFDataException, YFRateLimitError
|
||||
|
||||
|
||||
def _is_transient_error(exception):
|
||||
"""Check if error is transient (network/timeout) and should be retried."""
|
||||
if isinstance(exception, (TimeoutError, socket.error, OSError)):
|
||||
return True
|
||||
error_type_name = type(exception).__name__
|
||||
transient_error_types = {
|
||||
'Timeout', 'TimeoutError', 'ConnectionError', 'ConnectTimeout',
|
||||
'ReadTimeout', 'ChunkedEncodingError', 'RemoteDisconnected',
|
||||
}
|
||||
return error_type_name in transient_error_types
|
||||
|
||||
cache_maxsize = 64
|
||||
|
||||
|
||||
def lru_cache_freezeargs(func):
|
||||
"""
|
||||
Decorator transforms mutable dictionary and list arguments into immutable types
|
||||
Needed so lru_cache can cache method calls what has dict or list arguments.
|
||||
"""
|
||||
|
||||
@functools.wraps(func)
|
||||
def wrapped(*args, **kwargs):
|
||||
args = tuple([frozendict(arg) if isinstance(arg, dict) else arg for arg in args])
|
||||
kwargs = {k: frozendict(v) if isinstance(v, dict) else v for k, v in kwargs.items()}
|
||||
args = tuple([tuple(arg) if isinstance(arg, list) else arg for arg in args])
|
||||
kwargs = {k: tuple(v) if isinstance(v, list) else v for k, v in kwargs.items()}
|
||||
return func(*args, **kwargs)
|
||||
|
||||
# copy over the lru_cache extra methods to this wrapper to be able to access them
|
||||
# after this decorator has been applied
|
||||
wrapped.cache_info = func.cache_info
|
||||
wrapped.cache_clear = func.cache_clear
|
||||
return wrapped
|
||||
|
||||
|
||||
class SingletonMeta(type):
|
||||
"""
|
||||
Metaclass that creates a Singleton instance.
|
||||
"""
|
||||
_instances = {}
|
||||
_lock = threading.Lock()
|
||||
|
||||
def __call__(cls, *args, **kwargs):
|
||||
with cls._lock:
|
||||
if cls not in cls._instances:
|
||||
instance = super().__call__(*args, **kwargs)
|
||||
cls._instances[cls] = instance
|
||||
else:
|
||||
# Update the existing instance
|
||||
if 'session' in kwargs or (args and len(args) > 0):
|
||||
session = kwargs.get('session') if 'session' in kwargs else args[0]
|
||||
cls._instances[cls]._set_session(session)
|
||||
return cls._instances[cls]
|
||||
|
||||
|
||||
class YfData(metaclass=SingletonMeta):
|
||||
"""
|
||||
Have one place to retrieve data from Yahoo API in order to ease caching and speed up operations.
|
||||
Singleton means one session one cookie shared by all threads.
|
||||
"""
|
||||
|
||||
def __init__(self, session=None):
|
||||
self._crumb = None
|
||||
self._cookie = None
|
||||
|
||||
# Default to using 'basic' strategy
|
||||
self._cookie_strategy = 'basic'
|
||||
# If it fails, then fallback method is 'csrf'
|
||||
# self._cookie_strategy = 'csrf'
|
||||
|
||||
self._cookie_lock = threading.Lock()
|
||||
|
||||
self._session = None
|
||||
self._set_session(session or requests.Session(impersonate="chrome"))
|
||||
|
||||
def _set_session(self, session):
|
||||
if session is None:
|
||||
return
|
||||
|
||||
try:
|
||||
session.cache
|
||||
except AttributeError:
|
||||
# Not caching
|
||||
self._session_is_caching = False
|
||||
else:
|
||||
# Is caching. This is annoying.
|
||||
# Can't simply use a non-caching session to fetch cookie & crumb,
|
||||
# because then the caching-session won't have cookie.
|
||||
self._session_is_caching = True
|
||||
# But since switch to curl_cffi, can't use requests_cache with it.
|
||||
raise YFDataException("request_cache sessions don't work with curl_cffi, which is necessary now for Yahoo API. Solution: stop setting session, let YF handle.")
|
||||
|
||||
if not isinstance(session, requests.session.Session):
|
||||
raise YFDataException(f"Yahoo API requires curl_cffi session not {type(session)}. Solution: stop setting session, let YF handle.")
|
||||
|
||||
with self._cookie_lock:
|
||||
self._session = session
|
||||
if YfConfig.network.proxy is not None:
|
||||
self._session.proxies = YfConfig.network.proxy
|
||||
|
||||
def _set_cookie_strategy(self, strategy, have_lock=False):
|
||||
if strategy == self._cookie_strategy:
|
||||
return
|
||||
if not have_lock:
|
||||
self._cookie_lock.acquire()
|
||||
|
||||
try:
|
||||
if self._cookie_strategy == 'csrf':
|
||||
utils.get_yf_logger().debug(f'toggling cookie strategy {self._cookie_strategy} -> basic')
|
||||
self._session.cookies.clear()
|
||||
self._cookie_strategy = 'basic'
|
||||
else:
|
||||
utils.get_yf_logger().debug(f'toggling cookie strategy {self._cookie_strategy} -> csrf')
|
||||
self._cookie_strategy = 'csrf'
|
||||
self._cookie = None
|
||||
self._crumb = None
|
||||
except Exception:
|
||||
self._cookie_lock.release()
|
||||
raise
|
||||
|
||||
if not have_lock:
|
||||
self._cookie_lock.release()
|
||||
|
||||
@utils.log_indent_decorator
|
||||
def _save_cookie_curlCffi(self):
|
||||
if self._session is None:
|
||||
return False
|
||||
cookies = self._session.cookies.jar._cookies
|
||||
if len(cookies) == 0:
|
||||
return False
|
||||
yh_domains = [k for k in cookies.keys() if 'yahoo' in k]
|
||||
if len(yh_domains) > 1:
|
||||
# Possible when cookie fetched with CSRF method. Discard consent cookie.
|
||||
yh_domains = [k for k in yh_domains if 'consent' not in k]
|
||||
if len(yh_domains) > 1:
|
||||
utils.get_yf_logger().debug(f'Multiple Yahoo cookies, not sure which to cache: {yh_domains}')
|
||||
return False
|
||||
if len(yh_domains) == 0:
|
||||
return False
|
||||
yh_domain = yh_domains[0]
|
||||
yh_cookie = {yh_domain: cookies[yh_domain]}
|
||||
cache.get_cookie_cache().store('curlCffi', yh_cookie)
|
||||
return True
|
||||
|
||||
@utils.log_indent_decorator
|
||||
def _load_cookie_curlCffi(self):
|
||||
if self._session is None:
|
||||
return False
|
||||
cookie_dict = cache.get_cookie_cache().lookup('curlCffi')
|
||||
if cookie_dict is None or len(cookie_dict) == 0:
|
||||
return False
|
||||
cookies = cookie_dict['cookie']
|
||||
domain = list(cookies.keys())[0]
|
||||
cookie = cookies[domain]['/']['A3']
|
||||
expiry_ts = cookie.expires
|
||||
if expiry_ts > 2e9:
|
||||
# convert ms to s
|
||||
expiry_ts //= 1e3
|
||||
expiry_dt = datetime.datetime.fromtimestamp(expiry_ts, tz=datetime.timezone.utc)
|
||||
expired = expiry_dt < datetime.datetime.now(datetime.timezone.utc)
|
||||
if expired:
|
||||
utils.get_yf_logger().debug('cached cookie expired')
|
||||
return False
|
||||
self._session.cookies.jar._cookies.update(cookies)
|
||||
self._cookie = cookie
|
||||
return True
|
||||
|
||||
@utils.log_indent_decorator
|
||||
def _get_cookie_basic(self, timeout=30):
|
||||
if self._cookie is not None:
|
||||
utils.get_yf_logger().debug('reusing cookie')
|
||||
return True
|
||||
elif self._load_cookie_curlCffi():
|
||||
utils.get_yf_logger().debug('reusing persistent cookie')
|
||||
return True
|
||||
|
||||
# To avoid infinite recursion, do NOT use self.get()
|
||||
# - 'allow_redirects' copied from @psychoz971 solution - does it help USA?
|
||||
try:
|
||||
self._session.get(
|
||||
url='https://fc.yahoo.com',
|
||||
timeout=timeout,
|
||||
allow_redirects=True)
|
||||
except requests.exceptions.DNSError as e:
|
||||
# Possible because url on some privacy/ad blocklists.
|
||||
# Can ignore because have second strategy.
|
||||
utils.get_yf_logger().debug("Handling DNS error on cookie fetch: " + str(e))
|
||||
return False
|
||||
self._save_cookie_curlCffi()
|
||||
return True
|
||||
|
||||
@utils.log_indent_decorator
|
||||
def _get_crumb_basic(self, timeout=30):
|
||||
if self._crumb is not None:
|
||||
utils.get_yf_logger().debug('reusing crumb')
|
||||
return self._crumb
|
||||
|
||||
if not self._get_cookie_basic():
|
||||
return None
|
||||
# - 'allow_redirects' copied from @psychoz971 solution - does it help USA?
|
||||
get_args = {
|
||||
'url': "https://query1.finance.yahoo.com/v1/test/getcrumb",
|
||||
'timeout': timeout,
|
||||
'allow_redirects': True
|
||||
}
|
||||
if self._session_is_caching:
|
||||
get_args['expire_after'] = self._expire_after
|
||||
crumb_response = self._session.get(**get_args)
|
||||
else:
|
||||
crumb_response = self._session.get(**get_args)
|
||||
self._crumb = crumb_response.text
|
||||
if crumb_response.status_code == 429 or "Too Many Requests" in self._crumb:
|
||||
utils.get_yf_logger().debug(f"Didn't receive crumb {self._crumb}")
|
||||
raise YFRateLimitError()
|
||||
|
||||
if self._crumb is None or '<html>' in self._crumb:
|
||||
utils.get_yf_logger().debug("Didn't receive crumb")
|
||||
return None
|
||||
|
||||
utils.get_yf_logger().debug(f"crumb = '{self._crumb}'")
|
||||
return self._crumb
|
||||
|
||||
@utils.log_indent_decorator
|
||||
def _get_cookie_and_crumb_basic(self, timeout):
|
||||
if not self._get_cookie_basic(timeout):
|
||||
return None
|
||||
return self._get_crumb_basic(timeout)
|
||||
|
||||
@utils.log_indent_decorator
|
||||
def _get_cookie_csrf(self, timeout):
|
||||
if self._cookie is not None:
|
||||
utils.get_yf_logger().debug('reusing cookie')
|
||||
return True
|
||||
|
||||
elif self._load_cookie_curlCffi():
|
||||
utils.get_yf_logger().debug('reusing persistent cookie')
|
||||
self._cookie = True
|
||||
return True
|
||||
|
||||
base_args = {
|
||||
'timeout': timeout}
|
||||
|
||||
get_args = {**base_args, 'url': 'https://guce.yahoo.com/consent'}
|
||||
try:
|
||||
if self._session_is_caching:
|
||||
get_args['expire_after'] = self._expire_after
|
||||
response = self._session.get(**get_args)
|
||||
else:
|
||||
response = self._session.get(**get_args)
|
||||
except requests.exceptions.ChunkedEncodingError:
|
||||
# No idea why happens, but handle nicely so can switch to other cookie method.
|
||||
utils.get_yf_logger().debug('_get_cookie_csrf() encountering requests.exceptions.ChunkedEncodingError, aborting')
|
||||
return False
|
||||
|
||||
soup = BeautifulSoup(response.content, 'html.parser')
|
||||
csrfTokenInput = soup.find('input', attrs={'name': 'csrfToken'})
|
||||
if csrfTokenInput is None:
|
||||
utils.get_yf_logger().debug('Failed to find "csrfToken" in response')
|
||||
return False
|
||||
csrfToken = csrfTokenInput['value']
|
||||
utils.get_yf_logger().debug(f'csrfToken = {csrfToken}')
|
||||
sessionIdInput = soup.find('input', attrs={'name': 'sessionId'})
|
||||
sessionId = sessionIdInput['value']
|
||||
utils.get_yf_logger().debug(f"sessionId='{sessionId}")
|
||||
|
||||
originalDoneUrl = 'https://finance.yahoo.com/'
|
||||
namespace = 'yahoo'
|
||||
data = {
|
||||
'agree': ['agree', 'agree'],
|
||||
'consentUUID': 'default',
|
||||
'sessionId': sessionId,
|
||||
'csrfToken': csrfToken,
|
||||
'originalDoneUrl': originalDoneUrl,
|
||||
'namespace': namespace,
|
||||
}
|
||||
post_args = {**base_args,
|
||||
'url': f'https://consent.yahoo.com/v2/collectConsent?sessionId={sessionId}',
|
||||
'data': data}
|
||||
get_args = {**base_args,
|
||||
'url': f'https://guce.yahoo.com/copyConsent?sessionId={sessionId}',
|
||||
'data': data}
|
||||
try:
|
||||
if self._session_is_caching:
|
||||
post_args['expire_after'] = self._expire_after
|
||||
get_args['expire_after'] = self._expire_after
|
||||
self._session.post(**post_args)
|
||||
self._session.get(**get_args)
|
||||
else:
|
||||
self._session.post(**post_args)
|
||||
self._session.get(**get_args)
|
||||
except requests.exceptions.ChunkedEncodingError:
|
||||
# No idea why happens, but handle nicely so can switch to other cookie method.
|
||||
utils.get_yf_logger().debug('_get_cookie_csrf() encountering requests.exceptions.ChunkedEncodingError, aborting')
|
||||
self._cookie = True
|
||||
self._save_cookie_curlCffi()
|
||||
return True
|
||||
|
||||
@utils.log_indent_decorator
|
||||
def _get_crumb_csrf(self, timeout=30):
|
||||
# Credit goes to @bot-unit #1729
|
||||
|
||||
if self._crumb is not None:
|
||||
utils.get_yf_logger().debug('reusing crumb')
|
||||
return self._crumb
|
||||
|
||||
if not self._get_cookie_csrf(timeout):
|
||||
# This cookie stored in session
|
||||
return None
|
||||
|
||||
get_args = {
|
||||
'url': 'https://query2.finance.yahoo.com/v1/test/getcrumb',
|
||||
'timeout': timeout}
|
||||
if self._session_is_caching:
|
||||
get_args['expire_after'] = self._expire_after
|
||||
r = self._session.get(**get_args)
|
||||
else:
|
||||
r = self._session.get(**get_args)
|
||||
self._crumb = r.text
|
||||
|
||||
if r.status_code == 429 or "Too Many Requests" in self._crumb:
|
||||
utils.get_yf_logger().debug(f"Didn't receive crumb {self._crumb}")
|
||||
raise YFRateLimitError()
|
||||
|
||||
if self._crumb is None or '<html>' in self._crumb or self._crumb == '':
|
||||
utils.get_yf_logger().debug("Didn't receive crumb")
|
||||
return None
|
||||
|
||||
utils.get_yf_logger().debug(f"crumb = '{self._crumb}'")
|
||||
return self._crumb
|
||||
|
||||
@utils.log_indent_decorator
|
||||
def _get_cookie_and_crumb(self, timeout=30):
|
||||
crumb, strategy = None, None
|
||||
|
||||
utils.get_yf_logger().debug(f"cookie_mode = '{self._cookie_strategy}'")
|
||||
|
||||
with self._cookie_lock:
|
||||
if self._cookie_strategy == 'csrf':
|
||||
crumb = self._get_crumb_csrf()
|
||||
if crumb is None:
|
||||
# Fail
|
||||
self._set_cookie_strategy('basic', have_lock=True)
|
||||
crumb = self._get_cookie_and_crumb_basic(timeout)
|
||||
else:
|
||||
# Fallback strategy
|
||||
crumb = self._get_cookie_and_crumb_basic(timeout)
|
||||
if crumb is None:
|
||||
# Fail
|
||||
self._set_cookie_strategy('csrf', have_lock=True)
|
||||
crumb = self._get_crumb_csrf()
|
||||
strategy = self._cookie_strategy
|
||||
return crumb, strategy
|
||||
|
||||
@utils.log_indent_decorator
|
||||
def get(self, url, params=None, timeout=30):
|
||||
response = self._make_request(url, request_method = self._session.get, params=params, timeout=timeout)
|
||||
|
||||
# Accept cookie-consent if redirected to consent page
|
||||
if not self._is_this_consent_url(response.url):
|
||||
# "Consent Page not detected"
|
||||
pass
|
||||
else:
|
||||
# "Consent Page detected"
|
||||
response = self._accept_consent_form(response, timeout)
|
||||
|
||||
return response
|
||||
|
||||
@utils.log_indent_decorator
|
||||
def post(self, url, body=None, params=None, timeout=30, data=None):
|
||||
return self._make_request(url, request_method = self._session.post, body=body, params=params, timeout=timeout, data=data)
|
||||
|
||||
@utils.log_indent_decorator
|
||||
def _make_request(self, url, request_method, body=None, params=None, timeout=30, data=None):
|
||||
# Important: treat input arguments as immutable.
|
||||
|
||||
if len(url) > 200:
|
||||
utils.get_yf_logger().debug(f'url={url[:200]}...')
|
||||
else:
|
||||
utils.get_yf_logger().debug(f'url={url}')
|
||||
utils.get_yf_logger().debug(f'params={params}')
|
||||
|
||||
# sync with config
|
||||
self._session.proxies = YfConfig.network.proxy
|
||||
|
||||
if params is None:
|
||||
params = {}
|
||||
if 'crumb' in params:
|
||||
raise YFException("Don't manually add 'crumb' to params dict, let data.py handle it")
|
||||
|
||||
crumb, strategy = self._get_cookie_and_crumb()
|
||||
if crumb is not None:
|
||||
crumbs = {'crumb': crumb}
|
||||
else:
|
||||
crumbs = {}
|
||||
|
||||
request_args = {
|
||||
'url': url,
|
||||
'params': {**params, **crumbs},
|
||||
'timeout': timeout
|
||||
}
|
||||
|
||||
if body:
|
||||
request_args['json'] = body
|
||||
|
||||
if data:
|
||||
request_args['data'] = data
|
||||
request_args['headers'] = {"Content-Type": "application/json"}
|
||||
|
||||
for attempt in range(YfConfig.network.retries + 1):
|
||||
try:
|
||||
response = request_method(**request_args)
|
||||
break
|
||||
except Exception as e:
|
||||
if _is_transient_error(e) and attempt < YfConfig.network.retries:
|
||||
_time.sleep(2 ** attempt)
|
||||
else:
|
||||
raise
|
||||
utils.get_yf_logger().debug(f'response code={response.status_code}')
|
||||
if response.status_code >= 400:
|
||||
# Retry with other cookie strategy
|
||||
if strategy == 'basic':
|
||||
self._set_cookie_strategy('csrf')
|
||||
else:
|
||||
self._set_cookie_strategy('basic')
|
||||
crumb, strategy = self._get_cookie_and_crumb(timeout)
|
||||
request_args['params']['crumb'] = crumb
|
||||
response = request_method(**request_args)
|
||||
utils.get_yf_logger().debug(f'response code={response.status_code}')
|
||||
|
||||
# Raise exception if rate limited
|
||||
if response.status_code == 429:
|
||||
raise YFRateLimitError()
|
||||
|
||||
return response
|
||||
|
||||
@lru_cache_freezeargs
|
||||
@lru_cache(maxsize=cache_maxsize)
|
||||
def cache_get(self, url, params=None, timeout=30):
|
||||
return self.get(url, params, timeout)
|
||||
|
||||
def get_raw_json(self, url, params=None, timeout=30):
|
||||
utils.get_yf_logger().debug(f'get_raw_json(): {url}')
|
||||
response = self.get(url, params=params, timeout=timeout)
|
||||
response.raise_for_status()
|
||||
return response.json()
|
||||
|
||||
def _is_this_consent_url(self, response_url: str) -> bool:
|
||||
"""
|
||||
Check if given response_url is consent page
|
||||
|
||||
Args:
|
||||
response_url (str) : response.url
|
||||
|
||||
Returns:
|
||||
True : This is cookie-consent page
|
||||
False : This is not cookie-consent page
|
||||
"""
|
||||
try:
|
||||
return urlsplit(response_url).hostname and urlsplit(
|
||||
response_url
|
||||
).hostname.endswith("consent.yahoo.com")
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def _accept_consent_form(
|
||||
self, consent_resp: requests.Response, timeout: int
|
||||
) -> requests.Response:
|
||||
"""
|
||||
Click 'Accept all' to cookie-consent form and return response object.
|
||||
|
||||
Args:
|
||||
consent_resp (requests.Response) : Response instance of cookie-consent page
|
||||
timeout (int) : Raise TimeoutError if post doesn't respond
|
||||
|
||||
Returns:
|
||||
response (requests.Response) : Reponse instance received from the server after accepting cookie-consent post.
|
||||
"""
|
||||
soup = BeautifulSoup(consent_resp.text, "html.parser")
|
||||
|
||||
# Heuristic: pick the first form; Yahoo's CMP tends to have a single form for consent
|
||||
form = soup.find("form")
|
||||
if not form:
|
||||
return consent_resp
|
||||
|
||||
# action : URL to send "Accept Cookies"
|
||||
action = form.get("action") or consent_resp.url
|
||||
action = urljoin(consent_resp.url, action)
|
||||
|
||||
# Collect inputs (hidden tokens, etc.)
|
||||
"""
|
||||
<input name="csrfToken" type="hidden" value="..."/>
|
||||
<input name="sessionId" type="hidden" value="..."/>
|
||||
<input name="originalDoneUrl" type="hidden" value="..."/>
|
||||
<input name="namespace" type="hidden" value="yahoo"/>
|
||||
"""
|
||||
data = {}
|
||||
for inp in form.find_all("input"):
|
||||
name = inp.get("name")
|
||||
if not name:
|
||||
continue
|
||||
typ = (inp.get("type") or "text").lower()
|
||||
val = inp.get("value") or ""
|
||||
|
||||
if typ in ("checkbox", "radio"):
|
||||
# If it's clearly an "agree"/"accept" field or already checked, include it
|
||||
if (
|
||||
"agree" in name.lower()
|
||||
or "accept" in name.lower()
|
||||
or inp.has_attr("checked")
|
||||
):
|
||||
data[name] = val if val != "" else "1"
|
||||
else:
|
||||
data[name] = val
|
||||
|
||||
# If no explicit agree/accept in inputs, add a best-effort flag
|
||||
lowered = {k.lower() for k in data.keys()}
|
||||
if not any(("agree" in k or "accept" in k) for k in lowered):
|
||||
data["agree"] = "1"
|
||||
|
||||
# Submit the form with "Referer". Some servers check this header as a simple CSRF protection measure.
|
||||
headers = {"Referer": consent_resp.url}
|
||||
response = self._session.post(
|
||||
action, data=data, headers=headers, timeout=timeout, allow_redirects=True
|
||||
)
|
||||
return response
|
||||
Reference in New Issue
Block a user