"""tldextract helpers for testing and fetching remote resources."""

import logging
import pkgutil
import re
from typing import List, Sequence, Tuple, Union, cast

import requests
from requests_file import FileAdapter  # type: ignore[import]

from .cache import DiskCache

LOG = logging.getLogger("tldextract")
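
# Matches one suffix entry at the start of each line of the list, including
# any leading wildcard ("*.") or exception ("!") markers. Comment lines
# ("//") never match, since "/" is neither a marker nor a word character.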
PUBLIC_SUFFIX_RE = re.compile(r"^(?P<suffix>[.*!]*\w[\S]*)", re.UNICODE | re.MULTILINE)
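# The marker line in the published Public Suffix List that separates the
# ICANN (public) section from the privately registered domain section.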
PUBLIC_PRIVATE_SUFFIX_SEPARATOR = "// ===BEGIN PRIVATE DOMAINS==="


class SuffixListNotFound(LookupError):
    """A recoverable error while looking up a suffix list.

    Recoverable because you can specify backups, or use this library's bundled
    snapshot.
    """


def find_first_response(
    cache: DiskCache,
    urls: Sequence[str],
    cache_fetch_timeout: Union[float, int, None] = None,
) -> str:
    """Decode the first successfully fetched URL from UTF-8 into Python unicode."""
    with requests.Session() as session:
        session.mount("file://", FileAdapter())
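
        # Try each candidate URL in order and return the first successful
        # fetch; the FileAdapter mounted above lets file:// URLs work too.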
        for url in urls:
            try:
                return cache.cached_fetch_url(
                    session=session, url=url, timeout=cache_fetch_timeout
                )
            except requests.exceptions.RequestException:
                LOG.exception("Exception reading Public Suffix List url %s", url)
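
    # All URLs failed, so raise the recoverable error defined above; callers
    # may still fall back to the bundled snapshot.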
    raise SuffixListNotFound(
        "No remote Public Suffix List found. Consider using a mirror, or avoid this"
        " fetch by constructing your TLDExtract with `suffix_list_urls=()`."
    )


def extract_tlds_from_suffix_list(suffix_list_text: str) -> Tuple[List[str], List[str]]:
    """Parse the raw suffix list text for its different designations of suffixes."""
    public_text, _, private_text = suffix_list_text.partition(
        PUBLIC_PRIVATE_SUFFIX_SEPARATOR
    )

    public_tlds = [m.group("suffix") for m in PUBLIC_SUFFIX_RE.finditer(public_text)]
    private_tlds = [m.group("suffix") for m in PUBLIC_SUFFIX_RE.finditer(private_text)]
    return public_tlds, private_tlds


def get_suffix_lists(
    cache: DiskCache,
    urls: Sequence[str],
    cache_fetch_timeout: Union[float, int, None],
    fallback_to_snapshot: bool,
) -> Tuple[List[str], List[str]]:
    """Fetch, parse, and cache the suffix lists."""
    return cache.run_and_cache(
        func=_get_suffix_lists,
        namespace="publicsuffix.org-tlds",
        kwargs={
            "cache": cache,
            "urls": urls,
            "cache_fetch_timeout": cache_fetch_timeout,
            "fallback_to_snapshot": fallback_to_snapshot,
        },
        hashed_argnames=["urls", "fallback_to_snapshot"],
    )


def _get_suffix_lists(
    cache: DiskCache,
    urls: Sequence[str],
    cache_fetch_timeout: Union[float, int, None],
    fallback_to_snapshot: bool,
) -> Tuple[List[str], List[str]]:
    """Fetch, parse, and cache the suffix lists."""
    try:
        text = find_first_response(cache, urls, cache_fetch_timeout=cache_fetch_timeout)
    except SuffixListNotFound as exc:
        if fallback_to_snapshot:
            maybe_pkg_data = pkgutil.get_data("tldextract", ".tld_set_snapshot")
            # package maintainers guarantee file is included
            pkg_data = cast(bytes, maybe_pkg_data)
            text = pkg_data.decode("utf-8")
        else:
            raise exc

    public_tlds, private_tlds = extract_tlds_from_suffix_list(text)

    return public_tlds, private_tlds