Varia's website
https://varia.zone
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
1583 lines
57 KiB
1583 lines
57 KiB
2 weeks ago
|
"""CSS matcher."""
|
||
|
from __future__ import annotations
|
||
|
from datetime import datetime
|
||
|
from . import util
|
||
|
import re
|
||
|
from . import css_types as ct
|
||
|
import unicodedata
|
||
|
import bs4 # type: ignore[import-untyped]
|
||
|
from typing import Iterator, Iterable, Any, Callable, Sequence, cast # noqa: F401
|
||
|
|
||
|
# Empty tag pattern (whitespace okay)
|
||
|
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')
|
||
|
|
||
|
RE_NOT_WS = re.compile('[^ \t\r\n\f]+')
|
||
|
|
||
|
# Relationships
|
||
|
REL_PARENT = ' '
|
||
|
REL_CLOSE_PARENT = '>'
|
||
|
REL_SIBLING = '~'
|
||
|
REL_CLOSE_SIBLING = '+'
|
||
|
|
||
|
# Relationships for :has() (forward looking)
|
||
|
REL_HAS_PARENT = ': '
|
||
|
REL_HAS_CLOSE_PARENT = ':>'
|
||
|
REL_HAS_SIBLING = ':~'
|
||
|
REL_HAS_CLOSE_SIBLING = ':+'
|
||
|
|
||
|
NS_XHTML = 'http://www.w3.org/1999/xhtml'
|
||
|
NS_XML = 'http://www.w3.org/XML/1998/namespace'
|
||
|
|
||
|
DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL
|
||
|
RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE
|
||
|
|
||
|
DIR_MAP = {
|
||
|
'ltr': ct.SEL_DIR_LTR,
|
||
|
'rtl': ct.SEL_DIR_RTL,
|
||
|
'auto': 0
|
||
|
}
|
||
|
|
||
|
RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$")
|
||
|
RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$')
|
||
|
RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$')
|
||
|
RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$')
|
||
|
RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$')
|
||
|
RE_DATETIME = re.compile(
|
||
|
r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$'
|
||
|
)
|
||
|
RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)')
|
||
|
|
||
|
MONTHS_30 = (4, 6, 9, 11) # April, June, September, and November
|
||
|
FEB = 2
|
||
|
SHORT_MONTH = 30
|
||
|
LONG_MONTH = 31
|
||
|
FEB_MONTH = 28
|
||
|
FEB_LEAP_MONTH = 29
|
||
|
DAYS_IN_WEEK = 7
|
||
|
|
||
|
|
||
|
class _FakeParent:
|
||
|
"""
|
||
|
Fake parent class.
|
||
|
|
||
|
When we have a fragment with no `BeautifulSoup` document object,
|
||
|
we can't evaluate `nth` selectors properly. Create a temporary
|
||
|
fake parent so we can traverse the root element as a child.
|
||
|
"""
|
||
|
|
||
|
def __init__(self, element: bs4.Tag) -> None:
|
||
|
"""Initialize."""
|
||
|
|
||
|
self.contents = [element]
|
||
|
|
||
|
def __len__(self) -> bs4.PageElement:
|
||
|
"""Length."""
|
||
|
|
||
|
return len(self.contents)
|
||
|
|
||
|
|
||
|
class _DocumentNav:
|
||
|
"""Navigate a Beautiful Soup document."""
|
||
|
|
||
|
@classmethod
|
||
|
def assert_valid_input(cls, tag: Any) -> None:
|
||
|
"""Check if valid input tag or document."""
|
||
|
|
||
|
# Fail on unexpected types.
|
||
|
if not cls.is_tag(tag):
|
||
|
raise TypeError(f"Expected a BeautifulSoup 'Tag', but instead received type {type(tag)}")
|
||
|
|
||
|
@staticmethod
|
||
|
def is_doc(obj: bs4.Tag) -> bool:
|
||
|
"""Is `BeautifulSoup` object."""
|
||
|
return isinstance(obj, bs4.BeautifulSoup)
|
||
|
|
||
|
@staticmethod
|
||
|
def is_tag(obj: bs4.PageElement) -> bool:
|
||
|
"""Is tag."""
|
||
|
return isinstance(obj, bs4.Tag)
|
||
|
|
||
|
@staticmethod
|
||
|
def is_declaration(obj: bs4.PageElement) -> bool: # pragma: no cover
|
||
|
"""Is declaration."""
|
||
|
return isinstance(obj, bs4.Declaration)
|
||
|
|
||
|
@staticmethod
|
||
|
def is_cdata(obj: bs4.PageElement) -> bool:
|
||
|
"""Is CDATA."""
|
||
|
return isinstance(obj, bs4.CData)
|
||
|
|
||
|
@staticmethod
|
||
|
def is_processing_instruction(obj: bs4.PageElement) -> bool: # pragma: no cover
|
||
|
"""Is processing instruction."""
|
||
|
return isinstance(obj, bs4.ProcessingInstruction)
|
||
|
|
||
|
@staticmethod
|
||
|
def is_navigable_string(obj: bs4.PageElement) -> bool:
|
||
|
"""Is navigable string."""
|
||
|
return isinstance(obj, bs4.NavigableString)
|
||
|
|
||
|
@staticmethod
|
||
|
def is_special_string(obj: bs4.PageElement) -> bool:
|
||
|
"""Is special string."""
|
||
|
return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))
|
||
|
|
||
|
@classmethod
|
||
|
def is_content_string(cls, obj: bs4.PageElement) -> bool:
|
||
|
"""Check if node is content string."""
|
||
|
|
||
|
return cls.is_navigable_string(obj) and not cls.is_special_string(obj)
|
||
|
|
||
|
@staticmethod
|
||
|
def create_fake_parent(el: bs4.Tag) -> _FakeParent:
|
||
|
"""Create fake parent for a given element."""
|
||
|
|
||
|
return _FakeParent(el)
|
||
|
|
||
|
@staticmethod
|
||
|
def is_xml_tree(el: bs4.Tag) -> bool:
|
||
|
"""Check if element (or document) is from a XML tree."""
|
||
|
|
||
|
return bool(el._is_xml)
|
||
|
|
||
|
def is_iframe(self, el: bs4.Tag) -> bool:
|
||
|
"""Check if element is an `iframe`."""
|
||
|
|
||
|
return bool(
|
||
|
((el.name if self.is_xml_tree(el) else util.lower(el.name)) == 'iframe') and
|
||
|
self.is_html_tag(el) # type: ignore[attr-defined]
|
||
|
)
|
||
|
|
||
|
def is_root(self, el: bs4.Tag) -> bool:
|
||
|
"""
|
||
|
Return whether element is a root element.
|
||
|
|
||
|
We check that the element is the root of the tree (which we have already pre-calculated),
|
||
|
and we check if it is the root element under an `iframe`.
|
||
|
"""
|
||
|
|
||
|
root = self.root and self.root is el # type: ignore[attr-defined]
|
||
|
if not root:
|
||
|
parent = self.get_parent(el)
|
||
|
root = parent is not None and self.is_html and self.is_iframe(parent) # type: ignore[attr-defined]
|
||
|
return root
|
||
|
|
||
|
def get_contents(self, el: bs4.Tag, no_iframe: bool = False) -> Iterator[bs4.PageElement]:
|
||
|
"""Get contents or contents in reverse."""
|
||
|
if not no_iframe or not self.is_iframe(el):
|
||
|
yield from el.contents
|
||
|
|
||
|
def get_children(
|
||
|
self,
|
||
|
el: bs4.Tag,
|
||
|
start: int | None = None,
|
||
|
reverse: bool = False,
|
||
|
tags: bool = True,
|
||
|
no_iframe: bool = False
|
||
|
) -> Iterator[bs4.PageElement]:
|
||
|
"""Get children."""
|
||
|
|
||
|
if not no_iframe or not self.is_iframe(el):
|
||
|
last = len(el.contents) - 1
|
||
|
if start is None:
|
||
|
index = last if reverse else 0
|
||
|
else:
|
||
|
index = start
|
||
|
end = -1 if reverse else last + 1
|
||
|
incr = -1 if reverse else 1
|
||
|
|
||
|
if 0 <= index <= last:
|
||
|
while index != end:
|
||
|
node = el.contents[index]
|
||
|
index += incr
|
||
|
if not tags or self.is_tag(node):
|
||
|
yield node
|
||
|
|
||
|
def get_descendants(
|
||
|
self,
|
||
|
el: bs4.Tag,
|
||
|
tags: bool = True,
|
||
|
no_iframe: bool = False
|
||
|
) -> Iterator[bs4.PageElement]:
|
||
|
"""Get descendants."""
|
||
|
|
||
|
if not no_iframe or not self.is_iframe(el):
|
||
|
next_good = None
|
||
|
for child in el.descendants:
|
||
|
|
||
|
if next_good is not None:
|
||
|
if child is not next_good:
|
||
|
continue
|
||
|
next_good = None
|
||
|
|
||
|
is_tag = self.is_tag(child)
|
||
|
|
||
|
if no_iframe and is_tag and self.is_iframe(child):
|
||
|
if child.next_sibling is not None:
|
||
|
next_good = child.next_sibling
|
||
|
else:
|
||
|
last_child = child
|
||
|
while self.is_tag(last_child) and last_child.contents:
|
||
|
last_child = last_child.contents[-1]
|
||
|
next_good = last_child.next_element
|
||
|
yield child
|
||
|
if next_good is None:
|
||
|
break
|
||
|
# Coverage isn't seeing this even though it's executed
|
||
|
continue # pragma: no cover
|
||
|
|
||
|
if not tags or is_tag:
|
||
|
yield child
|
||
|
|
||
|
def get_parent(self, el: bs4.Tag, no_iframe: bool = False) -> bs4.Tag:
|
||
|
"""Get parent."""
|
||
|
|
||
|
parent = el.parent
|
||
|
if no_iframe and parent is not None and self.is_iframe(parent):
|
||
|
parent = None
|
||
|
return parent
|
||
|
|
||
|
@staticmethod
|
||
|
def get_tag_name(el: bs4.Tag) -> str | None:
|
||
|
"""Get tag."""
|
||
|
|
||
|
return cast('str | None', el.name)
|
||
|
|
||
|
@staticmethod
|
||
|
def get_prefix_name(el: bs4.Tag) -> str | None:
|
||
|
"""Get prefix."""
|
||
|
|
||
|
return cast('str | None', el.prefix)
|
||
|
|
||
|
@staticmethod
|
||
|
def get_uri(el: bs4.Tag) -> str | None:
|
||
|
"""Get namespace `URI`."""
|
||
|
|
||
|
return cast('str | None', el.namespace)
|
||
|
|
||
|
@classmethod
|
||
|
def get_next(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement:
|
||
|
"""Get next sibling tag."""
|
||
|
|
||
|
sibling = el.next_sibling
|
||
|
while tags and not cls.is_tag(sibling) and sibling is not None:
|
||
|
sibling = sibling.next_sibling
|
||
|
return sibling
|
||
|
|
||
|
@classmethod
|
||
|
def get_previous(cls, el: bs4.Tag, tags: bool = True) -> bs4.PageElement:
|
||
|
"""Get previous sibling tag."""
|
||
|
|
||
|
sibling = el.previous_sibling
|
||
|
while tags and not cls.is_tag(sibling) and sibling is not None:
|
||
|
sibling = sibling.previous_sibling
|
||
|
return sibling
|
||
|
|
||
|
@staticmethod
|
||
|
def has_html_ns(el: bs4.Tag) -> bool:
|
||
|
"""
|
||
|
Check if element has an HTML namespace.
|
||
|
|
||
|
This is a bit different than whether a element is treated as having an HTML namespace,
|
||
|
like we do in the case of `is_html_tag`.
|
||
|
"""
|
||
|
|
||
|
ns = getattr(el, 'namespace') if el else None # noqa: B009
|
||
|
return bool(ns and ns == NS_XHTML)
|
||
|
|
||
|
@staticmethod
|
||
|
def split_namespace(el: bs4.Tag, attr_name: str) -> tuple[str | None, str | None]:
|
||
|
"""Return namespace and attribute name without the prefix."""
|
||
|
|
||
|
return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)
|
||
|
|
||
|
@classmethod
|
||
|
def normalize_value(cls, value: Any) -> str | Sequence[str]:
|
||
|
"""Normalize the value to be a string or list of strings."""
|
||
|
|
||
|
# Treat `None` as empty string.
|
||
|
if value is None:
|
||
|
return ''
|
||
|
|
||
|
# Pass through strings
|
||
|
if (isinstance(value, str)):
|
||
|
return value
|
||
|
|
||
|
# If it's a byte string, convert it to Unicode, treating it as UTF-8.
|
||
|
if isinstance(value, bytes):
|
||
|
return value.decode("utf8")
|
||
|
|
||
|
# BeautifulSoup supports sequences of attribute values, so make sure the children are strings.
|
||
|
if isinstance(value, Sequence):
|
||
|
new_value = []
|
||
|
for v in value:
|
||
|
if not isinstance(v, (str, bytes)) and isinstance(v, Sequence):
|
||
|
# This is most certainly a user error and will crash and burn later.
|
||
|
# To keep things working, we'll do what we do with all objects,
|
||
|
# And convert them to strings.
|
||
|
new_value.append(str(v))
|
||
|
else:
|
||
|
# Convert the child to a string
|
||
|
new_value.append(cast(str, cls.normalize_value(v)))
|
||
|
return new_value
|
||
|
|
||
|
# Try and make anything else a string
|
||
|
return str(value)
|
||
|
|
||
|
@classmethod
|
||
|
def get_attribute_by_name(
|
||
|
cls,
|
||
|
el: bs4.Tag,
|
||
|
name: str,
|
||
|
default: str | Sequence[str] | None = None
|
||
|
) -> str | Sequence[str] | None:
|
||
|
"""Get attribute by name."""
|
||
|
|
||
|
value = default
|
||
|
if el._is_xml:
|
||
|
try:
|
||
|
value = cls.normalize_value(el.attrs[name])
|
||
|
except KeyError:
|
||
|
pass
|
||
|
else:
|
||
|
for k, v in el.attrs.items():
|
||
|
if util.lower(k) == name:
|
||
|
value = cls.normalize_value(v)
|
||
|
break
|
||
|
return value
|
||
|
|
||
|
@classmethod
|
||
|
def iter_attributes(cls, el: bs4.Tag) -> Iterator[tuple[str, str | Sequence[str] | None]]:
|
||
|
"""Iterate attributes."""
|
||
|
|
||
|
for k, v in el.attrs.items():
|
||
|
yield k, cls.normalize_value(v)
|
||
|
|
||
|
@classmethod
|
||
|
def get_classes(cls, el: bs4.Tag) -> Sequence[str]:
|
||
|
"""Get classes."""
|
||
|
|
||
|
classes = cls.get_attribute_by_name(el, 'class', [])
|
||
|
if isinstance(classes, str):
|
||
|
classes = RE_NOT_WS.findall(classes)
|
||
|
return cast(Sequence[str], classes)
|
||
|
|
||
|
def get_text(self, el: bs4.Tag, no_iframe: bool = False) -> str:
|
||
|
"""Get text."""
|
||
|
|
||
|
return ''.join(
|
||
|
[node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)]
|
||
|
)
|
||
|
|
||
|
def get_own_text(self, el: bs4.Tag, no_iframe: bool = False) -> list[str]:
|
||
|
"""Get Own Text."""
|
||
|
|
||
|
return [node for node in self.get_contents(el, no_iframe=no_iframe) if self.is_content_string(node)]
|
||
|
|
||
|
|
||
|
class Inputs:
|
||
|
"""Class for parsing and validating input items."""
|
||
|
|
||
|
@staticmethod
|
||
|
def validate_day(year: int, month: int, day: int) -> bool:
|
||
|
"""Validate day."""
|
||
|
|
||
|
max_days = LONG_MONTH
|
||
|
if month == FEB:
|
||
|
max_days = FEB_LEAP_MONTH if ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0) else FEB_MONTH
|
||
|
elif month in MONTHS_30:
|
||
|
max_days = SHORT_MONTH
|
||
|
return 1 <= day <= max_days
|
||
|
|
||
|
@staticmethod
|
||
|
def validate_week(year: int, week: int) -> bool:
|
||
|
"""Validate week."""
|
||
|
|
||
|
max_week = datetime.strptime(f"{12}-{31}-{year}", "%m-%d-%Y").isocalendar()[1]
|
||
|
if max_week == 1:
|
||
|
max_week = 53
|
||
|
return 1 <= week <= max_week
|
||
|
|
||
|
@staticmethod
|
||
|
def validate_month(month: int) -> bool:
|
||
|
"""Validate month."""
|
||
|
|
||
|
return 1 <= month <= 12
|
||
|
|
||
|
@staticmethod
|
||
|
def validate_year(year: int) -> bool:
|
||
|
"""Validate year."""
|
||
|
|
||
|
return 1 <= year
|
||
|
|
||
|
@staticmethod
|
||
|
def validate_hour(hour: int) -> bool:
|
||
|
"""Validate hour."""
|
||
|
|
||
|
return 0 <= hour <= 23
|
||
|
|
||
|
@staticmethod
|
||
|
def validate_minutes(minutes: int) -> bool:
|
||
|
"""Validate minutes."""
|
||
|
|
||
|
return 0 <= minutes <= 59
|
||
|
|
||
|
@classmethod
|
||
|
def parse_value(cls, itype: str, value: str | None) -> tuple[float, ...] | None:
|
||
|
"""Parse the input value."""
|
||
|
|
||
|
parsed = None # type: tuple[float, ...] | None
|
||
|
if value is None:
|
||
|
return value
|
||
|
if itype == "date":
|
||
|
m = RE_DATE.match(value)
|
||
|
if m:
|
||
|
year = int(m.group('year'), 10)
|
||
|
month = int(m.group('month'), 10)
|
||
|
day = int(m.group('day'), 10)
|
||
|
if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day):
|
||
|
parsed = (year, month, day)
|
||
|
elif itype == "month":
|
||
|
m = RE_MONTH.match(value)
|
||
|
if m:
|
||
|
year = int(m.group('year'), 10)
|
||
|
month = int(m.group('month'), 10)
|
||
|
if cls.validate_year(year) and cls.validate_month(month):
|
||
|
parsed = (year, month)
|
||
|
elif itype == "week":
|
||
|
m = RE_WEEK.match(value)
|
||
|
if m:
|
||
|
year = int(m.group('year'), 10)
|
||
|
week = int(m.group('week'), 10)
|
||
|
if cls.validate_year(year) and cls.validate_week(year, week):
|
||
|
parsed = (year, week)
|
||
|
elif itype == "time":
|
||
|
m = RE_TIME.match(value)
|
||
|
if m:
|
||
|
hour = int(m.group('hour'), 10)
|
||
|
minutes = int(m.group('minutes'), 10)
|
||
|
if cls.validate_hour(hour) and cls.validate_minutes(minutes):
|
||
|
parsed = (hour, minutes)
|
||
|
elif itype == "datetime-local":
|
||
|
m = RE_DATETIME.match(value)
|
||
|
if m:
|
||
|
year = int(m.group('year'), 10)
|
||
|
month = int(m.group('month'), 10)
|
||
|
day = int(m.group('day'), 10)
|
||
|
hour = int(m.group('hour'), 10)
|
||
|
minutes = int(m.group('minutes'), 10)
|
||
|
if (
|
||
|
cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and
|
||
|
cls.validate_hour(hour) and cls.validate_minutes(minutes)
|
||
|
):
|
||
|
parsed = (year, month, day, hour, minutes)
|
||
|
elif itype in ("number", "range"):
|
||
|
m = RE_NUM.match(value)
|
||
|
if m:
|
||
|
parsed = (float(m.group('value')),)
|
||
|
return parsed
|
||
|
|
||
|
|
||
|
class CSSMatch(_DocumentNav):
|
||
|
"""Perform CSS matching."""
|
||
|
|
||
|
def __init__(
|
||
|
self,
|
||
|
selectors: ct.SelectorList,
|
||
|
scope: bs4.Tag,
|
||
|
namespaces: ct.Namespaces | None,
|
||
|
flags: int
|
||
|
) -> None:
|
||
|
"""Initialize."""
|
||
|
|
||
|
self.assert_valid_input(scope)
|
||
|
self.tag = scope
|
||
|
self.cached_meta_lang = [] # type: list[tuple[str, str]]
|
||
|
self.cached_default_forms = [] # type: list[tuple[bs4.Tag, bs4.Tag]]
|
||
|
self.cached_indeterminate_forms = [] # type: list[tuple[bs4.Tag, str, bool]]
|
||
|
self.selectors = selectors
|
||
|
self.namespaces = {} if namespaces is None else namespaces # type: ct.Namespaces | dict[str, str]
|
||
|
self.flags = flags
|
||
|
self.iframe_restrict = False
|
||
|
|
||
|
# Find the root element for the whole tree
|
||
|
doc = scope
|
||
|
parent = self.get_parent(doc)
|
||
|
while parent:
|
||
|
doc = parent
|
||
|
parent = self.get_parent(doc)
|
||
|
root = None
|
||
|
if not self.is_doc(doc):
|
||
|
root = doc
|
||
|
else:
|
||
|
for child in self.get_children(doc):
|
||
|
root = child
|
||
|
break
|
||
|
|
||
|
self.root = root
|
||
|
self.scope = scope if scope is not doc else root
|
||
|
self.has_html_namespace = self.has_html_ns(root)
|
||
|
|
||
|
# A document can be both XML and HTML (XHTML)
|
||
|
self.is_xml = self.is_xml_tree(doc)
|
||
|
self.is_html = not self.is_xml or self.has_html_namespace
|
||
|
|
||
|
def supports_namespaces(self) -> bool:
|
||
|
"""Check if namespaces are supported in the HTML type."""
|
||
|
|
||
|
return self.is_xml or self.has_html_namespace
|
||
|
|
||
|
def get_tag_ns(self, el: bs4.Tag) -> str:
|
||
|
"""Get tag namespace."""
|
||
|
|
||
|
if self.supports_namespaces():
|
||
|
namespace = ''
|
||
|
ns = self.get_uri(el)
|
||
|
if ns:
|
||
|
namespace = ns
|
||
|
else:
|
||
|
namespace = NS_XHTML
|
||
|
return namespace
|
||
|
|
||
|
def is_html_tag(self, el: bs4.Tag) -> bool:
|
||
|
"""Check if tag is in HTML namespace."""
|
||
|
|
||
|
return self.get_tag_ns(el) == NS_XHTML
|
||
|
|
||
|
def get_tag(self, el: bs4.Tag) -> str | None:
|
||
|
"""Get tag."""
|
||
|
|
||
|
name = self.get_tag_name(el)
|
||
|
return util.lower(name) if name is not None and not self.is_xml else name
|
||
|
|
||
|
def get_prefix(self, el: bs4.Tag) -> str | None:
|
||
|
"""Get prefix."""
|
||
|
|
||
|
prefix = self.get_prefix_name(el)
|
||
|
return util.lower(prefix) if prefix is not None and not self.is_xml else prefix
|
||
|
|
||
|
def find_bidi(self, el: bs4.Tag) -> int | None:
|
||
|
"""Get directionality from element text."""
|
||
|
|
||
|
for node in self.get_children(el, tags=False):
|
||
|
|
||
|
# Analyze child text nodes
|
||
|
if self.is_tag(node):
|
||
|
|
||
|
# Avoid analyzing certain elements specified in the specification.
|
||
|
direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(node, 'dir', '')), None)
|
||
|
if (
|
||
|
self.get_tag(node) in ('bdi', 'script', 'style', 'textarea', 'iframe') or
|
||
|
not self.is_html_tag(node) or
|
||
|
direction is not None
|
||
|
):
|
||
|
continue # pragma: no cover
|
||
|
|
||
|
# Check directionality of this node's text
|
||
|
value = self.find_bidi(node)
|
||
|
if value is not None:
|
||
|
return value
|
||
|
|
||
|
# Direction could not be determined
|
||
|
continue # pragma: no cover
|
||
|
|
||
|
# Skip `doctype` comments, etc.
|
||
|
if self.is_special_string(node):
|
||
|
continue
|
||
|
|
||
|
# Analyze text nodes for directionality.
|
||
|
for c in node:
|
||
|
bidi = unicodedata.bidirectional(c)
|
||
|
if bidi in ('AL', 'R', 'L'):
|
||
|
return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
|
||
|
return None
|
||
|
|
||
|
def extended_language_filter(self, lang_range: str, lang_tag: str) -> bool:
|
||
|
"""Filter the language tags."""
|
||
|
|
||
|
match = True
|
||
|
lang_range = RE_WILD_STRIP.sub('-', lang_range).lower()
|
||
|
ranges = lang_range.split('-')
|
||
|
subtags = lang_tag.lower().split('-')
|
||
|
length = len(ranges)
|
||
|
slength = len(subtags)
|
||
|
rindex = 0
|
||
|
sindex = 0
|
||
|
r = ranges[rindex]
|
||
|
s = subtags[sindex]
|
||
|
|
||
|
# Empty specified language should match unspecified language attributes
|
||
|
if length == 1 and slength == 1 and not r and r == s:
|
||
|
return True
|
||
|
|
||
|
# Primary tag needs to match
|
||
|
if (r != '*' and r != s) or (r == '*' and slength == 1 and not s):
|
||
|
match = False
|
||
|
|
||
|
rindex += 1
|
||
|
sindex += 1
|
||
|
|
||
|
# Match until we run out of ranges
|
||
|
while match and rindex < length:
|
||
|
r = ranges[rindex]
|
||
|
try:
|
||
|
s = subtags[sindex]
|
||
|
except IndexError:
|
||
|
# Ran out of subtags,
|
||
|
# but we still have ranges
|
||
|
match = False
|
||
|
continue
|
||
|
|
||
|
# Empty range
|
||
|
if not r:
|
||
|
match = False
|
||
|
continue
|
||
|
|
||
|
# Matched range
|
||
|
elif s == r:
|
||
|
rindex += 1
|
||
|
|
||
|
# Implicit wildcard cannot match
|
||
|
# singletons
|
||
|
elif len(s) == 1:
|
||
|
match = False
|
||
|
continue
|
||
|
|
||
|
# Implicitly matched, so grab next subtag
|
||
|
sindex += 1
|
||
|
|
||
|
return match
|
||
|
|
||
|
def match_attribute_name(
|
||
|
self,
|
||
|
el: bs4.Tag,
|
||
|
attr: str,
|
||
|
prefix: str | None
|
||
|
) -> str | Sequence[str] | None:
|
||
|
"""Match attribute name and return value if it exists."""
|
||
|
|
||
|
value = None
|
||
|
if self.supports_namespaces():
|
||
|
value = None
|
||
|
# If we have not defined namespaces, we can't very well find them, so don't bother trying.
|
||
|
if prefix:
|
||
|
ns = self.namespaces.get(prefix)
|
||
|
if ns is None and prefix != '*':
|
||
|
return None
|
||
|
else:
|
||
|
ns = None
|
||
|
|
||
|
for k, v in self.iter_attributes(el):
|
||
|
|
||
|
# Get attribute parts
|
||
|
namespace, name = self.split_namespace(el, k)
|
||
|
|
||
|
# Can't match a prefix attribute as we haven't specified one to match
|
||
|
# Try to match it normally as a whole `p:a` as selector may be trying `p\:a`.
|
||
|
if ns is None:
|
||
|
if (self.is_xml and attr == k) or (not self.is_xml and util.lower(attr) == util.lower(k)):
|
||
|
value = v
|
||
|
break
|
||
|
# Coverage is not finding this even though it is executed.
|
||
|
# Adding a print statement before this (and erasing coverage) causes coverage to find the line.
|
||
|
# Ignore the false positive message.
|
||
|
continue # pragma: no cover
|
||
|
|
||
|
# We can't match our desired prefix attribute as the attribute doesn't have a prefix
|
||
|
if namespace is None or ns != namespace and prefix != '*':
|
||
|
continue
|
||
|
|
||
|
# The attribute doesn't match.
|
||
|
if (util.lower(attr) != util.lower(name)) if not self.is_xml else (attr != name):
|
||
|
continue
|
||
|
|
||
|
value = v
|
||
|
break
|
||
|
else:
|
||
|
for k, v in self.iter_attributes(el):
|
||
|
if util.lower(attr) != util.lower(k):
|
||
|
continue
|
||
|
value = v
|
||
|
break
|
||
|
return value
|
||
|
|
||
|
def match_namespace(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool:
|
||
|
"""Match the namespace of the element."""
|
||
|
|
||
|
match = True
|
||
|
namespace = self.get_tag_ns(el)
|
||
|
default_namespace = self.namespaces.get('')
|
||
|
tag_ns = '' if tag.prefix is None else self.namespaces.get(tag.prefix)
|
||
|
# We must match the default namespace if one is not provided
|
||
|
if tag.prefix is None and (default_namespace is not None and namespace != default_namespace):
|
||
|
match = False
|
||
|
# If we specified `|tag`, we must not have a namespace.
|
||
|
elif (tag.prefix is not None and tag.prefix == '' and namespace):
|
||
|
match = False
|
||
|
# Verify prefix matches
|
||
|
elif (
|
||
|
tag.prefix and
|
||
|
tag.prefix != '*' and (tag_ns is None or namespace != tag_ns)
|
||
|
):
|
||
|
match = False
|
||
|
return match
|
||
|
|
||
|
def match_attributes(self, el: bs4.Tag, attributes: tuple[ct.SelectorAttribute, ...]) -> bool:
|
||
|
"""Match attributes."""
|
||
|
|
||
|
match = True
|
||
|
if attributes:
|
||
|
for a in attributes:
|
||
|
temp = self.match_attribute_name(el, a.attribute, a.prefix)
|
||
|
pattern = a.xml_type_pattern if self.is_xml and a.xml_type_pattern else a.pattern
|
||
|
if temp is None:
|
||
|
match = False
|
||
|
break
|
||
|
value = temp if isinstance(temp, str) else ' '.join(temp)
|
||
|
if pattern is None:
|
||
|
continue
|
||
|
elif pattern.match(value) is None:
|
||
|
match = False
|
||
|
break
|
||
|
return match
|
||
|
|
||
|
def match_tagname(self, el: bs4.Tag, tag: ct.SelectorTag) -> bool:
|
||
|
"""Match tag name."""
|
||
|
|
||
|
name = (util.lower(tag.name) if not self.is_xml and tag.name is not None else tag.name)
|
||
|
return not (
|
||
|
name is not None and
|
||
|
name not in (self.get_tag(el), '*')
|
||
|
)
|
||
|
|
||
|
def match_tag(self, el: bs4.Tag, tag: ct.SelectorTag | None) -> bool:
|
||
|
"""Match the tag."""
|
||
|
|
||
|
match = True
|
||
|
if tag is not None:
|
||
|
# Verify namespace
|
||
|
if not self.match_namespace(el, tag):
|
||
|
match = False
|
||
|
if not self.match_tagname(el, tag):
|
||
|
match = False
|
||
|
return match
|
||
|
|
||
|
def match_past_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
|
||
|
"""Match past relationship."""
|
||
|
|
||
|
found = False
|
||
|
# I don't think this can ever happen, but it makes `mypy` happy
|
||
|
if isinstance(relation[0], ct.SelectorNull): # pragma: no cover
|
||
|
return found
|
||
|
|
||
|
if relation[0].rel_type == REL_PARENT:
|
||
|
parent = self.get_parent(el, no_iframe=self.iframe_restrict)
|
||
|
while not found and parent:
|
||
|
found = self.match_selectors(parent, relation)
|
||
|
parent = self.get_parent(parent, no_iframe=self.iframe_restrict)
|
||
|
elif relation[0].rel_type == REL_CLOSE_PARENT:
|
||
|
parent = self.get_parent(el, no_iframe=self.iframe_restrict)
|
||
|
if parent:
|
||
|
found = self.match_selectors(parent, relation)
|
||
|
elif relation[0].rel_type == REL_SIBLING:
|
||
|
sibling = self.get_previous(el)
|
||
|
while not found and sibling:
|
||
|
found = self.match_selectors(sibling, relation)
|
||
|
sibling = self.get_previous(sibling)
|
||
|
elif relation[0].rel_type == REL_CLOSE_SIBLING:
|
||
|
sibling = self.get_previous(el)
|
||
|
if sibling and self.is_tag(sibling):
|
||
|
found = self.match_selectors(sibling, relation)
|
||
|
return found
|
||
|
|
||
|
def match_future_child(self, parent: bs4.Tag, relation: ct.SelectorList, recursive: bool = False) -> bool:
|
||
|
"""Match future child."""
|
||
|
|
||
|
match = False
|
||
|
if recursive:
|
||
|
children = self.get_descendants # type: Callable[..., Iterator[bs4.Tag]]
|
||
|
else:
|
||
|
children = self.get_children
|
||
|
for child in children(parent, no_iframe=self.iframe_restrict):
|
||
|
match = self.match_selectors(child, relation)
|
||
|
if match:
|
||
|
break
|
||
|
return match
|
||
|
|
||
|
def match_future_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
|
||
|
"""Match future relationship."""
|
||
|
|
||
|
found = False
|
||
|
# I don't think this can ever happen, but it makes `mypy` happy
|
||
|
if isinstance(relation[0], ct.SelectorNull): # pragma: no cover
|
||
|
return found
|
||
|
|
||
|
if relation[0].rel_type == REL_HAS_PARENT:
|
||
|
found = self.match_future_child(el, relation, True)
|
||
|
elif relation[0].rel_type == REL_HAS_CLOSE_PARENT:
|
||
|
found = self.match_future_child(el, relation)
|
||
|
elif relation[0].rel_type == REL_HAS_SIBLING:
|
||
|
sibling = self.get_next(el)
|
||
|
while not found and sibling:
|
||
|
found = self.match_selectors(sibling, relation)
|
||
|
sibling = self.get_next(sibling)
|
||
|
elif relation[0].rel_type == REL_HAS_CLOSE_SIBLING:
|
||
|
sibling = self.get_next(el)
|
||
|
if sibling and self.is_tag(sibling):
|
||
|
found = self.match_selectors(sibling, relation)
|
||
|
return found
|
||
|
|
||
|
def match_relations(self, el: bs4.Tag, relation: ct.SelectorList) -> bool:
|
||
|
"""Match relationship to other elements."""
|
||
|
|
||
|
found = False
|
||
|
|
||
|
if isinstance(relation[0], ct.SelectorNull) or relation[0].rel_type is None:
|
||
|
return found
|
||
|
|
||
|
if relation[0].rel_type.startswith(':'):
|
||
|
found = self.match_future_relations(el, relation)
|
||
|
else:
|
||
|
found = self.match_past_relations(el, relation)
|
||
|
|
||
|
return found
|
||
|
|
||
|
def match_id(self, el: bs4.Tag, ids: tuple[str, ...]) -> bool:
|
||
|
"""Match element's ID."""
|
||
|
|
||
|
found = True
|
||
|
for i in ids:
|
||
|
if i != self.get_attribute_by_name(el, 'id', ''):
|
||
|
found = False
|
||
|
break
|
||
|
return found
|
||
|
|
||
|
def match_classes(self, el: bs4.Tag, classes: tuple[str, ...]) -> bool:
|
||
|
"""Match element's classes."""
|
||
|
|
||
|
current_classes = self.get_classes(el)
|
||
|
found = True
|
||
|
for c in classes:
|
||
|
if c not in current_classes:
|
||
|
found = False
|
||
|
break
|
||
|
return found
|
||
|
|
||
|
def match_root(self, el: bs4.Tag) -> bool:
|
||
|
"""Match element as root."""
|
||
|
|
||
|
is_root = self.is_root(el)
|
||
|
if is_root:
|
||
|
sibling = self.get_previous(el, tags=False)
|
||
|
while is_root and sibling is not None:
|
||
|
if (
|
||
|
self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
|
||
|
self.is_cdata(sibling)
|
||
|
):
|
||
|
is_root = False
|
||
|
else:
|
||
|
sibling = self.get_previous(sibling, tags=False)
|
||
|
if is_root:
|
||
|
sibling = self.get_next(el, tags=False)
|
||
|
while is_root and sibling is not None:
|
||
|
if (
|
||
|
self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
|
||
|
self.is_cdata(sibling)
|
||
|
):
|
||
|
is_root = False
|
||
|
else:
|
||
|
sibling = self.get_next(sibling, tags=False)
|
||
|
return is_root
|
||
|
|
||
|
def match_scope(self, el: bs4.Tag) -> bool:
|
||
|
"""Match element as scope."""
|
||
|
|
||
|
return self.scope is el
|
||
|
|
||
|
def match_nth_tag_type(self, el: bs4.Tag, child: bs4.Tag) -> bool:
|
||
|
"""Match tag type for `nth` matches."""
|
||
|
|
||
|
return (
|
||
|
(self.get_tag(child) == self.get_tag(el)) and
|
||
|
(self.get_tag_ns(child) == self.get_tag_ns(el))
|
||
|
)
|
||
|
|
||
|
def match_nth(self, el: bs4.Tag, nth: bs4.Tag) -> bool:
|
||
|
"""Match `nth` elements."""
|
||
|
|
||
|
matched = True
|
||
|
|
||
|
for n in nth:
|
||
|
matched = False
|
||
|
if n.selectors and not self.match_selectors(el, n.selectors):
|
||
|
break
|
||
|
parent = self.get_parent(el)
|
||
|
if parent is None:
|
||
|
parent = self.create_fake_parent(el)
|
||
|
last = n.last
|
||
|
last_index = len(parent) - 1
|
||
|
index = last_index if last else 0
|
||
|
relative_index = 0
|
||
|
a = n.a
|
||
|
b = n.b
|
||
|
var = n.n
|
||
|
count = 0
|
||
|
count_incr = 1
|
||
|
factor = -1 if last else 1
|
||
|
idx = last_idx = a * count + b if var else a
|
||
|
|
||
|
# We can only adjust bounds within a variable index
|
||
|
if var:
|
||
|
# Abort if our nth index is out of bounds and only getting further out of bounds as we increment.
|
||
|
# Otherwise, increment to try to get in bounds.
|
||
|
adjust = None
|
||
|
while idx < 1 or idx > last_index:
|
||
|
if idx < 0:
|
||
|
diff_low = 0 - idx
|
||
|
if adjust is not None and adjust == 1:
|
||
|
break
|
||
|
adjust = -1
|
||
|
count += count_incr
|
||
|
idx = last_idx = a * count + b if var else a
|
||
|
diff = 0 - idx
|
||
|
if diff >= diff_low:
|
||
|
break
|
||
|
else:
|
||
|
diff_high = idx - last_index
|
||
|
if adjust is not None and adjust == -1:
|
||
|
break
|
||
|
adjust = 1
|
||
|
count += count_incr
|
||
|
idx = last_idx = a * count + b if var else a
|
||
|
diff = idx - last_index
|
||
|
if diff >= diff_high:
|
||
|
break
|
||
|
diff_high = diff
|
||
|
|
||
|
# If a < 0, our count is working backwards, so floor the index by increasing the count.
|
||
|
# Find the count that yields the lowest, in bound value and use that.
|
||
|
# Lastly reverse count increment so that we'll increase our index.
|
||
|
lowest = count
|
||
|
if a < 0:
|
||
|
while idx >= 1:
|
||
|
lowest = count
|
||
|
count += count_incr
|
||
|
idx = last_idx = a * count + b if var else a
|
||
|
count_incr = -1
|
||
|
count = lowest
|
||
|
idx = last_idx = a * count + b if var else a
|
||
|
|
||
|
# Evaluate elements while our calculated nth index is still in range
|
||
|
while 1 <= idx <= last_index + 1:
|
||
|
child = None
|
||
|
# Evaluate while our child index is still in range.
|
||
|
for child in self.get_children(parent, start=index, reverse=factor < 0, tags=False):
|
||
|
index += factor
|
||
|
if not self.is_tag(child):
|
||
|
continue
|
||
|
# Handle `of S` in `nth-child`
|
||
|
if n.selectors and not self.match_selectors(child, n.selectors):
|
||
|
continue
|
||
|
# Handle `of-type`
|
||
|
if n.of_type and not self.match_nth_tag_type(el, child):
|
||
|
continue
|
||
|
relative_index += 1
|
||
|
if relative_index == idx:
|
||
|
if child is el:
|
||
|
matched = True
|
||
|
else:
|
||
|
break
|
||
|
if child is el:
|
||
|
break
|
||
|
if child is el:
|
||
|
break
|
||
|
last_idx = idx
|
||
|
count += count_incr
|
||
|
if count < 0:
|
||
|
# Count is counting down and has now ventured into invalid territory.
|
||
|
break
|
||
|
idx = a * count + b if var else a
|
||
|
if last_idx == idx:
|
||
|
break
|
||
|
if not matched:
|
||
|
break
|
||
|
return matched
|
||
|
|
||
|
def match_empty(self, el: bs4.Tag) -> bool:
|
||
|
"""Check if element is empty (if requested)."""
|
||
|
|
||
|
is_empty = True
|
||
|
for child in self.get_children(el, tags=False):
|
||
|
if self.is_tag(child):
|
||
|
is_empty = False
|
||
|
break
|
||
|
elif self.is_content_string(child) and RE_NOT_EMPTY.search(child):
|
||
|
is_empty = False
|
||
|
break
|
||
|
return is_empty
|
||
|
|
||
|
def match_subselectors(self, el: bs4.Tag, selectors: tuple[ct.SelectorList, ...]) -> bool:
|
||
|
"""Match selectors."""
|
||
|
|
||
|
match = True
|
||
|
for sel in selectors:
|
||
|
if not self.match_selectors(el, sel):
|
||
|
match = False
|
||
|
return match
|
||
|
|
||
|
def match_contains(self, el: bs4.Tag, contains: tuple[ct.SelectorContains, ...]) -> bool:
|
||
|
"""Match element if it contains text."""
|
||
|
|
||
|
match = True
|
||
|
content = None # type: str | Sequence[str] | None
|
||
|
for contain_list in contains:
|
||
|
if content is None:
|
||
|
if contain_list.own:
|
||
|
content = self.get_own_text(el, no_iframe=self.is_html)
|
||
|
else:
|
||
|
content = self.get_text(el, no_iframe=self.is_html)
|
||
|
found = False
|
||
|
for text in contain_list.text:
|
||
|
if contain_list.own:
|
||
|
for c in content:
|
||
|
if text in c:
|
||
|
found = True
|
||
|
break
|
||
|
if found:
|
||
|
break
|
||
|
else:
|
||
|
if text in content:
|
||
|
found = True
|
||
|
break
|
||
|
if not found:
|
||
|
match = False
|
||
|
return match
|
||
|
|
||
|
def match_default(self, el: bs4.Tag) -> bool:
|
||
|
"""Match default."""
|
||
|
|
||
|
match = False
|
||
|
|
||
|
# Find this input's form
|
||
|
form = None
|
||
|
parent = self.get_parent(el, no_iframe=True)
|
||
|
while parent and form is None:
|
||
|
if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
|
||
|
form = parent
|
||
|
else:
|
||
|
parent = self.get_parent(parent, no_iframe=True)
|
||
|
|
||
|
# Look in form cache to see if we've already located its default button
|
||
|
found_form = False
|
||
|
for f, t in self.cached_default_forms:
|
||
|
if f is form:
|
||
|
found_form = True
|
||
|
if t is el:
|
||
|
match = True
|
||
|
break
|
||
|
|
||
|
# We didn't have the form cached, so look for its default button
|
||
|
if not found_form:
|
||
|
for child in self.get_descendants(form, no_iframe=True):
|
||
|
name = self.get_tag(child)
|
||
|
# Can't do nested forms (haven't figured out why we never hit this)
|
||
|
if name == 'form': # pragma: no cover
|
||
|
break
|
||
|
if name in ('input', 'button'):
|
||
|
v = self.get_attribute_by_name(child, 'type', '')
|
||
|
if v and util.lower(v) == 'submit':
|
||
|
self.cached_default_forms.append((form, child))
|
||
|
if el is child:
|
||
|
match = True
|
||
|
break
|
||
|
return match
|
||
|
|
||
|
def match_indeterminate(self, el: bs4.Tag) -> bool:
|
||
|
"""Match default."""
|
||
|
|
||
|
match = False
|
||
|
name = cast(str, self.get_attribute_by_name(el, 'name'))
|
||
|
|
||
|
def get_parent_form(el: bs4.Tag) -> bs4.Tag | None:
|
||
|
"""Find this input's form."""
|
||
|
form = None
|
||
|
parent = self.get_parent(el, no_iframe=True)
|
||
|
while form is None:
|
||
|
if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
|
||
|
form = parent
|
||
|
break
|
||
|
last_parent = parent
|
||
|
parent = self.get_parent(parent, no_iframe=True)
|
||
|
if parent is None:
|
||
|
form = last_parent
|
||
|
break
|
||
|
return form
|
||
|
|
||
|
form = get_parent_form(el)
|
||
|
|
||
|
# Look in form cache to see if we've already evaluated that its fellow radio buttons are indeterminate
|
||
|
found_form = False
|
||
|
for f, n, i in self.cached_indeterminate_forms:
|
||
|
if f is form and n == name:
|
||
|
found_form = True
|
||
|
if i is True:
|
||
|
match = True
|
||
|
break
|
||
|
|
||
|
# We didn't have the form cached, so validate that the radio button is indeterminate
|
||
|
if not found_form:
|
||
|
checked = False
|
||
|
for child in self.get_descendants(form, no_iframe=True):
|
||
|
if child is el:
|
||
|
continue
|
||
|
tag_name = self.get_tag(child)
|
||
|
if tag_name == 'input':
|
||
|
is_radio = False
|
||
|
check = False
|
||
|
has_name = False
|
||
|
for k, v in self.iter_attributes(child):
|
||
|
if util.lower(k) == 'type' and util.lower(v) == 'radio':
|
||
|
is_radio = True
|
||
|
elif util.lower(k) == 'name' and v == name:
|
||
|
has_name = True
|
||
|
elif util.lower(k) == 'checked':
|
||
|
check = True
|
||
|
if is_radio and check and has_name and get_parent_form(child) is form:
|
||
|
checked = True
|
||
|
break
|
||
|
if checked:
|
||
|
break
|
||
|
if not checked:
|
||
|
match = True
|
||
|
self.cached_indeterminate_forms.append((form, name, match))
|
||
|
|
||
|
return match
|
||
|
|
||
|
def match_lang(self, el: bs4.Tag, langs: tuple[ct.SelectorLang, ...]) -> bool:
|
||
|
"""Match languages."""
|
||
|
|
||
|
match = False
|
||
|
has_ns = self.supports_namespaces()
|
||
|
root = self.root
|
||
|
has_html_namespace = self.has_html_namespace
|
||
|
|
||
|
# Walk parents looking for `lang` (HTML) or `xml:lang` XML property.
|
||
|
parent = el
|
||
|
found_lang = None
|
||
|
last = None
|
||
|
while not found_lang:
|
||
|
has_html_ns = self.has_html_ns(parent)
|
||
|
for k, v in self.iter_attributes(parent):
|
||
|
attr_ns, attr = self.split_namespace(parent, k)
|
||
|
if (
|
||
|
((not has_ns or has_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or
|
||
|
(
|
||
|
has_ns and not has_html_ns and attr_ns == NS_XML and
|
||
|
(util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang'
|
||
|
)
|
||
|
):
|
||
|
found_lang = v
|
||
|
break
|
||
|
last = parent
|
||
|
parent = self.get_parent(parent, no_iframe=self.is_html)
|
||
|
|
||
|
if parent is None:
|
||
|
root = last
|
||
|
has_html_namespace = self.has_html_ns(root)
|
||
|
parent = last
|
||
|
break
|
||
|
|
||
|
# Use cached meta language.
|
||
|
if found_lang is None and self.cached_meta_lang:
|
||
|
for cache in self.cached_meta_lang:
|
||
|
if root is cache[0]:
|
||
|
found_lang = cache[1]
|
||
|
|
||
|
# If we couldn't find a language, and the document is HTML, look to meta to determine language.
|
||
|
if found_lang is None and (not self.is_xml or (has_html_namespace and root.name == 'html')):
|
||
|
# Find head
|
||
|
found = False
|
||
|
for tag in ('html', 'head'):
|
||
|
found = False
|
||
|
for child in self.get_children(parent, no_iframe=self.is_html):
|
||
|
if self.get_tag(child) == tag and self.is_html_tag(child):
|
||
|
found = True
|
||
|
parent = child
|
||
|
break
|
||
|
if not found: # pragma: no cover
|
||
|
break
|
||
|
|
||
|
# Search meta tags
|
||
|
if found:
|
||
|
for child in parent:
|
||
|
if self.is_tag(child) and self.get_tag(child) == 'meta' and self.is_html_tag(parent):
|
||
|
c_lang = False
|
||
|
content = None
|
||
|
for k, v in self.iter_attributes(child):
|
||
|
if util.lower(k) == 'http-equiv' and util.lower(v) == 'content-language':
|
||
|
c_lang = True
|
||
|
if util.lower(k) == 'content':
|
||
|
content = v
|
||
|
if c_lang and content:
|
||
|
found_lang = content
|
||
|
self.cached_meta_lang.append((cast(str, root), cast(str, found_lang)))
|
||
|
break
|
||
|
if found_lang is not None:
|
||
|
break
|
||
|
if found_lang is None:
|
||
|
self.cached_meta_lang.append((cast(str, root), ''))
|
||
|
|
||
|
# If we determined a language, compare.
|
||
|
if found_lang is not None:
|
||
|
for patterns in langs:
|
||
|
match = False
|
||
|
for pattern in patterns:
|
||
|
if self.extended_language_filter(pattern, cast(str, found_lang)):
|
||
|
match = True
|
||
|
if not match:
|
||
|
break
|
||
|
|
||
|
return match
|
||
|
|
||
|
def match_dir(self, el: bs4.Tag, directionality: int) -> bool:
|
||
|
"""Check directionality."""
|
||
|
|
||
|
# If we have to match both left and right, we can't match either.
|
||
|
if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL:
|
||
|
return False
|
||
|
|
||
|
if el is None or not self.is_html_tag(el):
|
||
|
return False
|
||
|
|
||
|
# Element has defined direction of left to right or right to left
|
||
|
direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(el, 'dir', '')), None)
|
||
|
if direction not in (None, 0):
|
||
|
return direction == directionality
|
||
|
|
||
|
# Element is the document element (the root) and no direction assigned, assume left to right.
|
||
|
is_root = self.is_root(el)
|
||
|
if is_root and direction is None:
|
||
|
return ct.SEL_DIR_LTR == directionality
|
||
|
|
||
|
# If `input[type=telephone]` and no direction is assigned, assume left to right.
|
||
|
name = self.get_tag(el)
|
||
|
is_input = name == 'input'
|
||
|
is_textarea = name == 'textarea'
|
||
|
is_bdi = name == 'bdi'
|
||
|
itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else ''
|
||
|
if is_input and itype == 'tel' and direction is None:
|
||
|
return ct.SEL_DIR_LTR == directionality
|
||
|
|
||
|
# Auto handling for text inputs
|
||
|
if ((is_input and itype in ('text', 'search', 'tel', 'url', 'email')) or is_textarea) and direction == 0:
|
||
|
if is_textarea:
|
||
|
value = ''.join(node for node in self.get_contents(el, no_iframe=True) if self.is_content_string(node))
|
||
|
else:
|
||
|
value = cast(str, self.get_attribute_by_name(el, 'value', ''))
|
||
|
if value:
|
||
|
for c in value:
|
||
|
bidi = unicodedata.bidirectional(c)
|
||
|
if bidi in ('AL', 'R', 'L'):
|
||
|
direction = ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
|
||
|
return direction == directionality
|
||
|
# Assume left to right
|
||
|
return ct.SEL_DIR_LTR == directionality
|
||
|
elif is_root:
|
||
|
return ct.SEL_DIR_LTR == directionality
|
||
|
return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
|
||
|
|
||
|
# Auto handling for `bdi` and other non text inputs.
|
||
|
if (is_bdi and direction is None) or direction == 0:
|
||
|
direction = self.find_bidi(el)
|
||
|
if direction is not None:
|
||
|
return direction == directionality
|
||
|
elif is_root:
|
||
|
return ct.SEL_DIR_LTR == directionality
|
||
|
return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
|
||
|
|
||
|
# Match parents direction
|
||
|
return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
|
||
|
|
||
|
def match_range(self, el: bs4.Tag, condition: int) -> bool:
|
||
|
"""
|
||
|
Match range.
|
||
|
|
||
|
Behavior is modeled after what we see in browsers. Browsers seem to evaluate
|
||
|
if the value is out of range, and if not, it is in range. So a missing value
|
||
|
will not evaluate out of range; therefore, value is in range. Personally, I
|
||
|
feel like this should evaluate as neither in or out of range.
|
||
|
"""
|
||
|
|
||
|
out_of_range = False
|
||
|
|
||
|
itype = util.lower(self.get_attribute_by_name(el, 'type'))
|
||
|
mn = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'min', None)))
|
||
|
mx = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'max', None)))
|
||
|
|
||
|
# There is no valid min or max, so we cannot evaluate a range
|
||
|
if mn is None and mx is None:
|
||
|
return False
|
||
|
|
||
|
value = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'value', None)))
|
||
|
if value is not None:
|
||
|
if itype in ("date", "datetime-local", "month", "week", "number", "range"):
|
||
|
if mn is not None and value < mn:
|
||
|
out_of_range = True
|
||
|
if not out_of_range and mx is not None and value > mx:
|
||
|
out_of_range = True
|
||
|
elif itype == "time":
|
||
|
if mn is not None and mx is not None and mn > mx:
|
||
|
# Time is periodic, so this is a reversed/discontinuous range
|
||
|
if value < mn and value > mx:
|
||
|
out_of_range = True
|
||
|
else:
|
||
|
if mn is not None and value < mn:
|
||
|
out_of_range = True
|
||
|
if not out_of_range and mx is not None and value > mx:
|
||
|
out_of_range = True
|
||
|
|
||
|
return not out_of_range if condition & ct.SEL_IN_RANGE else out_of_range
|
||
|
|
||
|
def match_defined(self, el: bs4.Tag) -> bool:
|
||
|
"""
|
||
|
Match defined.
|
||
|
|
||
|
`:defined` is related to custom elements in a browser.
|
||
|
|
||
|
- If the document is XML (not XHTML), all tags will match.
|
||
|
- Tags that are not custom (don't have a hyphen) are marked defined.
|
||
|
- If the tag has a prefix (without or without a namespace), it will not match.
|
||
|
|
||
|
This is of course requires the parser to provide us with the proper prefix and namespace info,
|
||
|
if it doesn't, there is nothing we can do.
|
||
|
"""
|
||
|
|
||
|
name = self.get_tag(el)
|
||
|
return (
|
||
|
name is not None and (
|
||
|
name.find('-') == -1 or
|
||
|
name.find(':') != -1 or
|
||
|
self.get_prefix(el) is not None
|
||
|
)
|
||
|
)
|
||
|
|
||
|
def match_placeholder_shown(self, el: bs4.Tag) -> bool:
|
||
|
"""
|
||
|
Match placeholder shown according to HTML spec.
|
||
|
|
||
|
- text area should be checked if they have content. A single newline does not count as content.
|
||
|
|
||
|
"""
|
||
|
|
||
|
match = False
|
||
|
content = self.get_text(el)
|
||
|
if content in ('', '\n'):
|
||
|
match = True
|
||
|
|
||
|
return match
|
||
|
|
||
|
def match_selectors(self, el: bs4.Tag, selectors: ct.SelectorList) -> bool:
|
||
|
"""Check if element matches one of the selectors."""
|
||
|
|
||
|
match = False
|
||
|
is_not = selectors.is_not
|
||
|
is_html = selectors.is_html
|
||
|
|
||
|
# Internal selector lists that use the HTML flag, will automatically get the `html` namespace.
|
||
|
if is_html:
|
||
|
namespaces = self.namespaces
|
||
|
iframe_restrict = self.iframe_restrict
|
||
|
self.namespaces = {'html': NS_XHTML}
|
||
|
self.iframe_restrict = True
|
||
|
|
||
|
if not is_html or self.is_html:
|
||
|
for selector in selectors:
|
||
|
match = is_not
|
||
|
# We have a un-matchable situation (like `:focus` as you can focus an element in this environment)
|
||
|
if isinstance(selector, ct.SelectorNull):
|
||
|
continue
|
||
|
# Verify tag matches
|
||
|
if not self.match_tag(el, selector.tag):
|
||
|
continue
|
||
|
# Verify tag is defined
|
||
|
if selector.flags & ct.SEL_DEFINED and not self.match_defined(el):
|
||
|
continue
|
||
|
# Verify element is root
|
||
|
if selector.flags & ct.SEL_ROOT and not self.match_root(el):
|
||
|
continue
|
||
|
# Verify element is scope
|
||
|
if selector.flags & ct.SEL_SCOPE and not self.match_scope(el):
|
||
|
continue
|
||
|
# Verify element has placeholder shown
|
||
|
if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el):
|
||
|
continue
|
||
|
# Verify `nth` matches
|
||
|
if not self.match_nth(el, selector.nth):
|
||
|
continue
|
||
|
if selector.flags & ct.SEL_EMPTY and not self.match_empty(el):
|
||
|
continue
|
||
|
# Verify id matches
|
||
|
if selector.ids and not self.match_id(el, selector.ids):
|
||
|
continue
|
||
|
# Verify classes match
|
||
|
if selector.classes and not self.match_classes(el, selector.classes):
|
||
|
continue
|
||
|
# Verify attribute(s) match
|
||
|
if not self.match_attributes(el, selector.attributes):
|
||
|
continue
|
||
|
# Verify ranges
|
||
|
if selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES):
|
||
|
continue
|
||
|
# Verify language patterns
|
||
|
if selector.lang and not self.match_lang(el, selector.lang):
|
||
|
continue
|
||
|
# Verify pseudo selector patterns
|
||
|
if selector.selectors and not self.match_subselectors(el, selector.selectors):
|
||
|
continue
|
||
|
# Verify relationship selectors
|
||
|
if selector.relation and not self.match_relations(el, selector.relation):
|
||
|
continue
|
||
|
# Validate that the current default selector match corresponds to the first submit button in the form
|
||
|
if selector.flags & ct.SEL_DEFAULT and not self.match_default(el):
|
||
|
continue
|
||
|
# Validate that the unset radio button is among radio buttons with the same name in a form that are
|
||
|
# also not set.
|
||
|
if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el):
|
||
|
continue
|
||
|
# Validate element directionality
|
||
|
if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS):
|
||
|
continue
|
||
|
# Validate that the tag contains the specified text.
|
||
|
if selector.contains and not self.match_contains(el, selector.contains):
|
||
|
continue
|
||
|
match = not is_not
|
||
|
break
|
||
|
|
||
|
# Restore actual namespaces being used for external selector lists
|
||
|
if is_html:
|
||
|
self.namespaces = namespaces
|
||
|
self.iframe_restrict = iframe_restrict
|
||
|
|
||
|
return match
|
||
|
|
||
|
def select(self, limit: int = 0) -> Iterator[bs4.Tag]:
|
||
|
"""Match all tags under the targeted tag."""
|
||
|
|
||
|
lim = None if limit < 1 else limit
|
||
|
|
||
|
for child in self.get_descendants(self.tag):
|
||
|
if self.match(child):
|
||
|
yield child
|
||
|
if lim is not None:
|
||
|
lim -= 1
|
||
|
if lim < 1:
|
||
|
break
|
||
|
|
||
|
def closest(self) -> bs4.Tag | None:
|
||
|
"""Match closest ancestor."""
|
||
|
|
||
|
current = self.tag
|
||
|
closest = None
|
||
|
while closest is None and current is not None:
|
||
|
if self.match(current):
|
||
|
closest = current
|
||
|
else:
|
||
|
current = self.get_parent(current)
|
||
|
return closest
|
||
|
|
||
|
def filter(self) -> list[bs4.Tag]: # noqa A001
|
||
|
"""Filter tag's children."""
|
||
|
|
||
|
return [tag for tag in self.get_contents(self.tag) if not self.is_navigable_string(tag) and self.match(tag)]
|
||
|
|
||
|
def match(self, el: bs4.Tag) -> bool:
|
||
|
"""Match."""
|
||
|
|
||
|
return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors)
|
||
|
|
||
|
|
||
|
class SoupSieve(ct.Immutable):
|
||
|
"""Compiled Soup Sieve selector matching object."""
|
||
|
|
||
|
pattern: str
|
||
|
selectors: ct.SelectorList
|
||
|
namespaces: ct.Namespaces | None
|
||
|
custom: dict[str, str]
|
||
|
flags: int
|
||
|
|
||
|
__slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash")
|
||
|
|
||
|
def __init__(
|
||
|
self,
|
||
|
pattern: str,
|
||
|
selectors: ct.SelectorList,
|
||
|
namespaces: ct.Namespaces | None,
|
||
|
custom: ct.CustomSelectors | None,
|
||
|
flags: int
|
||
|
):
|
||
|
"""Initialize."""
|
||
|
|
||
|
super().__init__(
|
||
|
pattern=pattern,
|
||
|
selectors=selectors,
|
||
|
namespaces=namespaces,
|
||
|
custom=custom,
|
||
|
flags=flags
|
||
|
)
|
||
|
|
||
|
def match(self, tag: bs4.Tag) -> bool:
|
||
|
"""Match."""
|
||
|
|
||
|
return CSSMatch(self.selectors, tag, self.namespaces, self.flags).match(tag)
|
||
|
|
||
|
def closest(self, tag: bs4.Tag) -> bs4.Tag:
|
||
|
"""Match closest ancestor."""
|
||
|
|
||
|
return CSSMatch(self.selectors, tag, self.namespaces, self.flags).closest()
|
||
|
|
||
|
def filter(self, iterable: Iterable[bs4.Tag]) -> list[bs4.Tag]: # noqa A001
|
||
|
"""
|
||
|
Filter.
|
||
|
|
||
|
`CSSMatch` can cache certain searches for tags of the same document,
|
||
|
so if we are given a tag, all tags are from the same document,
|
||
|
and we can take advantage of the optimization.
|
||
|
|
||
|
Any other kind of iterable could have tags from different documents or detached tags,
|
||
|
so for those, we use a new `CSSMatch` for each item in the iterable.
|
||
|
"""
|
||
|
|
||
|
if CSSMatch.is_tag(iterable):
|
||
|
return CSSMatch(self.selectors, iterable, self.namespaces, self.flags).filter()
|
||
|
else:
|
||
|
return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)]
|
||
|
|
||
|
def select_one(self, tag: bs4.Tag) -> bs4.Tag:
|
||
|
"""Select a single tag."""
|
||
|
|
||
|
tags = self.select(tag, limit=1)
|
||
|
return tags[0] if tags else None
|
||
|
|
||
|
def select(self, tag: bs4.Tag, limit: int = 0) -> list[bs4.Tag]:
|
||
|
"""Select the specified tags."""
|
||
|
|
||
|
return list(self.iselect(tag, limit))
|
||
|
|
||
|
def iselect(self, tag: bs4.Tag, limit: int = 0) -> Iterator[bs4.Tag]:
|
||
|
"""Iterate the specified tags."""
|
||
|
|
||
|
yield from CSSMatch(self.selectors, tag, self.namespaces, self.flags).select(limit)
|
||
|
|
||
|
def __repr__(self) -> str: # pragma: no cover
|
||
|
"""Representation."""
|
||
|
|
||
|
return (
|
||
|
f"SoupSieve(pattern={self.pattern!r}, namespaces={self.namespaces!r}, "
|
||
|
f"custom={self.custom!r}, flags={self.flags!r})"
|
||
|
)
|
||
|
|
||
|
__str__ = __repr__
|
||
|
|
||
|
|
||
|
ct.pickle_register(SoupSieve)
|