import hashlib
import html
import logging
import re
from functools import cached_property
from typing import (
Any,
Callable,
Iterable,
List,
Optional,
Tuple,
TypeVar,
)
import bs4
from unidecode import unidecode
from ..product._product import Product
# Module-wide logger. The parser classes below use it to report parsing
# progress (info/debug) and recoverable problems (warning).
logger = logging.getLogger('freshpointsync.parser')
"""Logger for the `freshpointsync.parser` package."""
[docs]
def normalize_text(text: object) -> str:
    """Return a normalized form of the given value for loose comparisons.

    The value is converted to a string, its leading and trailing whitespace
    is removed, diacritics are removed with `unidecode`, and the result is
    case-folded. `None` is converted to an empty string.

    Args:
        text (Any): The value to be normalized.

    Returns:
        str: The normalized text.
    """
    if text is None:
        return ''
    stripped = str(text).strip()
    ascii_text = unidecode(stripped)
    return ascii_text.casefold()
[docs]
def hash_text(text: str) -> str:
    """Compute the SHA-256 digest of the given text.

    The text is encoded as UTF-8 before hashing.

    Args:
        text (str): The text to be hashed.

    Returns:
        str: The SHA-256 hash of the input text in hexadecimal format.
    """
    digest = hashlib.sha256()
    digest.update(text.encode('utf-8'))
    return digest.hexdigest()
# Generic type variable: the return type of the converter callables accepted
# by `ProductHTMLParser._run_converter`.
T = TypeVar('T')
class ProductHTMLParser:
    """A parser utility for extracting product information from HTML tags.

    This class provides static and class methods to parse various attributes
    of a product from its HTML representation. It's designed to work with
    BeautifulSoup `Tag` objects, extracting data such as product name,
    ID number, pricing, availability, etc.
    """

    # Normalized quantity label, e.g. "posledni kus", "2 kusy" or "5 kusu!".
    _QUANTITY_PATTERN = re.compile(r'^((posledni)|(\d+))\s(kus|kusy|kusu)!?$')
    # Normalized price label, e.g. "42.90".
    _PRICE_PATTERN = re.compile(r'^\d+\.\d+$')

    @staticmethod
    def _extract_single_tag(resultset: bs4.ResultSet) -> bs4.Tag:
        """Get a single Tag in a ResultSet.

        Args:
            resultset (bs4.ResultSet): A `bs4.ResultSet` object
                expected to contain exactly one `bs4.Tag` object.

        Returns:
            bs4.Tag: The Tag contained in the provided `resultset`.

        Raises:
            ValueError: If `resultset` does not contain exactly one Tag.
            TypeError: If the extracted element is not a `bs4.Tag` object.
        """
        count = len(resultset)
        if count == 0:
            raise ValueError('ResultSet is empty (expected one Tag element).')
        if count != 1:
            raise ValueError(
                f'Unexpected number of elements in the ResultSet '
                f'(expected 1, got {count}).'
            )
        element = resultset[0]
        if not isinstance(element, bs4.Tag):
            raise TypeError(
                f'The element in the ResultSet is not a Tag object. '
                f'(got type "{type(element).__name__}").'
            )
        return element

    @staticmethod
    def _get_attr_value(attr_name: str, tag: bs4.Tag) -> str:
        """Get the value of a specified attribute from a Tag.

        Args:
            attr_name (str): The name of the attribute to retrieve.
            tag (bs4.Tag): The Tag to extract the attribute from.

        Returns:
            str: The value of the specified attribute with leading and
                trailing whitespace removed.

        Raises:
            KeyError: If the attribute is missing.
            ValueError: If the attribute is not a string.
        """
        try:
            attr = tag[attr_name]
        except KeyError as err:
            raise KeyError(
                f'Product attributes do not contain keyword "{attr_name}".'
            ) from err
        if not isinstance(attr, str):
            raise ValueError(
                f'Unexpected "{attr_name}" attribute parsing results: '
                f'attribute value is expected to be a string '
                f'(got type "{type(attr).__name__}").'
            )
        return attr.strip()

    @classmethod
    def find_name(cls, product_data: bs4.Tag) -> str:
        """Extract the product name from the given product data.

        The name is read from the "data-name" attribute and HTML entities
        in it are unescaped.
        """
        return html.unescape(cls._get_attr_value('data-name', product_data))

    @classmethod
    def find_id(cls, product_data: bs4.Tag) -> int:
        """Extract the product ID number from the given product data.

        The ID is read from the "data-id" attribute and converted to `int`.
        """
        return int(cls._get_attr_value('data-id', product_data))

    @classmethod
    def find_is_vegetarian(cls, product_data: bs4.Tag) -> bool:
        """Determine whether the product is vegetarian
        from the given product data (the "data-veggie" attribute is "1").
        """
        return cls._get_attr_value('data-veggie', product_data) == '1'

    @classmethod
    def find_is_gluten_free(cls, product_data: bs4.Tag) -> bool:
        """Determine whether the product is gluten-free
        from the given product data (the "data-glutenfree" attribute is "1").
        """
        return cls._get_attr_value('data-glutenfree', product_data) == '1'

    @classmethod
    def find_info(cls, product_data: bs4.Tag) -> str:
        """Extract the product info from the given product data.

        The info is read from the "data-info" attribute. HTML entities are
        unescaped, a trailing "<br />" is removed from every line, lines are
        stripped of surrounding whitespace, and empty lines are dropped.

        Returns:
            str: The cleaned-up info lines joined with newline characters.
        """
        text = html.unescape(cls._get_attr_value('data-info', product_data))
        line_break = '<br />'
        lines = []
        for line in text.split('\n'):
            line_stripped = line.rstrip()
            if line_stripped.endswith(line_break):
                line_stripped = line_stripped[: -len(line_break)]
            line_stripped = line_stripped.strip()
            if line_stripped:
                lines.append(line_stripped)
        return '\n'.join(lines)

    @classmethod
    def find_pic_url(cls, product_data: bs4.Tag) -> str:
        """Extract the URL of the product's picture
        from the given product data (the "data-photourl" attribute).
        """
        return cls._get_attr_value('data-photourl', product_data)

    @classmethod
    def find_category(cls, product_data: bs4.Tag) -> str:
        """Extract the product category from the given product data.

        The category is the text of the single non-empty <h2/> tag found
        within the parent of the product tag.

        Raises:
            AttributeError: If the product tag has no parent.
            ValueError: If the category heading cannot be extracted
                unambiguously from the parent.
        """
        if product_data.parent is None:
            raise AttributeError(
                f'Unable to extract product category name for product '
                f'"id={cls._find_id_safe(product_data)}" from the provided '
                f'html data (parent data is missing).'
            )
        # 'string=bool' filters out empty strings and None values
        category = product_data.parent.find_all(name='h2', string=bool)
        try:
            return cls._extract_single_tag(category).text.strip()  # type: ignore
        except Exception as exp:
            raise ValueError(
                f'Unable to extract product category name for product '
                f'"id={cls._find_id_safe(product_data)}" from the provided '
                f'html data ({exp}).'
            ) from exp

    @classmethod
    def _find_id_safe(cls, product_data: bs4.Tag) -> str:
        """Extract the product ID number from the given product data. If the ID
        is not found, catch the raised exception and return a placeholder.

        Returns:
            str: The product ID as a string, or "?" if it cannot be parsed.
        """
        try:
            return str(cls.find_id(product_data))
        except Exception as e:
            # Best-effort helper used while building error messages: never
            # let a missing ID mask the original problem.
            logger.warning(
                'Unable to extract product ID from the provided html data '
                '(%s).',
                e,
            )
            return '?'

    @classmethod
    def _run_converter(
        cls, converter: Callable[[], T], product_data: bs4.Tag
    ) -> T:
        """Run the given converter function and return the converted value.

        Args:
            converter (Callable[[], T]): The converter function
                to be executed. It is called without arguments.
            product_data (bs4.Tag): The product data the converted value
                belongs to. It is only used to report the product ID
                if the conversion fails.

        Returns:
            T: The converted value.

        Raises:
            ValueError: If an error occurs during the conversion process.
        """
        try:
            return converter()
        except Exception as exc:
            raise ValueError(
                f'Unable to convert a parsed value for the product '
                f'"id={cls._find_id_safe(product_data)}".'
            ) from exc

    @classmethod
    def find_quantity(cls, product_data: bs4.Tag) -> int:
        """Extract the quantity of the product from the given product data.

        Returns:
            int: 0 for sold-out products, 1 for products labelled as the
                last piece, otherwise the number parsed from the quantity
                label.

        Raises:
            ValueError: If more than one quantity label is found or
                the quantity cannot be converted to an integer.
        """
        if 'sold-out' in product_data.attrs.get('class', ()):
            return 0
        result = product_data.find_all(
            name='span',
            string=lambda text: bool(
                text and cls._QUANTITY_PATTERN.match(normalize_text(text))
            ),
        )
        if not result:  # sold out products don't have the quantity text
            return 0  # (should be caught by the "sold-out" check above)
        quantity = normalize_text(cls._extract_single_tag(result).text)
        if 'posledn' in quantity:  # products that have only 1 item in stock
            return 1  # have "posledni" in the quantity text
        return cls._run_converter(
            lambda: int(quantity.split()[0]),  # regular ("2 kusy", "5 kusu")
            product_data,
        )

    @classmethod
    def find_price(cls, product_data: bs4.Tag) -> Tuple[float, float]:
        """Extract the full and current price of the product
        from the given product data.

        Returns:
            Tuple[float, float]: The full price and the current price,
                in this order. If only one price is present in the data,
                both values are equal.

        Raises:
            ValueError: If the data does not contain one or two prices,
                if a price cannot be converted to a number, or if
                the current price is greater than the full price.
        """
        result = product_data.find_all(
            name='span',
            string=lambda text: bool(
                text and cls._PRICE_PATTERN.match(normalize_text(text))
            ),
        )
        if len(result) not in (1, 2):
            raise ValueError(
                f'Unexpected number of elements in the ResultSet '
                f'(expected 1 or 2, got {len(result)}).'
            )
        prices = [
            cls._run_converter(lambda tag=tag: float(tag.text), product_data)
            for tag in result
        ]
        # The first price is the full one; the second one (if present) is
        # the current price. With a single price, both are the same.
        price_full, price_curr = prices[0], prices[-1]
        if price_curr > price_full:
            id_ = cls._find_id_safe(product_data)
            raise ValueError(
                f'Unexpected product "id={id_}" parsing results: '
                f'current price "{price_curr}" is greater than '
                f'the regular full price "{price_full}".'
            )
        # NOTE: a lower current price is deliberately not cross-checked
        # against the "data-ispromo" attribute, which is unreliable.
        return price_full, price_curr
[docs]
class ProductPageHTMLParser:
    """A parser for processing HTML contents of a FreshPoint.cz web page.

    The page HTML is parsed with BeautifulSoup and the data of the products
    listed on the page is extracted from it. Products can be looked up
    by name, by ID, or by both.
    """

    def __init__(self, page_html: str) -> None:
        """Initialize the parser with HTML contents of a product page.

        Args:
            page_html (str): HTML contents of the product page.
        """
        logger.info('Parsing page data...')
        self._bs4_parser = bs4.BeautifulSoup(page_html, 'lxml')

    @cached_property
    def page_id(self) -> int:
        """Page ID (extracted from
        the page HTML <script/> tag with the "deviceId" text).

        Raises:
            ValueError: If the <script/> tag is not found or the ID
                cannot be parsed from its text.
        """
        script_tag = self._bs4_parser.find(
            'script', string=re.compile('deviceId')
        )
        if not script_tag:
            raise ValueError(
                'Unable to parse page ID '
                '(<script/> tag with "deviceId" text was not found).'
            )
        match = re.search(r'deviceId\s*=\s*"(.*?)"', script_tag.get_text())
        if not match:
            raise ValueError(
                'Unable to parse page ID ("deviceId" text '
                'within the <script/> tag was not matched).'
            )
        try:
            self._page_id = int(match.group(1))
        except Exception as e:
            raise ValueError('Unable to parse page ID.') from e
        return self._page_id

    @cached_property
    def location_name(self) -> str:
        """The name of the location (extracted from
        the page HTML <title/> tag).

        The name is the part of the title text that precedes the first "|"
        character, stripped of surrounding whitespace.

        Raises:
            ValueError: If the <title/> tag is not found or the name
                cannot be parsed from its text.
        """
        title_tag = self._bs4_parser.find('title')
        if not title_tag:
            raise ValueError(
                'Unable to parse location name (<title/> tag was not found).'
            )
        title_text = title_tag.get_text()
        try:
            head, _, _ = title_text.partition('|')
            location_name = head.strip()
        except Exception as e:
            raise ValueError('Unable to parse location name.') from e
        return location_name  # type: ignore

    @cached_property
    def products(self) -> Tuple[Product, ...]:
        """A tuple of `Product` instances parsed from the page HTML."""
        return self.find_products()

    def _find_product_data(
        self, name: Optional[str], id_: Optional[int]
    ) -> bs4.ResultSet:
        """A helper method to find raw HTML data for products matching
        the specified name or ID. Both filters can be applied at once.

        Args:
            name (str | None): The name of the product to search for. If None,
                ignores the name attribute in filtering.
            id_ (int | None): The ID of the product to search for. If None,
                ignores the ID attribute in filtering.

        Returns:
            bs4.ResultSet: A BeautifulSoup ResultSet containing
                the found product elements' data.
        """
        logger.debug(
            'Searching for products with attributes "name=%s", "id=%s"...',
            name or 'any',
            'any' if id_ is None else str(id_),
        )

        def has_product_class(value):
            return value and 'product' in value

        def name_matches(value):
            # partial, normalized (case- and diacritics-insensitive) match
            return value and (normalize_text(name) in normalize_text(value))

        def id_matches(value):
            # exact match of the normalized attribute value
            return value and (str(id_) == normalize_text(value))

        filters = {'class': has_product_class}
        if name is not None:
            filters['data-name'] = name_matches
        if id_ is not None:
            filters['data-id'] = id_matches
        return self._bs4_parser.find_all('div', attrs=filters)

    def _parse_product_data(self, product_data: bs4.Tag) -> Product:
        """A helper method to parse the product data to a `Product` object.

        Args:
            product_data (bs4.Tag): The Tag containing the product data.

        Returns:
            Product: An instance of the `Product` class
                containing the parsed data.
        """
        parse = ProductHTMLParser
        # The values are parsed in a fixed order (prices first), so that
        # the first failing field determines the raised exception.
        price_full, price_curr = parse.find_price(product_data)
        product_id = parse.find_id(product_data)
        name = parse.find_name(product_data)
        category = parse.find_category(product_data)
        is_vegetarian = parse.find_is_vegetarian(product_data)
        is_gluten_free = parse.find_is_gluten_free(product_data)
        quantity = parse.find_quantity(product_data)
        info = parse.find_info(product_data)
        pic_url = parse.find_pic_url(product_data)
        location_id = self.page_id
        location = self.location_name
        return Product(
            id_=product_id,
            name=name,
            category=category,
            is_vegetarian=is_vegetarian,
            is_gluten_free=is_gluten_free,
            quantity=quantity,
            price_curr=price_curr,
            price_full=price_full,
            info=info,
            pic_url=pic_url,
            location_id=location_id,
            location=location,
        )

    def find_product(
        self,
        name: Optional[str] = None,
        id_: Optional[int] = None,
    ) -> Product:
        """Find a single product based on the specified name and/or ID.

        Args:
            name (str | None): The name of the product to filter by. Note that
                product names are normalized to lowercase ASCII characters for
                matching, allowing for partial and case-insensitive matches.
                If None, name filtering is not applied.
            id_ (int | None): The ID of the product to filter by. The ID match
                is exact. If None, ID filtering is not applied.

        Returns:
            Product: A `Product` object with the specified name and/or ID.

        Raises:
            ValueError: If the product with the specified name and/or ID
                is not found or if multiple products match the criteria
                (i.e., the result is not unique).
        """
        matches = self._find_product_data(name, id_)
        if len(matches) == 1:
            return self._parse_product_data(matches[0])
        name_str = name if name else 'any'
        id_str = 'any' if id_ is None else str(id_)
        problem = 'was not found.' if len(matches) == 0 else 'is not unique.'
        raise ValueError(
            f'Product with attributes "name={name_str}", "id={id_str}" '
            f'{problem}'
        )

    def find_products(self, name: Optional[str] = None) -> Tuple[Product, ...]:
        """Find all products based on the specified name. If the name
        is not specified, all products on the page are returned.

        Args:
            name (str | None): The name of the product to filter by. Note that
                product names are normalized to lowercase ASCII characters for
                matching, allowing for partial and case-insensitive matches.
                If None, retrieves all products.

        Returns:
            tuple[Product]: `Product` objects with the specified name.
        """
        found = self._find_product_data(name, None)
        return tuple(map(self._parse_product_data, found))
[docs]
def parse_page_contents(page_html: str) -> Tuple[Product, ...]:
    """Parse the HTML contents of a FreshPoint.cz web page and extract
    product information.

    This is a convenience wrapper that creates a `ProductPageHTMLParser`
    for the given HTML and returns its `products`.

    Args:
        page_html (str): HTML contents of the product page.

    Returns:
        tuple[Product]: A tuple of `Product` instances parsed from the page HTML.
    """
    return ProductPageHTMLParser(page_html).products
[docs]
class ProductFinder:
    """A utility for searching and filtering products based on certain
    attributes and constraints. This class provides class methods to find
    either a single product or a list of products from a collection of
    `Product` instances.
    """

    @classmethod
    def product_matches(
        cls,
        product: Product,
        constraint: Optional[Callable[[Product], bool]] = None,
        **attributes: Any,
    ) -> bool:
        """Check if a product matches the given attributes and an optional
        constraint.

        Args:
            product (Product): The product to check.
            constraint (Optional[Callable[[Product], bool]]):
                An optional function that takes a `Product` instance as input
                and returns a boolean indicating whether a certain constraint
                is met for this instance. It is evaluated before
                the attributes are compared.
            **attributes (Any):
                Arbitrary keyword arguments representing the product
                attributes and properties and their expected values for
                the product to match.

        Returns:
            bool:
                True if the product matches the given attributes and
                constraint, False otherwise.
        """
        if constraint is not None and not constraint(product):
            return False
        for attr_name, expected in attributes.items():
            if not (getattr(product, attr_name) == expected):
                return False
        return True

    @classmethod
    def find_product(
        cls,
        products: Iterable[Product],
        constraint: Optional[Callable[[Product], bool]] = None,
        **attributes: Any,
    ) -> Optional[Product]:
        """Find a single product in an iterable of products that matches
        the given attributes and an optional constraint.

        Args:
            products (Iterable[Product]):
                An iterable collection of `Product` instances.
            constraint (Optional[Callable[[Product], bool]]):
                An optional function that takes a `Product` instance as input
                and returns a boolean indicating whether a certain constraint
                is met for this instance.
            **attributes (Any):
                Arbitrary keyword arguments representing the product
                attributes and properties and their expected values for
                the product to match.

        Returns:
            Optional[Product]:
                The first product in the iterable that matches the given
                attributes and constraint, or None if no such product is found.
        """
        matching = (
            candidate
            for candidate in products
            if cls.product_matches(candidate, constraint, **attributes)
        )
        # the generator is lazy, so the iteration stops at the first match
        return next(matching, None)

    @classmethod
    def find_products(
        cls,
        products: Iterable[Product],
        constraint: Optional[Callable[[Product], bool]] = None,
        **attributes: Any,
    ) -> List[Product]:
        """Find all products in an iterable of products that match
        the given attributes and an optional constraint.

        Args:
            products (Iterable[Product]):
                An iterable collection of `Product` instances.
            constraint (Optional[Callable[[Product], bool]]):
                An optional function that takes a `Product` instance as input
                and returns a boolean indicating whether a certain constraint
                is met for this instance.
            **attributes (Any):
                Arbitrary keyword arguments representing the product
                attributes and properties and their expected values for
                the products to match.

        Returns:
            list[Product]:
                A list of all products in the iterable that match the given
                attributes and constraint, in the iteration order.
        """
        return [
            candidate
            for candidate in products
            if cls.product_matches(candidate, constraint, **attributes)
        ]