from datetime import datetime
from copy import deepcopy
import dateparser
from dateparser.search import search_dates
from pytz import timezone
# Separator used by join() when gluing text fragments together.
SPACE = " "
# pytz timezone object for US Eastern time; used by today() and as the
# tzinfo of the default RELATIVE_BASE below.
EASTERN = timezone("US/Eastern")
# Shared settings dict passed to dateparser.parse() and search_dates().
# parse_date() and find_dates() hand this very object to dateparser when no
# base is supplied, so treat it as read-only.
DATE_PARSER_SETTINGS = {
    # Ambiguous numeric dates are read month/day/year, regardless of locale.
    "DATE_ORDER": "MDY",
    "PREFER_LOCALE_DATE_ORDER": False,
    # Interpret inputs as, and convert results to, US Eastern time.
    "TIMEZONE": "US/Eastern",
    "TO_TIMEZONE": "US/Eastern",
    "RETURN_AS_TIMEZONE_AWARE": True,
    # Fill-in preferences for incomplete dates (see the dateparser settings
    # documentation for the exact semantics of these two options).
    "PREFER_DAY_OF_MONTH": "first",
    "PREFER_MONTH_OF_YEAR": "last",
    # NOTE: evaluated once, when this module is imported. Relative
    # expressions are therefore resolved against import time, not the time of
    # the call, unless the caller supplies its own base.
    "RELATIVE_BASE": datetime.now(EASTERN),
}
def process(obj: str | list[str]) -> str:
    """Collapse a string, or a list of strings, into one clean string.

    A lone string is treated as a one-element list; the actual cleaning is
    delegated to ``process_list``.

    Args:
        obj: A single string or a list of strings.

    Returns:
        The whitespace-normalized, space-joined text.

    Raises:
        ValueError: If ``obj`` is neither a ``str`` nor a ``list``.
    """
    if isinstance(obj, str):
        # Promote the single string so both cases share one code path.
        obj = [obj]
    elif not isinstance(obj, list):
        raise ValueError(f"Invalid type: {type(obj)}")
    return process_list(obj)
def process_list(obj: list[str]) -> str:
    """Clean and join raw text fragments.

    Use this for raw text, when the inner strings are not clean: whitespace
    inside each fragment is normalized before the fragments themselves are
    joined.

    Args:
        obj: Text fragments, possibly containing stray or repeated whitespace.

    Returns:
        One string with the non-empty cleaned fragments separated by a
        single space.
    """
    cleaned_fragments = []
    for fragment in obj:
        # Splitting on whitespace and re-joining collapses every run of
        # whitespace inside the fragment.
        words = fragment.strip().split()
        cleaned_fragments.append(join(words))
    return join(cleaned_fragments)
def join(obj: list[str]) -> str:
    """Join already-clean strings with a single space, skipping empty ones.

    Use this for faster processing of clean text, when the inner strings are
    already clean; no whitespace normalization is done here.

    Args:
        obj: Strings to concatenate.

    Returns:
        The zero-length-free items of ``obj`` joined by ``SPACE``.
    """
    # filter(len, ...) keeps exactly the items whose length is non-zero.
    non_empty = filter(len, obj)
    return SPACE.join(non_empty)
def today() -> str:
    """Return the current US Eastern date-time as an ISO 8601 string."""
    eastern_now = datetime.now(tz=EASTERN)
    return eastern_now.isoformat()
def xpath_match_class(class_name: str) -> str:
    """Build an XPath predicate that matches one whole CSS class token.

    The element's ``class`` attribute is whitespace-normalized and padded
    with spaces, and the class name is padded the same way, so the
    ``contains`` test only succeeds on a complete token rather than on a
    substring of another class name.

    Args:
        class_name: The class token to look for.

    Returns:
        The XPath boolean expression as a string (without surrounding
        brackets).
    """
    padded_attribute = "concat(' ', normalize-space(@class), ' ')"
    padded_token = f" {class_name} "
    return f"contains({padded_attribute}, '{padded_token}')"
def parse_date(date: str, base: str | datetime | None = None) -> str:
    """Parse an English date expression into an ISO 8601 string.

    Args:
        date: The text to parse (absolute or relative).
        base: Reference point for relative expressions. May be a datetime,
            a string that is itself parsed with dateparser, or None to use
            the current US Eastern time. A string base that cannot be
            parsed is treated like None.

    Returns:
        The parsed date-time formatted with ``datetime.isoformat()``.

    Raises:
        AttributeError: If no date can be parsed from ``date``
            (``dateparser.parse`` returns None). This is intentional so
            that bad input fails loudly.
    """
    # Work on a per-call copy with a fresh "now". The module-level
    # RELATIVE_BASE is captured once at import time and would otherwise make
    # relative dates drift in a long-running process; copying also means the
    # shared settings dict is never handed to dateparser.
    settings = {**DATE_PARSER_SETTINGS, "RELATIVE_BASE": datetime.now(EASTERN)}
    if isinstance(base, str):
        base = dateparser.parse(base, languages=["en"], settings=settings)
    if base is not None:
        settings["RELATIVE_BASE"] = base
    parsed = dateparser.parse(date, languages=["en"], settings=settings)
    # Deliberately unguarded: a None result raises here, which is desirable.
    return parsed.isoformat()
# Test before using.
def find_dates(text: str, base: str | datetime | None = None) -> list[str]:
    """Find every date mentioned in English text.

    Args:
        text: Free text to scan with ``dateparser.search.search_dates``.
        base: Reference point for relative expressions. May be a datetime,
            a string that is itself parsed with dateparser, or None to use
            the current US Eastern time. A string base that cannot be
            parsed is treated like None.

    Returns:
        The matched dates formatted with ``datetime.isoformat()``, in the
        order ``search_dates`` reports them; an empty list when it finds
        none.
    """
    # Work on a per-call copy with a fresh "now". The module-level
    # RELATIVE_BASE is captured once at import time and would otherwise make
    # relative dates drift in a long-running process; copying also means the
    # shared settings dict is never handed to dateparser.
    settings = {**DATE_PARSER_SETTINGS, "RELATIVE_BASE": datetime.now(EASTERN)}
    if isinstance(base, str):
        base = dateparser.parse(base, languages=["en"], settings=settings)
    if base is not None:
        settings["RELATIVE_BASE"] = base
    results = search_dates(text, languages=["en"], settings=settings)
    if results is None:
        return []
    # Each result is a (matched_text, datetime) pair; only the datetime is kept.
    return [found.isoformat() for _, found in results]