"""Base crawler module with shared utilities.""" import gzip import io import json import time import sqlite3 import urllib.request import urllib.parse import urllib.error from datetime import datetime, timezone from pathlib import Path DB_PATH = Path(__file__).parent.parent / "database" / "providers.db" CRAWL_DELAY = 1.0 # seconds between requests (courtesy) USER_AGENT = ( "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36" ) def fetch_url(url: str, method: str = "GET", data: dict | None = None, headers: dict | None = None, timeout: int = 30) -> str: """Fetch a URL and return the response body as text.""" hdrs = {"User-Agent": USER_AGENT} if headers: hdrs.update(headers) body = None if data and method == "POST": body = urllib.parse.urlencode(data, doseq=True).encode("utf-8") hdrs.setdefault("Content-Type", "application/x-www-form-urlencoded") elif data and method == "GET": url = url + "?" + urllib.parse.urlencode(data, doseq=True) req = urllib.request.Request(url, data=body, headers=hdrs, method=method) with urllib.request.urlopen(req, timeout=timeout) as resp: raw = resp.read() # Handle gzip-compressed responses if resp.headers.get("Content-Encoding") == "gzip" or raw[:2] == b"\x1f\x8b": raw = gzip.decompress(raw) charset = resp.headers.get_content_charset() or "utf-8" return raw.decode(charset) def fetch_json(url: str, method: str = "GET", data: dict | None = None, headers: dict | None = None) -> dict: """Fetch a URL and parse the response as JSON.""" text = fetch_url(url, method=method, data=data, headers=headers) return json.loads(text) def get_db() -> sqlite3.Connection: """Get a connection to the SQLite database.""" conn = sqlite3.connect(str(DB_PATH)) conn.execute("PRAGMA journal_mode=WAL") conn.execute("PRAGMA foreign_keys=ON") conn.row_factory = sqlite3.Row return conn def start_crawl_log(db: sqlite3.Connection, source_name: str) -> int: """Create a source_log entry and return its ID.""" cur = db.execute( "INSERT INTO source_log (source_name) VALUES (?)", (source_name,) ) db.commit() return cur.lastrowid def finish_crawl_log(db: sqlite3.Connection, log_id: int, found: int, new: int, updated: int, skipped: int, status: str = "completed", error: str | None = None): """Update a source_log entry with results.""" db.execute( """UPDATE source_log SET run_finished_at = datetime('now'), records_found = ?, records_new = ?, records_updated = ?, records_skipped = ?, status = ?, error_message = ? WHERE id = ?""", (found, new, updated, skipped, status, error, log_id) ) db.commit() def store_source_record(db: sqlite3.Connection, source_name: str, source_id: str, source_url: str | None, raw_data: dict, log_id: int) -> int | None: """Store a raw source record. Returns the row ID, or None if duplicate.""" try: cur = db.execute( """INSERT INTO source_record (source_name, source_id, source_url, raw_data, log_id) VALUES (?, ?, ?, ?, ?)""", (source_name, source_id, source_url, json.dumps(raw_data), log_id) ) db.commit() return cur.lastrowid except sqlite3.IntegrityError: # Duplicate source_name + source_id — already have this record return None def normalize_phone(phone: str | None) -> str | None: """Basic phone normalization.""" if not phone: return None # Remove common noise phone = phone.strip().replace("\xa0", " ") # If multiple numbers, take the first for sep in [";", "/", "|", ","]: if sep in phone: phone = phone.split(sep)[0].strip() return phone or None def normalize_state(state: str | None) -> str | None: """Normalize Australian state names to abbreviations.""" if not state: return None state = state.strip().upper() mapping = { "NEW SOUTH WALES": "NSW", "VICTORIA": "VIC", "QUEENSLAND": "QLD", "SOUTH AUSTRALIA": "SA", "WESTERN AUSTRALIA": "WA", "TASMANIA": "TAS", "NORTHERN TERRITORY": "NT", "AUSTRALIAN CAPITAL TERRITORY": "ACT", "AUSTRALIA CAPITAL TERRITORY": "ACT", } result = mapping.get(state, state) # Only return valid Australian states valid = {"NSW", "VIC", "QLD", "SA", "WA", "TAS", "NT", "ACT"} return result if result in valid else None def generate_slug(name: str) -> str: """Generate a URL-safe slug from a business name.""" import re slug = name.lower().strip() slug = re.sub(r"[''`]", "", slug) # remove apostrophes slug = re.sub(r"[^a-z0-9]+", "-", slug) # non-alphanum -> hyphen slug = slug.strip("-") return slug def to_intermediate(source: str, source_id: str, source_url: str | None, business: dict, locations: list[dict], packages: list[dict] | None = None) -> dict: """Build the normalized intermediate format record.""" return { "source": source, "sourceId": source_id, "sourceUrl": source_url, "scrapedAt": datetime.now(timezone.utc).isoformat(), "business": business, "locations": locations, "packages": packages or [], }