# Provider-Crawl/crawlers/base.py

"""Base crawler module with shared utilities."""

import gzip
import io
import json
import time
import sqlite3
import urllib.request
import urllib.parse
import urllib.error
from datetime import datetime, timezone
from pathlib import Path

# SQLite database file, located at ../database/providers.db relative to
# the directory containing this module.
DB_PATH = Path(__file__).parent.parent.joinpath("database", "providers.db")

# Courtesy pause between requests, in seconds.
CRAWL_DELAY = 1.0

# Browser-like User-Agent value used as the default request header.
USER_AGENT = " ".join((
    "Mozilla/5.0 (X11; Linux x86_64)",
    "AppleWebKit/537.36",
    "(KHTML, like Gecko)",
    "Chrome/131.0.0.0",
    "Safari/537.36",
))


def fetch_url(url: str, method: str = "GET", data: dict | None = None,
              headers: dict | None = None, timeout: int = 30) -> str:
    """Fetch a URL and return the response body as text.

    Args:
        url: URL to request.
        method: HTTP method; matched case-insensitively.
        data: Optional parameters. For POST they are form-encoded into the
            request body; for GET they are merged into the URL's query
            string. For any other method they are not sent.
        headers: Extra request headers; they take precedence over the
            default User-Agent.
        timeout: Timeout in seconds, passed to ``urlopen``.

    Returns:
        The response body, gunzipped if necessary and decoded using the
        charset declared by the response (UTF-8 when none is declared).

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` (includes
            ``HTTPError`` for HTTP error statuses).
        UnicodeDecodeError: If the body is not valid in the chosen charset.
    """
    # Normalise so that "post"/"get" behave like "POST"/"GET" instead of
    # silently dropping ``data``.
    method = method.upper()

    request_headers = {"User-Agent": USER_AGENT}
    if headers:
        request_headers.update(headers)

    body = None
    if data and method == "POST":
        body = urllib.parse.urlencode(data, doseq=True).encode("utf-8")
        # Request capitalises header names, so a caller-supplied
        # "content-type" must be detected case-insensitively or the default
        # below would overwrite it.
        if not any(name.lower() == "content-type" for name in request_headers):
            request_headers["Content-Type"] = "application/x-www-form-urlencoded"
    elif data and method == "GET":
        # Merge into any existing query string (and keep the fragment last)
        # rather than blindly appending "?...".
        parts = urllib.parse.urlsplit(url)
        extra = urllib.parse.urlencode(data, doseq=True)
        query = f"{parts.query}&{extra}" if parts.query else extra
        url = urllib.parse.urlunsplit(parts._replace(query=query))

    req = urllib.request.Request(url, data=body, headers=request_headers,
                                 method=method)
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        raw = resp.read()
        # urllib does not decompress for us; also sniff the gzip magic bytes
        # for responses that are compressed without declaring it.
        if resp.headers.get("Content-Encoding") == "gzip" or raw[:2] == b"\x1f\x8b":
            raw = gzip.decompress(raw)
        charset = resp.headers.get_content_charset() or "utf-8"
        return raw.decode(charset)


def fetch_json(url: str, method: str = "GET", data: dict | None = None,
               headers: dict | None = None, timeout: int = 30) -> dict:
    """Fetch a URL and parse the response as JSON.

    Args:
        url: URL to request.
        method: HTTP method, forwarded to ``fetch_url``.
        data: Optional request parameters, forwarded to ``fetch_url``.
        headers: Optional extra request headers, forwarded to ``fetch_url``.
        timeout: Timeout in seconds, forwarded to ``fetch_url``.

    Returns:
        The decoded JSON document. The value is whatever ``json.loads``
        produces for the response's top-level JSON value.

    Raises:
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    text = fetch_url(url, method=method, data=data, headers=headers,
                     timeout=timeout)
    return json.loads(text)


def get_db() -> sqlite3.Connection:
    """Open the database at ``DB_PATH`` and return a configured connection.

    The connection has WAL journaling and foreign-key enforcement switched
    on, and yields ``sqlite3.Row`` objects for query results.
    """
    connection = sqlite3.connect(str(DB_PATH))
    for pragma in ("PRAGMA journal_mode=WAL", "PRAGMA foreign_keys=ON"):
        connection.execute(pragma)
    connection.row_factory = sqlite3.Row
    return connection


def start_crawl_log(db: sqlite3.Connection, source_name: str) -> int:
    """Insert a new ``source_log`` row for *source_name* and commit it.

    Returns:
        The row ID of the newly inserted log entry.
    """
    insert_sql = "INSERT INTO source_log (source_name) VALUES (?)"
    cursor = db.execute(insert_sql, (source_name,))
    log_id = cursor.lastrowid
    db.commit()
    return log_id


def finish_crawl_log(db: sqlite3.Connection, log_id: int,
                     found: int, new: int, updated: int, skipped: int,
                     status: str = "completed", error: str | None = None):
    """Update a source_log entry with results."""
    db.execute(
        """UPDATE source_log
           SET run_finished_at = datetime('now'),
               records_found = ?, records_new = ?,
               records_updated = ?, records_skipped = ?,
               status = ?, error_message = ?
           WHERE id = ?""",
        (found, new, updated, skipped, status, error, log_id)
    )
    db.commit()


def store_source_record(db: sqlite3.Connection, source_name: str,
                        source_id: str, source_url: str | None,
                        raw_data: dict, log_id: int) -> int | None:
    """Store a raw source record. Returns the row ID, or None if duplicate."""
    try:
        cur = db.execute(
            """INSERT INTO source_record
               (source_name, source_id, source_url, raw_data, log_id)
               VALUES (?, ?, ?, ?, ?)""",
            (source_name, source_id, source_url, json.dumps(raw_data), log_id)
        )
        db.commit()
        return cur.lastrowid
    except sqlite3.IntegrityError:
        # Duplicate source_name + source_id — already have this record
        return None


def normalize_phone(phone: str | None) -> str | None:
    """Basic phone normalization."""
    if not phone:
        return None
    # Remove common noise
    phone = phone.strip().replace("\xa0", " ")
    # If multiple numbers, take the first
    for sep in [";", "/", "|", ","]:
        if sep in phone:
            phone = phone.split(sep)[0].strip()
    return phone or None


def normalize_state(state: str | None) -> str | None:
    """Normalize Australian state names to abbreviations."""
    if not state:
        return None
    state = state.strip().upper()
    mapping = {
        "NEW SOUTH WALES": "NSW",
        "VICTORIA": "VIC",
        "QUEENSLAND": "QLD",
        "SOUTH AUSTRALIA": "SA",
        "WESTERN AUSTRALIA": "WA",
        "TASMANIA": "TAS",
        "NORTHERN TERRITORY": "NT",
        "AUSTRALIAN CAPITAL TERRITORY": "ACT",
        "AUSTRALIA CAPITAL TERRITORY": "ACT",
    }
    result = mapping.get(state, state)
    # Only return valid Australian states
    valid = {"NSW", "VIC", "QLD", "SA", "WA", "TAS", "NT", "ACT"}
    return result if result in valid else None


def generate_slug(name: str) -> str:
    """Generate a URL-safe slug from a business name.

    The name is lower-cased, apostrophes are dropped so possessives stay in
    one word ("Joe's" -> "joes"), and every other run of characters outside
    ``a-z0-9`` becomes a single hyphen. Leading and trailing hyphens are
    removed.

    Returns:
        The slug; an empty string if *name* has no ASCII letters or digits.
    """
    import re
    slug = name.lower().strip()
    # Remove straight and typographic apostrophes (U+2018, U+2019) as well
    # as backticks; otherwise "joe’s" would turn into "joe-s".
    slug = re.sub("['\u2018\u2019`]", "", slug)
    slug = re.sub(r"[^a-z0-9]+", "-", slug)    # non-alphanum -> hyphen
    return slug.strip("-")


def to_intermediate(source: str, source_id: str, source_url: str | None,
                    business: dict, locations: list[dict],
                    packages: list[dict] | None = None) -> dict:
    """Build the normalized intermediate format record."""
    return {
        "source": source,
        "sourceId": source_id,
        "sourceUrl": source_url,
        "scrapedAt": datetime.now(timezone.utc).isoformat(),
        "business": business,
        "locations": locations,
        "packages": packages or [],
    }