Initial commit: funeral provider discovery pipeline

Python crawlers for VIC Register, Funerals Australia, NFDA n8n workflows for scheduled discovery and enrichment SQLite schema and seeded dev database (1,463 providers) End-to-end process documentation in n8n/PROCESS.md
2026-04-24 10:27:08 +10:00
commit cc91427789
30 changed files with 4706 additions and 0 deletions
--- a/crawlers/base.py
+++ b/crawlers/base.py
@@ -0,0 +1,164 @@
+"""Base crawler module with shared utilities."""
+
+import gzip
+import io
+import json
+import time
+import sqlite3
+import urllib.request
+import urllib.parse
+import urllib.error
+from datetime import datetime, timezone
+from pathlib import Path
+
+DB_PATH = Path(__file__).parent.parent / "database" / "providers.db"
+CRAWL_DELAY = 1.0  # seconds between requests (courtesy)
+
+USER_AGENT = (
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
+    "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
+)
+
+
+def fetch_url(url: str, method: str = "GET", data: dict | None = None,
+              headers: dict | None = None, timeout: int = 30) -> str:
+    """Fetch a URL and return the response body as text."""
+    hdrs = {"User-Agent": USER_AGENT}
+    if headers:
+        hdrs.update(headers)
+
+    body = None
+    if data and method == "POST":
+        body = urllib.parse.urlencode(data, doseq=True).encode("utf-8")
+        hdrs.setdefault("Content-Type", "application/x-www-form-urlencoded")
+    elif data and method == "GET":
+        url = url + "?" + urllib.parse.urlencode(data, doseq=True)
+
+    req = urllib.request.Request(url, data=body, headers=hdrs, method=method)
+    with urllib.request.urlopen(req, timeout=timeout) as resp:
+        raw = resp.read()
+        # Handle gzip-compressed responses
+        if resp.headers.get("Content-Encoding") == "gzip" or raw[:2] == b"\x1f\x8b":
+            raw = gzip.decompress(raw)
+        charset = resp.headers.get_content_charset() or "utf-8"
+        return raw.decode(charset)
+
+
+def fetch_json(url: str, method: str = "GET", data: dict | None = None,
+               headers: dict | None = None) -> dict:
+    """Fetch a URL and parse the response as JSON."""
+    text = fetch_url(url, method=method, data=data, headers=headers)
+    return json.loads(text)
+
+
+def get_db() -> sqlite3.Connection:
+    """Get a connection to the SQLite database."""
+    conn = sqlite3.connect(str(DB_PATH))
+    conn.execute("PRAGMA journal_mode=WAL")
+    conn.execute("PRAGMA foreign_keys=ON")
+    conn.row_factory = sqlite3.Row
+    return conn
+
+
+def start_crawl_log(db: sqlite3.Connection, source_name: str) -> int:
+    """Create a source_log entry and return its ID."""
+    cur = db.execute(
+        "INSERT INTO source_log (source_name) VALUES (?)",
+        (source_name,)
+    )
+    db.commit()
+    return cur.lastrowid
+
+
+def finish_crawl_log(db: sqlite3.Connection, log_id: int,
+                     found: int, new: int, updated: int, skipped: int,
+                     status: str = "completed", error: str | None = None):
+    """Update a source_log entry with results."""
+    db.execute(
+        """UPDATE source_log
+           SET run_finished_at = datetime('now'),
+               records_found = ?, records_new = ?,
+               records_updated = ?, records_skipped = ?,
+               status = ?, error_message = ?
+           WHERE id = ?""",
+        (found, new, updated, skipped, status, error, log_id)
+    )
+    db.commit()
+
+
+def store_source_record(db: sqlite3.Connection, source_name: str,
+                        source_id: str, source_url: str | None,
+                        raw_data: dict, log_id: int) -> int | None:
+    """Store a raw source record. Returns the row ID, or None if duplicate."""
+    try:
+        cur = db.execute(
+            """INSERT INTO source_record
+               (source_name, source_id, source_url, raw_data, log_id)
+               VALUES (?, ?, ?, ?, ?)""",
+            (source_name, source_id, source_url, json.dumps(raw_data), log_id)
+        )
+        db.commit()
+        return cur.lastrowid
+    except sqlite3.IntegrityError:
+        # Duplicate source_name + source_id — already have this record
+        return None
+
+
+def normalize_phone(phone: str | None) -> str | None:
+    """Basic phone normalization."""
+    if not phone:
+        return None
+    # Remove common noise
+    phone = phone.strip().replace("\xa0", " ")
+    # If multiple numbers, take the first
+    for sep in [";", "/", "|", ","]:
+        if sep in phone:
+            phone = phone.split(sep)[0].strip()
+    return phone or None
+
+
+def normalize_state(state: str | None) -> str | None:
+    """Normalize Australian state names to abbreviations."""
+    if not state:
+        return None
+    state = state.strip().upper()
+    mapping = {
+        "NEW SOUTH WALES": "NSW",
+        "VICTORIA": "VIC",
+        "QUEENSLAND": "QLD",
+        "SOUTH AUSTRALIA": "SA",
+        "WESTERN AUSTRALIA": "WA",
+        "TASMANIA": "TAS",
+        "NORTHERN TERRITORY": "NT",
+        "AUSTRALIAN CAPITAL TERRITORY": "ACT",
+        "AUSTRALIA CAPITAL TERRITORY": "ACT",
+    }
+    result = mapping.get(state, state)
+    # Only return valid Australian states
+    valid = {"NSW", "VIC", "QLD", "SA", "WA", "TAS", "NT", "ACT"}
+    return result if result in valid else None
+
+
+def generate_slug(name: str) -> str:
+    """Generate a URL-safe slug from a business name."""
+    import re
+    slug = name.lower().strip()
+    slug = re.sub(r"[''`]", "", slug)          # remove apostrophes
+    slug = re.sub(r"[^a-z0-9]+", "-", slug)    # non-alphanum -> hyphen
+    slug = slug.strip("-")
+    return slug
+
+
+def to_intermediate(source: str, source_id: str, source_url: str | None,
+                    business: dict, locations: list[dict],
+                    packages: list[dict] | None = None) -> dict:
+    """Build the normalized intermediate format record."""
+    return {
+        "source": source,
+        "sourceId": source_id,
+        "sourceUrl": source_url,
+        "scrapedAt": datetime.now(timezone.utc).isoformat(),
+        "business": business,
+        "locations": locations,
+        "packages": packages or [],
+    }