Initial commit: funeral provider discovery pipeline

Python crawlers for VIC Register, Funerals Australia, NFDA; n8n workflows for scheduled discovery and enrichment; SQLite schema and seeded dev database (1,463 providers); end-to-end process documentation in n8n/PROCESS.md.
2026-04-24 10:27:08 +10:00
commit cc91427789
30 changed files with 4706 additions and 0 deletions
--- a/crawlers/crawl_funerals_australia.py
+++ b/crawlers/crawl_funerals_australia.py
@@ -0,0 +1,179 @@
+"""Crawler for the Funerals Australia (formerly AFDA) member directory.
+
+Source: https://funeralsaustralia.org.au/find-a-member/
+Method: WordPress AJAX API (POST with get_clients_list action)
+Fields: name, address (structured), phone, email, website, lat/lng, displayImage
+"""
+
+import time
+import json
+from pathlib import Path
+
+from base import (
+    fetch_url, get_db, start_crawl_log, finish_crawl_log,
+    store_source_record, normalize_phone, normalize_state,
+    generate_slug, to_intermediate, CRAWL_DELAY,
+)
+
+SOURCE_NAME = "funerals_australia"  # source label passed to the crawl log and stored records
+API_URL = "https://funeralsaustralia.org.au/wp-admin/admin-ajax.php"  # WordPress AJAX endpoint
+
+PAGE_SIZE = 200  # API supports up to 200 per page
+
+
+def fetch_page(offset: int = 0) -> dict:
+    """Request one page of the member list, starting at ``offset``.
+
+    No postcode/suburb filter is sent: unfiltered, the API returns every
+    member, which is more reliable than geo-filtered searches.
+    """
+    ajax_headers = {"X-Requested-With": "XMLHttpRequest"}
+    payload = {
+        "action": "get_clients_list",
+        "params[size]": f"{PAGE_SIZE}",
+        "params[from]": f"{offset}",
+        "params[forceResults]": "true",
+        "params[paginated]": "true",
+    }
+    # The response body is JSON text; hand back the decoded object.
+    body = fetch_url(API_URL, method="POST", data=payload, headers=ajax_headers)
+    return json.loads(body)
+
+
+def fetch_all_members() -> list[dict]:
+    """Fetch every member record by paging through the API.
+
+    The offset advances by the number of records actually received, so a
+    server that returns fewer than PAGE_SIZE per page does not leave gaps.
+    Stops on an empty page or once the offset reaches the reported total; a
+    response without a total therefore ends the crawl after its first page.
+    """
+    members: list[dict] = []
+    offset = 0
+    while True:
+        data = fetch_page(offset)
+        results = data.get("results") or []
+        total = int(data.get("total") or 0)  # null -> 0; numeric strings accepted
+        if not results:
+            break
+        members.extend(results)
+        print(f"    Fetched {len(members)}/{total}...")
+        offset += len(results)
+        if offset >= total:
+            break
+        time.sleep(CRAWL_DELAY)  # pause before requesting the next page
+    return members
+
+
+def parse_address(record: dict) -> dict:
+    """Extract line1/city/state/postcode from the record's first address entry.
+
+    Null or non-string values (JSON ``null``, a numeric postcode) become stripped
+    strings; a missing, empty or non-dict address gives blanks and state ``None``.
+    """
+    entries = record.get("address")
+    first = entries[0] if isinstance(entries, list) and entries else None
+    if not isinstance(first, dict):
+        return {"line1": "", "city": "", "state": None, "postcode": ""}
+    parsed = {k: str(first.get(k) or "").strip() for k in ("line1", "city", "postcode")}
+    return {**parsed, "state": normalize_state(first.get("state"))}
+
+
+def to_normalized(record: dict) -> dict:
+    """Convert a Funerals Australia record to intermediate format.
+
+    Text fields that are missing, JSON ``null`` or not strings become empty
+    instead of raising AttributeError on ``.strip()``. A website gets an
+    ``https://`` prefix only when it lacks an http(s) scheme (compared
+    case-insensitively). If either coordinate fails to parse, both are dropped.
+    """
+    def text(key: str) -> str:
+        # Missing, null and non-string values all collapse to "".
+        value = record.get(key)
+        return value.strip() if isinstance(value, str) else ""
+
+    addr = parse_address(record)
+    city = addr["city"]
+    if city and city == city.upper():
+        city = city.title()  # all-caps suburb -> Title Case
+
+    lat_val = record.get("latitude")
+    lng_val = record.get("longitude")
+    try:
+        lat_val = float(lat_val) if lat_val else None
+        lng_val = float(lng_val) if lng_val else None
+    except (ValueError, TypeError):
+        lat_val = lng_val = None
+
+    website = text("website") or None
+    if website and not website.lower().startswith(("http://", "https://")):
+        website = "https://" + website
+
+    phone = normalize_phone(record.get("phone"))  # shared by business and location
+    business = {
+        "name": text("name"), "abn": None, "phone": phone,
+        "email": text("email") or None, "website": website, "description": None,
+    }
+    locations = [{
+        "address": addr["line1"], "suburb": city, "state": addr["state"],
+        "postcode": addr["postcode"], "lat": lat_val, "lng": lng_val, "phone": phone,
+    }]
+    return to_intermediate(
+        source=SOURCE_NAME,
+        source_id=record.get("id", ""),
+        source_url="https://funeralsaustralia.org.au/find-a-member/",
+        business=business,
+        locations=locations,
+    )
+
+
+def run():
+    """Crawl Funerals Australia end to end and return the raw member records.
+
+    Each fetched record is stored as a source record; those that yield a row
+    id also get their normalized form written. The crawl log is finished with
+    the counts on success, or with status "failed" before the error re-raises.
+    """
+    db = get_db()
+    log_id = start_crawl_log(db, SOURCE_NAME)
+    print(f"[{SOURCE_NAME}] Starting crawl (log_id={log_id})")
+
+    directory_url = "https://funeralsaustralia.org.au/find-a-member/"
+    update_sql = "UPDATE source_record SET normalized_data = ? WHERE id = ?"
+    found = new = skipped = 0
+    all_records = []
+
+    try:
+        print("  Fetching all members (paginated)...")
+        all_records = fetch_all_members()
+        found = len(all_records)
+        print(f"  Total members fetched: {found}")
+
+        # Store each raw record, then attach its normalized form.
+        for member in all_records:
+            row_id = store_source_record(
+                db, SOURCE_NAME, member.get("id", ""), directory_url, member, log_id
+            )
+            if not row_id:
+                # Falsy row id: counted as skipped, no normalized data written.
+                skipped += 1
+                continue
+            payload = json.dumps(to_normalized(member))
+            db.execute(update_sql, (payload, row_id))
+            new += 1
+
+        # One commit after every record has been processed.
+        db.commit()
+        finish_crawl_log(db, log_id, found, new, 0, skipped)
+        print(f"[{SOURCE_NAME}] Done: {found} found, {new} new, {skipped} skipped")
+    except Exception as exc:
+        finish_crawl_log(db, log_id, found, new, 0, skipped, "failed", str(exc))
+        raise
+    finally:
+        db.close()
+
+    return all_records
+
+
+if __name__ == "__main__":  # run the crawl only when executed as a script
+    run()