"""Crawler for the Funerals Australia (formerly AFDA) member directory.

Source: https://funeralsaustralia.org.au/find-a-member/
Method: WordPress AJAX API (POST with get_clients_list action)
Fields: name, address (structured), phone, email, website, lat/lng,
        displayImage
"""

import time
import json
from pathlib import Path

from base import (
    fetch_url,
    get_db,
    start_crawl_log,
    finish_crawl_log,
    store_source_record,
    normalize_phone,
    normalize_state,
    generate_slug,
    to_intermediate,
    CRAWL_DELAY,
)

SOURCE_NAME = "funerals_australia"
API_URL = "https://funeralsaustralia.org.au/wp-admin/admin-ajax.php"
DIRECTORY_URL = "https://funeralsaustralia.org.au/find-a-member/"
PAGE_SIZE = 200  # API supports up to 200 per page


def _clean_text(value) -> str:
    """Return *value* as a stripped string, mapping ``None`` to ``""``.

    ``dict.get(key, "")`` only falls back to the default when the key is
    missing; a key that is present with a JSON ``null`` yields ``None``,
    which has no ``.strip()``. Non-string scalars (e.g. a numeric postcode)
    are stringified rather than rejected.
    """
    if value is None:
        return ""
    return str(value).strip()


def fetch_page(offset: int = 0) -> dict:
    """Fetch a page of all members from the Funerals Australia API.

    The API returns all members when no postcode/suburb filter is given,
    which is more reliable than geo-filtered searches.

    Args:
        offset: Value sent as ``params[from]``, i.e. where the page starts.

    Returns:
        The decoded JSON response body.

    Raises:
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    form_data = {
        "action": "get_clients_list",
        "params[size]": str(PAGE_SIZE),
        "params[from]": str(offset),
        "params[forceResults]": "true",
        "params[paginated]": "true",
    }
    text = fetch_url(
        API_URL,
        method="POST",
        data=form_data,
        headers={"X-Requested-With": "XMLHttpRequest"},
    )
    return json.loads(text)


def fetch_all_members() -> list[dict]:
    """Fetch all members via pagination.

    Pages are requested until the API returns an empty page, the reported
    ``total`` has been reached, or (when no usable ``total`` is reported) a
    page comes back shorter than ``PAGE_SIZE``. Sleeps ``CRAWL_DELAY``
    between requests.

    Returns:
        Every record from the ``results`` list of each page, in fetch order.
    """
    all_results: list[dict] = []
    offset = 0
    while True:
        data = fetch_page(offset)
        # `or` also covers keys that are present but null.
        results = data.get("results") or []
        total = data.get("total") or 0
        if not results:
            break

        all_results.extend(results)
        print(f" Fetched {len(all_results)}/{total}...")

        # Advance by what was actually received so that a page shorter than
        # PAGE_SIZE does not cause the following records to be skipped.
        offset += len(results)
        if total:
            if offset >= total:
                break
        elif len(results) < PAGE_SIZE:
            # No usable total reported: a short page marks the end.
            break
        time.sleep(CRAWL_DELAY)
    return all_results


def parse_address(record: dict) -> dict:
    """Extract structured address from a Funerals Australia record.

    Only the first entry of the record's ``address`` list is used.

    Args:
        record: A raw member record as returned by the API.

    Returns:
        A dict with keys ``line1``, ``city``, ``state`` and ``postcode``.
        When the record has no usable address, the text fields are empty
        strings and ``state`` is ``None``.
    """
    addr_list = record.get("address")
    if isinstance(addr_list, list) and addr_list and isinstance(addr_list[0], dict):
        addr = addr_list[0]
        return {
            "line1": _clean_text(addr.get("line1")),
            "city": _clean_text(addr.get("city")),
            "state": normalize_state(addr.get("state")),
            "postcode": _clean_text(addr.get("postcode")),
        }
    return {"line1": "", "city": "", "state": None, "postcode": ""}


def to_normalized(record: dict) -> dict:
    """Convert a Funerals Australia record to intermediate format.

    Args:
        record: A raw member record as returned by the API.

    Returns:
        Whatever ``to_intermediate`` builds from the source identifiers, a
        ``business`` dict and a single-entry ``locations`` list.
    """
    addr = parse_address(record)

    # Title-case suburbs that arrive in all caps; leave mixed case untouched.
    city = addr["city"]
    if city and city == city.upper():
        city = city.title()

    # Falsy coordinates (missing, empty string, 0) become None. If either
    # value fails to parse, both are dropped.
    lat_val = record.get("latitude")
    lng_val = record.get("longitude")
    try:
        lat_val = float(lat_val) if lat_val else None
        lng_val = float(lng_val) if lng_val else None
    except (ValueError, TypeError):
        lat_val = lng_val = None

    # Add a scheme only when one is really absent. The check is
    # case-insensitive and requires the full "http(s)://" prefix, so
    # "HTTP://x" is left alone and "httpbin.org" still gets a scheme.
    website = _clean_text(record.get("website")) or None
    if website and not website.lower().startswith(("http://", "https://")):
        website = "https://" + website

    # The same phone number is used for the business and its one location.
    phone = normalize_phone(record.get("phone"))

    business = {
        "name": _clean_text(record.get("name")),
        "abn": None,
        "phone": phone,
        "email": _clean_text(record.get("email")) or None,
        "website": website,
        "description": None,
    }

    locations = [{
        "address": addr["line1"],
        "suburb": city,
        "state": addr["state"],
        "postcode": addr["postcode"],
        "lat": lat_val,
        "lng": lng_val,
        "phone": phone,
    }]

    source_id = record.get("id", "")

    return to_intermediate(
        source=SOURCE_NAME,
        source_id=source_id,
        source_url=DIRECTORY_URL,
        business=business,
        locations=locations,
    )


def run():
    """Run the full Funerals Australia crawl.

    Fetches every member, stores each raw record via ``store_source_record``
    and, for records that call returns a row id for, writes the normalized
    JSON into ``source_record.normalized_data``. Records with no row id are
    counted as skipped.

    Returns:
        The list of raw member records that were fetched.

    Raises:
        Exception: Any error raised during the crawl is recorded on the
            crawl log with status ``"failed"`` and then re-raised.
    """
    db = get_db()
    log_id = start_crawl_log(db, SOURCE_NAME)
    print(f"[{SOURCE_NAME}] Starting crawl (log_id={log_id})")

    all_records = []
    found = 0
    new = 0
    skipped = 0

    try:
        print(" Fetching all members (paginated)...")
        all_records = fetch_all_members()
        found = len(all_records)
        print(f" Total members fetched: {found}")

        # Store records
        for record in all_records:
            source_id = record.get("id", "")
            row_id = store_source_record(
                db,
                SOURCE_NAME,
                source_id,
                DIRECTORY_URL,
                record,
                log_id,
            )
            if row_id:
                normalized = to_normalized(record)
                db.execute(
                    "UPDATE source_record SET normalized_data = ? WHERE id = ?",
                    (json.dumps(normalized), row_id),
                )
                new += 1
            else:
                skipped += 1

        db.commit()
        finish_crawl_log(db, log_id, found, new, 0, skipped)
        print(f"[{SOURCE_NAME}] Done: {found} found, {new} new, {skipped} skipped")

    except Exception as e:
        # Record the failure with the counts reached so far, then propagate.
        finish_crawl_log(db, log_id, found, new, 0, skipped, "failed", str(e))
        raise
    finally:
        db.close()

    return all_records


if __name__ == "__main__":
    run()