"""Crawler for the NFDA (National Funeral Directors Association) directory. Source: https://nfda.com.au/find-your-local-nfda-member/ Method: WPSL JSON API (GET requests with lat/lng search) Fields: name, address, city, state, postcode, lat/lng, phone, email """ import time import json from pathlib import Path from base import ( fetch_json, get_db, start_crawl_log, finish_crawl_log, store_source_record, normalize_phone, normalize_state, generate_slug, to_intermediate, CRAWL_DELAY, ) SOURCE_NAME = "nfda" API_URL = "https://nfda.com.au/wp-admin/admin-ajax.php" # Search centroids covering Australia with large radius SEARCH_POINTS = [ {"name": "Sydney", "lat": -33.87, "lng": 151.21}, {"name": "Melbourne", "lat": -37.81, "lng": 144.96}, {"name": "Brisbane", "lat": -27.47, "lng": 153.03}, {"name": "Perth", "lat": -31.95, "lng": 115.86}, {"name": "Adelaide", "lat": -34.93, "lng": 138.60}, {"name": "Hobart", "lat": -42.88, "lng": 147.33}, {"name": "Darwin", "lat": -12.46, "lng": 130.85}, {"name": "Townsville", "lat": -19.26, "lng": 146.82}, {"name": "Central NSW", "lat": -30.0, "lng": 150.0}, {"name": "Inland QLD", "lat": -23.0, "lng": 145.0}, ] def fetch_members(lat: float, lng: float, max_results: int = 50, radius: int = 5000) -> list[dict]: """Fetch NFDA members near a given lat/lng.""" params = { "action": "store_search", "lat": str(lat), "lng": str(lng), "max_results": str(max_results), "search_radius": str(radius), "autoload": "1", } data = fetch_json(API_URL, method="GET", data=params) if isinstance(data, list): return data return [] def to_normalized(record: dict) -> dict: """Convert an NFDA record to intermediate format.""" state = normalize_state(record.get("state", "")) business = { "name": record.get("store", "").strip(), "abn": None, "phone": normalize_phone(record.get("phone")), "email": record.get("email", "").strip() or None, "website": record.get("url", "").strip() or None, "description": None, } lat_val = record.get("lat") lng_val = record.get("lng") try: lat_val = float(lat_val) if lat_val else None lng_val = float(lng_val) if lng_val else None except (ValueError, TypeError): lat_val = lng_val = None city = record.get("city", "").strip() # Normalize city casing (some are ALL CAPS) if city and city == city.upper(): city = city.title() locations = [{ "address": record.get("address", "").strip(), "suburb": city, "state": state, "postcode": record.get("zip", "").strip(), "lat": lat_val, "lng": lng_val, "phone": normalize_phone(record.get("phone")), }] source_id = str(record.get("id", "")) return to_intermediate( source=SOURCE_NAME, source_id=source_id, source_url="https://nfda.com.au/find-your-local-nfda-member/", business=business, locations=locations, ) def run(): """Run the full NFDA crawl.""" db = get_db() log_id = start_crawl_log(db, SOURCE_NAME) print(f"[{SOURCE_NAME}] Starting crawl (log_id={log_id})") seen_ids = set() all_records = [] found = 0 new = 0 skipped = 0 try: for point in SEARCH_POINTS: print(f" Searching near {point['name']}...", end=" ", flush=True) members = fetch_members(point["lat"], point["lng"]) new_count = 0 for member in members: member_id = str(member.get("id", "")) if member_id in seen_ids: continue seen_ids.add(member_id) all_records.append(member) new_count += 1 print(f"{len(members)} results, {new_count} new unique") found += len(members) time.sleep(CRAWL_DELAY) print(f" Total unique members: {len(all_records)}") # Store records for record in all_records: source_id = str(record.get("id", "")) row_id = store_source_record( db, SOURCE_NAME, source_id, "https://nfda.com.au/find-your-local-nfda-member/", record, log_id ) if row_id: normalized = to_normalized(record) db.execute( "UPDATE source_record SET normalized_data = ? WHERE id = ?", (json.dumps(normalized), row_id) ) new += 1 else: skipped += 1 db.commit() finish_crawl_log(db, log_id, found, new, 0, skipped) print(f"[{SOURCE_NAME}] Done: {found} found, {new} new, {skipped} skipped") except Exception as e: finish_crawl_log(db, log_id, found, new, 0, skipped, "failed", str(e)) raise finally: db.close() return all_records if __name__ == "__main__": run()