Initial commit: funeral provider discovery pipeline

Python crawlers for VIC Register, Funerals Australia, NFDA
n8n workflows for scheduled discovery and enrichment
SQLite schema and seeded dev database (1,463 providers)
End-to-end process documentation in n8n/PROCESS.md
This commit is contained in:
Richie
2026-04-24 10:27:08 +10:00
commit cc91427789
30 changed files with 4706 additions and 0 deletions

163
crawlers/crawl_nfda.py Normal file
View File

@@ -0,0 +1,163 @@
"""Crawler for the NFDA (National Funeral Directors Association) directory.
Source: https://nfda.com.au/find-your-local-nfda-member/
Method: WP Store Locator (WPSL) JSON API (GET requests with lat/lng search)
Fields: name, address, city, state, postcode, lat/lng, phone, email, website
"""
import time
import json
from pathlib import Path
from base import (
fetch_json, get_db, start_crawl_log, finish_crawl_log,
store_source_record, normalize_phone, normalize_state,
generate_slug, to_intermediate, CRAWL_DELAY,
)
# Identifier under which this crawler's records and crawl logs are stored.
SOURCE_NAME = "nfda"

# AJAX endpoint queried for member listings.
API_URL = "https://nfda.com.au/wp-admin/admin-ajax.php"

# Search centroids covering Australia with a large radius. Each entry is
# written as (label, latitude, longitude) and expanded into the dict shape
# {"name": ..., "lat": ..., "lng": ...} that the crawl loop reads.
SEARCH_POINTS = [
    {"name": label, "lat": latitude, "lng": longitude}
    for label, latitude, longitude in (
        ("Sydney", -33.87, 151.21),
        ("Melbourne", -37.81, 144.96),
        ("Brisbane", -27.47, 153.03),
        ("Perth", -31.95, 115.86),
        ("Adelaide", -34.93, 138.60),
        ("Hobart", -42.88, 147.33),
        ("Darwin", -12.46, 130.85),
        ("Townsville", -19.26, 146.82),
        ("Central NSW", -30.0, 150.0),
        ("Inland QLD", -23.0, 145.0),
    )
]
def fetch_members(lat: float, lng: float, max_results: int = 50,
                  radius: int = 5000) -> list[dict]:
    """Query the NFDA store-search endpoint around one coordinate.

    Args:
        lat: Latitude of the search centre.
        lng: Longitude of the search centre.
        max_results: Value sent as the ``max_results`` query parameter.
        radius: Value sent as the ``search_radius`` query parameter
            (units are whatever the endpoint expects; not visible here).

    Returns:
        The decoded response when it is a list; otherwise an empty list.
    """
    # Every value is sent as a string, in the same key order as before.
    query = dict(
        action="store_search",
        lat=str(lat),
        lng=str(lng),
        max_results=str(max_results),
        search_radius=str(radius),
        autoload="1",
    )
    payload = fetch_json(API_URL, method="GET", data=query)
    # Anything that is not a list (error object, None, ...) counts as no hits.
    return payload if isinstance(payload, list) else []
def _clean_text(value) -> str:
    """Return *value* as a stripped string; ``None`` becomes ``""``."""
    if value is None:
        return ""
    return str(value).strip()


def _parse_coordinate(value):
    """Return *value* as a float, or ``None`` when it is absent.

    ``None``, the empty string and booleans count as absent. A numeric ``0``
    is a real coordinate and is kept. Unparseable values raise ``ValueError``
    or ``TypeError`` so the caller can discard the whole coordinate pair.
    """
    if value is None or value == "" or isinstance(value, bool):
        return None
    return float(value)


def to_normalized(record: dict) -> dict:
    """Convert a raw NFDA directory record to the intermediate format.

    Args:
        record: One decoded member entry. The keys read are ``store``,
            ``phone``, ``email``, ``url``, ``lat``, ``lng``, ``city``,
            ``address``, ``state``, ``zip`` and ``id``. Missing keys and
            explicit ``None`` values are both treated as empty.

    Returns:
        Whatever ``to_intermediate`` builds from one business dict and a
        single-element locations list.
    """
    # A key that is present but null must behave like a missing key.
    state = normalize_state(record.get("state") or "")
    # The same number is used for the business and its only location.
    phone = normalize_phone(record.get("phone"))

    business = {
        "name": _clean_text(record.get("store")),
        "abn": None,
        "phone": phone,
        "email": _clean_text(record.get("email")) or None,
        "website": _clean_text(record.get("url")) or None,
        "description": None,
    }

    try:
        lat_val = _parse_coordinate(record.get("lat"))
        lng_val = _parse_coordinate(record.get("lng"))
    except (ValueError, TypeError):
        # One unparseable half makes the whole pair untrustworthy.
        lat_val = lng_val = None

    city = _clean_text(record.get("city"))
    # Normalize city casing (some are ALL CAPS).
    # NOTE: str.title() also capitalises the letter after an apostrophe.
    if city.isupper():
        city = city.title()

    locations = [{
        "address": _clean_text(record.get("address")),
        "suburb": city,
        "state": state,
        "postcode": _clean_text(record.get("zip")),
        "lat": lat_val,
        "lng": lng_val,
        "phone": phone,
    }]

    source_id = str(record.get("id", ""))
    return to_intermediate(
        source=SOURCE_NAME,
        source_id=source_id,
        source_url="https://nfda.com.au/find-your-local-nfda-member/",
        business=business,
        locations=locations,
    )
def run():
    """Crawl every search point, store the unique members, and log the run.

    Each entry of ``SEARCH_POINTS`` is queried in turn and results are
    de-duplicated on their stringified ``id``. Each unique record is passed
    to ``store_source_record``; when that returns a truthy row id, the
    normalized form is written to ``source_record.normalized_data``.

    ``found`` is the sum of result counts over all search points, so a member
    returned by several points is counted each time.

    Returns:
        The list of unique raw member records, in first-seen order.

    Raises:
        Exception: Any error is recorded on the crawl log with status
            ``"failed"`` and re-raised. The database connection is closed
            in all cases.
    """
    conn = get_db()
    crawl_id = start_crawl_log(conn, SOURCE_NAME)
    print(f"[{SOURCE_NAME}] Starting crawl (log_id={crawl_id})")

    listing_url = "https://nfda.com.au/find-your-local-nfda-member/"
    # Insertion-ordered mapping: member id -> first record seen with that id.
    unique_members = {}
    all_records = []
    found = 0
    new = 0
    skipped = 0

    try:
        for centre in SEARCH_POINTS:
            print(f" Searching near {centre['name']}...", end=" ", flush=True)
            hits = fetch_members(centre["lat"], centre["lng"])
            before = len(unique_members)
            for hit in hits:
                key = str(hit.get("id", ""))
                if key not in unique_members:
                    unique_members[key] = hit
            fresh = len(unique_members) - before
            print(f"{len(hits)} results, {fresh} new unique")
            found += len(hits)
            time.sleep(CRAWL_DELAY)

        all_records.extend(unique_members.values())
        print(f" Total unique members: {len(all_records)}")

        # Store records
        for raw in all_records:
            row_id = store_source_record(
                conn, SOURCE_NAME, str(raw.get("id", "")),
                listing_url, raw, crawl_id,
            )
            if not row_id:
                # Falsy row id: nothing to update for this record.
                skipped += 1
                continue
            payload = json.dumps(to_normalized(raw))
            conn.execute(
                "UPDATE source_record SET normalized_data = ? WHERE id = ?",
                (payload, row_id),
            )
            new += 1

        conn.commit()
        finish_crawl_log(conn, crawl_id, found, new, 0, skipped)
        print(f"[{SOURCE_NAME}] Done: {found} found, {new} new, {skipped} skipped")
    except Exception as exc:
        finish_crawl_log(conn, crawl_id, found, new, 0, skipped, "failed", str(exc))
        raise
    finally:
        conn.close()

    return all_records
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run()