Initial commit: funeral provider discovery pipeline

Python crawlers for VIC Register, Funerals Australia, NFDA
n8n workflows for scheduled discovery and enrichment
SQLite schema and seeded dev database (1,463 providers)
End-to-end process documentation in n8n/PROCESS.md
This commit is contained in:
Richie
2026-04-24 10:27:08 +10:00
commit cc91427789
30 changed files with 4706 additions and 0 deletions

View File

@@ -0,0 +1,179 @@
"""Crawler for the Funerals Australia (formerly AFDA) member directory.
Source: https://funeralsaustralia.org.au/find-a-member/
Method: WordPress AJAX API (POST with get_clients_list action)
Fields: name, address (structured), phone, email, website, lat/lng, displayImage
"""
import time
import json
from pathlib import Path
from base import (
fetch_url, get_db, start_crawl_log, finish_crawl_log,
store_source_record, normalize_phone, normalize_state,
generate_slug, to_intermediate, CRAWL_DELAY,
)
# Identifier for this source; passed to the crawl-log, source-record and
# intermediate-format helpers imported from ``base``.
SOURCE_NAME = "funerals_australia"
# WordPress admin-ajax endpoint that fetch_page() POSTs to.
API_URL = "https://funeralsaustralia.org.au/wp-admin/admin-ajax.php"
# Page size sent as ``params[size]`` on every request.
PAGE_SIZE = 200 # API supports up to 200 per page
def fetch_page(offset: int = 0) -> dict:
    """Request a single page of the Funerals Australia member list.

    No postcode/suburb filter is sent: the API then returns all members,
    which is more reliable than geo-filtered searches.

    Args:
        offset: Value sent as ``params[from]`` (start position of the page).

    Returns:
        The decoded JSON body of the response.
    """
    payload = {
        "action": "get_clients_list",
        "params[size]": str(PAGE_SIZE),
        "params[from]": str(offset),
        "params[forceResults]": "true",
        "params[paginated]": "true",
    }
    # Sent so the request looks like the site's own AJAX call.
    ajax_headers = {"X-Requested-With": "XMLHttpRequest"}
    response_body = fetch_url(
        API_URL,
        method="POST",
        data=payload,
        headers=ajax_headers,
    )
    return json.loads(response_body)
def fetch_all_members() -> list[dict]:
    """Fetch all members by walking the paginated API.

    Pages are requested with ``fetch_page`` until the API returns an empty
    page, or until the running offset reaches the ``total`` it reports.
    When ``total`` is missing, zero or not numeric, the loop keeps going
    until an empty page instead of stopping after the first one.

    Returns:
        Every record from the ``results`` list of each page, in fetch order.

    Raises:
        ValueError: If a response is not a JSON object.
        RuntimeError: If the API returns the same non-empty page twice in a
            row, i.e. pagination is not advancing.
    """
    members: list[dict] = []
    previous_page = None
    offset = 0
    while True:
        data = fetch_page(offset)
        if not isinstance(data, dict):
            # NOTE(review): WordPress admin-ajax is documented to answer a
            # bare "0" for unhandled actions; fail with a readable message
            # rather than an AttributeError on ``.get``.
            raise ValueError(
                f"Unexpected response from {SOURCE_NAME} API: "
                f"{repr(data)[:200]}"
            )

        results = data.get("results") or []
        if not results:
            break
        if results == previous_page:
            # Without a trustworthy total this would otherwise loop forever.
            raise RuntimeError(
                f"{SOURCE_NAME} pagination is not advancing at offset {offset}"
            )
        previous_page = results
        members.extend(results)

        try:
            total = int(data.get("total") or 0)
        except (TypeError, ValueError):
            total = 0  # unknown; rely on the empty-page stop condition
        print(f"  Fetched {len(members)}/{total}...")

        # Advance by what was actually received so a short page (server-side
        # cap below PAGE_SIZE) does not cause records to be skipped.
        offset += len(results)
        if total > 0 and offset >= total:
            break
        time.sleep(CRAWL_DELAY)
    return members
def parse_address(record: dict) -> dict:
    """Extract a structured address from a Funerals Australia record.

    Only the first entry of the record's ``address`` list is used.

    Args:
        record: Raw member record as returned by the API.

    Returns:
        A dict with keys ``line1``, ``city``, ``state`` and ``postcode``.
        Text fields are stripped strings (``""`` when absent or null);
        ``state`` is whatever ``normalize_state`` returns. When the record
        has no usable address, ``state`` is ``None`` and the rest are ``""``.
    """
    empty = {"line1": "", "city": "", "state": None, "postcode": ""}

    entries = record.get("address")
    if not isinstance(entries, list) or not entries:
        return empty
    primary = entries[0]
    if not isinstance(primary, dict):
        return empty

    def text(key: str) -> str:
        # ``.get(key, "")`` would still yield None for an explicit JSON null,
        # and numeric values (e.g. a postcode sent as a number) have no
        # ``.strip()``; normalise both cases to a stripped string.
        value = primary.get(key)
        return "" if value is None else str(value).strip()

    return {
        "line1": text("line1"),
        "city": text("city"),
        "state": normalize_state(primary.get("state")),
        "postcode": text("postcode"),
    }
def to_normalized(record: dict) -> dict:
    """Convert a Funerals Australia record to the intermediate format.

    Args:
        record: Raw member record as returned by the API.

    Returns:
        The result of ``to_intermediate`` for one business with a single
        location built from the record's first address and coordinates.
    """

    def text(key: str) -> str:
        # Null-safe: an explicit JSON null or a non-string value must not
        # crash on ``.strip()``.
        value = record.get(key)
        return "" if value is None else str(value).strip()

    addr = parse_address(record)

    # Suburbs that arrive fully upper-cased are converted to title case.
    city = addr["city"]
    if city and city == city.upper():
        city = city.title()

    # Falsy coordinates (None, "", 0) are treated as missing. If either
    # value cannot be parsed, both are dropped so no half-pair is stored.
    raw_lat = record.get("latitude")
    raw_lng = record.get("longitude")
    try:
        lat_val = float(raw_lat) if raw_lat else None
        lng_val = float(raw_lng) if raw_lng else None
    except (ValueError, TypeError):
        lat_val = lng_val = None

    # Prefix a scheme only when none is present. The check is
    # case-insensitive and requires the full "http://" / "https://" prefix,
    # so "HTTP://x" is left alone and "httpfoo.com" still gets a scheme.
    website = text("website") or None
    if website and not website.lower().startswith(("http://", "https://")):
        website = "https://" + website

    # The same number is used for the business and its single location.
    phone = normalize_phone(record.get("phone"))

    business = {
        "name": text("name"),
        "abn": None,
        "phone": phone,
        "email": text("email") or None,
        "website": website,
        "description": None,
    }
    locations = [{
        "address": addr["line1"],
        "suburb": city,
        "state": addr["state"],
        "postcode": addr["postcode"],
        "lat": lat_val,
        "lng": lng_val,
        "phone": phone,
    }]

    source_id = record.get("id", "")
    return to_intermediate(
        source=SOURCE_NAME,
        source_id=source_id,
        source_url="https://funeralsaustralia.org.au/find-a-member/",
        business=business,
        locations=locations,
    )
def run() -> list[dict]:
    """Run the full Funerals Australia crawl.

    Opens the database, starts a crawl log, fetches every member, stores
    each raw record and, for records that ``store_source_record`` reports
    as stored (truthy row id), writes the normalized JSON onto that row.
    Records for which it returns a falsy value are counted as skipped.

    Returns:
        The raw member records that were fetched.

    Raises:
        Exception: Any error from fetching or storing is re-raised after the
            crawl log has been marked ``"failed"`` with the error text.
    """
    source_url = "https://funeralsaustralia.org.au/find-a-member/"

    db = get_db()
    try:
        # Inside the try/finally so the connection is closed even when
        # starting the crawl log itself fails.
        log_id = start_crawl_log(db, SOURCE_NAME)
        print(f"[{SOURCE_NAME}] Starting crawl (log_id={log_id})")

        all_records = []
        found = 0
        new = 0
        skipped = 0
        try:
            print("  Fetching all members (paginated)...")
            all_records = fetch_all_members()
            found = len(all_records)
            print(f"  Total members fetched: {found}")

            # Store records
            for record in all_records:
                source_id = record.get("id", "")
                row_id = store_source_record(
                    db, SOURCE_NAME, source_id, source_url, record, log_id
                )
                if row_id:
                    normalized = to_normalized(record)
                    db.execute(
                        "UPDATE source_record SET normalized_data = ? WHERE id = ?",
                        (json.dumps(normalized), row_id),
                    )
                    new += 1
                else:
                    skipped += 1

            db.commit()
            # NOTE(review): the fifth positional argument is always 0 here;
            # presumably an "updated" count -- confirm against base.
            finish_crawl_log(db, log_id, found, new, 0, skipped)
            print(f"[{SOURCE_NAME}] Done: {found} found, {new} new, {skipped} skipped")
        except Exception as e:
            # Record the failure with the counts reached so far, then let
            # the original error propagate to the caller.
            finish_crawl_log(db, log_id, found, new, 0, skipped, "failed", str(e))
            raise
    finally:
        db.close()
    return all_records
# Run the crawl when this file is executed as a script.
if __name__ == "__main__":
    run()