Initial commit: funeral provider discovery pipeline
Python crawlers for VIC Register, Funerals Australia, and NFDA; n8n workflows for scheduled discovery and enrichment; SQLite schema and seeded dev database (1,463 providers); end-to-end process documentation in n8n/PROCESS.md.
This commit is contained in:
179
crawlers/crawl_funerals_australia.py
Normal file
179
crawlers/crawl_funerals_australia.py
Normal file
@@ -0,0 +1,179 @@
|
||||
"""Crawler for the Funerals Australia (formerly AFDA) member directory.
|
||||
|
||||
Source: https://funeralsaustralia.org.au/find-a-member/
|
||||
Method: WordPress AJAX API (POST with get_clients_list action)
|
||||
Fields: name, address (structured), phone, email, website, lat/lng, displayImage
|
||||
"""
|
||||
|
||||
import time
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from base import (
|
||||
fetch_url, get_db, start_crawl_log, finish_crawl_log,
|
||||
store_source_record, normalize_phone, normalize_state,
|
||||
generate_slug, to_intermediate, CRAWL_DELAY,
|
||||
)
|
||||
|
||||
# Source label this crawler passes to the crawl log, the source-record
# storage call and the intermediate-format builder.
SOURCE_NAME: str = "funerals_australia"
# Endpoint that fetch_page() POSTs the "get_clients_list" action to.
API_URL: str = "https://funeralsaustralia.org.au/wp-admin/admin-ajax.php"

# Sent as params[size] on every request and used as the pagination step.
PAGE_SIZE: int = 200 # API supports up to 200 per page
|
||||
|
||||
|
||||
def fetch_page(offset: int = 0) -> dict:
    """Request one page of the member list and return the decoded JSON.

    No postcode/suburb filter is sent: the API then returns members from the
    whole directory, which is more reliable than geo-filtered searches.

    Args:
        offset: Value sent as ``params[from]`` for the requested page.

    Returns:
        The JSON-decoded response body.
    """
    # Sent so the request looks like the site's own XMLHttpRequest call.
    ajax_headers = {"X-Requested-With": "XMLHttpRequest"}

    payload = {
        "action": "get_clients_list",
        "params[size]": str(PAGE_SIZE),
        "params[from]": str(offset),
        "params[forceResults]": "true",
        "params[paginated]": "true",
    }

    response_body = fetch_url(
        API_URL,
        method="POST",
        data=payload,
        headers=ajax_headers,
    )
    return json.loads(response_body)
|
||||
|
||||
|
||||
def fetch_all_members() -> list[dict]:
    """Fetch all members via pagination.

    Pages are requested with ``fetch_page`` at offsets 0, PAGE_SIZE,
    2 * PAGE_SIZE, ... with a ``CRAWL_DELAY`` pause between requests.

    Returns:
        The concatenated ``results`` lists of every fetched page, in order.
    """
    all_results = []
    previous_page = None
    offset = 0

    while True:
        data = fetch_page(offset)
        results = data.get("results") or []

        # A missing or non-numeric "total" means "unknown", not zero.
        # Defaulting it to 0 made the loop stop after the first page and
        # silently truncate the crawl.
        try:
            total = int(data.get("total"))
        except (TypeError, ValueError):
            total = None

        # Stop on an empty page, or on an exact repeat of the previous page
        # (offset being ignored), so an unknown total can never loop forever.
        if not results or results == previous_page:
            break

        all_results.extend(results)
        print(f" Fetched {len(all_results)}/{'?' if total is None else total}...")
        offset += PAGE_SIZE

        if total is None:
            # Without a total, a short page is the only end-of-data signal.
            if len(results) < PAGE_SIZE:
                break
        elif offset >= total:
            break

        previous_page = results
        time.sleep(CRAWL_DELAY)

    return all_results
|
||||
|
||||
|
||||
def parse_address(record: dict) -> dict:
    """Extract structured address from a Funerals Australia record.

    Only the first entry of the record's ``address`` list is used.

    Args:
        record: Raw member record as decoded from the API response.

    Returns:
        A dict with keys ``line1``, ``city``, ``state`` and ``postcode``.
        The text fields are stripped strings ("" when absent or null) and
        ``state`` is whatever ``normalize_state`` returns. When the record
        has no usable address entry, all text fields are "" and ``state``
        is None.
    """
    empty = {"line1": "", "city": "", "state": None, "postcode": ""}

    addr_list = record.get("address")
    if not isinstance(addr_list, list) or not addr_list:
        return empty

    addr = addr_list[0]
    if not isinstance(addr, dict):
        # A null or scalar entry has no fields to read; treat it as missing
        # instead of raising AttributeError on ``.get``.
        return empty

    def text(key: str) -> str:
        # dict.get's default only applies to *missing* keys; a JSON null
        # comes back as None, and a postcode may arrive as a number.
        value = addr.get(key)
        return "" if value is None else str(value).strip()

    return {
        "line1": text("line1"),
        "city": text("city"),
        "state": normalize_state(addr.get("state")),
        "postcode": text("postcode"),
    }
|
||||
|
||||
|
||||
def to_normalized(record: dict) -> dict:
    """Convert a Funerals Australia record to intermediate format.

    Args:
        record: Raw member record as decoded from the API response.

    Returns:
        The value returned by ``to_intermediate`` for one business with a
        single location built from the record's first address entry.
    """

    def text(key: str) -> str:
        # dict.get's default only applies to *missing* keys; a JSON null
        # comes back as None and would crash a bare ``.strip()``.
        value = record.get(key)
        return "" if value is None else str(value).strip()

    addr = parse_address(record)

    # Suburbs that arrive fully upper-cased are converted to title case.
    city = addr["city"]
    if city and city == city.upper():
        city = city.title()

    lat_val = record.get("latitude")
    lng_val = record.get("longitude")
    try:
        lat_val = float(lat_val) if lat_val else None
        lng_val = float(lng_val) if lng_val else None
    except (ValueError, TypeError):
        # If either coordinate cannot be parsed, both are discarded.
        lat_val = lng_val = None

    website = text("website") or None
    # Match a real scheme, case-insensitively: a bare "http" prefix check
    # would skip hosts such as "httpbin.org" and double-prefix "HTTP://...".
    if website and not website.lower().startswith(("http://", "https://")):
        website = "https://" + website

    # The same number is recorded on the business and on its one location.
    phone = normalize_phone(record.get("phone"))

    business = {
        "name": text("name"),
        "abn": None,
        "phone": phone,
        "email": text("email") or None,
        "website": website,
        "description": None,
    }

    locations = [{
        "address": addr["line1"],
        "suburb": city,
        "state": addr["state"],
        "postcode": addr["postcode"],
        "lat": lat_val,
        "lng": lng_val,
        "phone": phone,
    }]

    source_id = record.get("id", "")
    return to_intermediate(
        source=SOURCE_NAME,
        source_id=source_id,
        source_url="https://funeralsaustralia.org.au/find-a-member/",
        business=business,
        locations=locations,
    )
|
||||
|
||||
|
||||
def run():
    """Run the full Funerals Australia crawl.

    Opens the database, starts a crawl log entry, fetches every member,
    stores each raw record, and attaches its normalized form to the stored
    row. Records for which ``store_source_record`` returns a falsy value are
    counted as skipped. On any exception the crawl log is finished with
    status "failed" and the exception is re-raised; the database handle is
    closed in every case.

    Returns:
        The list of raw member records that were fetched.
    """
    directory_url = "https://funeralsaustralia.org.au/find-a-member/"

    db = get_db()
    log_id = start_crawl_log(db, SOURCE_NAME)
    print(f"[{SOURCE_NAME}] Starting crawl (log_id={log_id})")

    all_records = []
    found = new = skipped = 0

    try:
        print(" Fetching all members (paginated)...")
        all_records = fetch_all_members()
        found = len(all_records)
        print(f" Total members fetched: {found}")

        # Persist each raw record, then attach its normalized form.
        for member in all_records:
            row_id = store_source_record(
                db,
                SOURCE_NAME,
                member.get("id", ""),
                directory_url,
                member,
                log_id,
            )
            if not row_id:
                skipped += 1
                continue

            payload = json.dumps(to_normalized(member))
            db.execute(
                "UPDATE source_record SET normalized_data = ? WHERE id = ?",
                (payload, row_id),
            )
            new += 1

        db.commit()
        finish_crawl_log(db, log_id, found, new, 0, skipped)
        print(f"[{SOURCE_NAME}] Done: {found} found, {new} new, {skipped} skipped")

    except Exception as e:
        # Record the failure with the counts reached so far, then propagate.
        finish_crawl_log(db, log_id, found, new, 0, skipped, "failed", str(e))
        raise
    finally:
        db.close()

    return all_records
|
||||
|
||||
|
||||
# Script entry point: run the crawl only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    run()
|
||||
Reference in New Issue
Block a user