Python crawlers for VIC Register, Funerals Australia, and NFDA; n8n workflows for scheduled discovery and enrichment; SQLite schema and seeded dev database (1,463 providers); end-to-end process documentation in n8n/PROCESS.md.
164 lines
5.0 KiB
Python
164 lines
5.0 KiB
Python
"""Crawler for the NFDA (National Funeral Directors Association) directory.
|
|
|
|
Source: https://nfda.com.au/find-your-local-nfda-member/
|
|
Method: WPSL JSON API (GET requests with lat/lng search)
|
|
Fields: name, address, city, state, postcode, lat/lng, phone, email
|
|
"""
|
|
|
|
import time
|
|
import json
|
|
from pathlib import Path
|
|
|
|
from base import (
|
|
fetch_json, get_db, start_crawl_log, finish_crawl_log,
|
|
store_source_record, normalize_phone, normalize_state,
|
|
generate_slug, to_intermediate, CRAWL_DELAY,
|
|
)
|
|
|
|
# Identifier under which this crawler's records and crawl logs are stored.
SOURCE_NAME = "nfda"

# Endpoint queried by fetch_members().
API_URL = "https://nfda.com.au/wp-admin/admin-ajax.php"

# Search centroids covering Australia with large radius.
# Each entry is expanded into a {"name", "lat", "lng"} dict.
SEARCH_POINTS = [
    {"name": label, "lat": latitude, "lng": longitude}
    for label, latitude, longitude in (
        ("Sydney", -33.87, 151.21),
        ("Melbourne", -37.81, 144.96),
        ("Brisbane", -27.47, 153.03),
        ("Perth", -31.95, 115.86),
        ("Adelaide", -34.93, 138.60),
        ("Hobart", -42.88, 147.33),
        ("Darwin", -12.46, 130.85),
        ("Townsville", -19.26, 146.82),
        ("Central NSW", -30.0, 150.0),
        ("Inland QLD", -23.0, 145.0),
    )
]
|
|
|
|
|
|
def fetch_members(lat: float, lng: float, max_results: int = 50,
                  radius: int = 5000) -> list[dict]:
    """Query the NFDA store-search endpoint around one coordinate.

    Issues a single GET request to ``API_URL`` with the ``store_search``
    action. All numeric arguments are sent as strings.

    Args:
        lat: Latitude of the search centre.
        lng: Longitude of the search centre.
        max_results: Value sent as the ``max_results`` query parameter.
        radius: Value sent as the ``search_radius`` query parameter.

    Returns:
        The decoded response when it is a list; an empty list for any other
        response shape.

    NOTE(review): the endpoint presumably caps each response at
    ``max_results`` entries, so dense areas may be truncated -- confirm.
    """
    query = dict(
        action="store_search",
        lat=str(lat),
        lng=str(lng),
        max_results=str(max_results),
        search_radius=str(radius),
        autoload="1",
    )
    response = fetch_json(API_URL, method="GET", data=query)
    # Anything that is not a list is treated as "no results".
    return response if isinstance(response, list) else []
|
|
|
|
|
|
def _clean_text(value: object) -> str:
    """Return *value* as a stripped string, mapping ``None`` to ``""``."""
    if value is None:
        return ""
    return str(value).strip()


def to_normalized(record: dict) -> dict:
    """Convert an NFDA record to intermediate format.

    Reads the ``store``, ``phone``, ``email``, ``url``, ``address``,
    ``city``, ``state``, ``zip``, ``lat``, ``lng`` and ``id`` keys of
    *record*. Text fields are tolerant of missing keys, explicit ``None``
    values (JSON ``null``) and non-string scalars such as a numeric
    postcode; ``dict.get(key, "")`` alone only covers the missing-key case.

    Args:
        record: One raw member dict as returned by the directory API.

    Returns:
        The result of ``to_intermediate`` for a single business with a
        single location.
    """
    state = normalize_state(record.get("state") or "")

    # The same number is used for the business and its only location,
    # so normalize it once.
    phone = normalize_phone(record.get("phone"))

    business = {
        "name": _clean_text(record.get("store")),
        "abn": None,
        "phone": phone,
        "email": _clean_text(record.get("email")) or None,
        "website": _clean_text(record.get("url")) or None,
        "description": None,
    }

    lat_val = record.get("lat")
    lng_val = record.get("lng")
    try:
        lat_val = float(lat_val) if lat_val else None
        lng_val = float(lng_val) if lng_val else None
    except (ValueError, TypeError):
        # An unparseable coordinate invalidates the whole pair.
        lat_val = lng_val = None

    city = _clean_text(record.get("city"))
    # Normalize city casing (some are ALL CAPS)
    if city and city == city.upper():
        city = city.title()

    locations = [{
        "address": _clean_text(record.get("address")),
        "suburb": city,
        "state": state,
        "postcode": _clean_text(record.get("zip")),
        "lat": lat_val,
        "lng": lng_val,
        "phone": phone,
    }]

    # Kept as the plain str() of the id so it matches the identifier
    # that run() uses when storing the raw record.
    source_id = str(record.get("id", ""))
    return to_intermediate(
        source=SOURCE_NAME,
        source_id=source_id,
        source_url="https://nfda.com.au/find-your-local-nfda-member/",
        business=business,
        locations=locations,
    )
|
|
|
|
|
|
def run():
    """Crawl every search point, store the unique members, and log the run.

    Steps:
      1. Open the database and start a crawl-log entry for ``SOURCE_NAME``.
      2. Call ``fetch_members`` once per entry in ``SEARCH_POINTS``, keeping
         the first record seen for each stringified ``id`` and sleeping
         ``CRAWL_DELAY`` seconds after each query.
      3. Pass every unique record to ``store_source_record``. A truthy row
         id means the normalized JSON is written onto that row and the
         record counts as new; otherwise it counts as skipped.
      4. Commit, then finish the crawl log.

    ``found`` is the sum of raw result counts over all search points, so it
    includes members returned by more than one point.

    Returns:
        The unique raw member records, in first-seen order.

    Raises:
        Exception: any error raised during the crawl is re-raised after the
        crawl log has been finished with status ``"failed"``. The database
        connection is closed in every case.
    """
    db = get_db()
    log_id = start_crawl_log(db, SOURCE_NAME)
    print(f"[{SOURCE_NAME}] Starting crawl (log_id={log_id})")

    directory_url = "https://nfda.com.au/find-your-local-nfda-member/"
    members_by_id = {}  # insertion-ordered: first occurrence of each id wins
    all_records = []
    found = new = skipped = 0

    try:
        for point in SEARCH_POINTS:
            print(f" Searching near {point['name']}...", end=" ", flush=True)
            members = fetch_members(point["lat"], point["lng"])

            already_known = len(members_by_id)
            for member in members:
                members_by_id.setdefault(str(member.get("id", "")), member)
            fresh = len(members_by_id) - already_known

            print(f"{len(members)} results, {fresh} new unique")
            found += len(members)
            time.sleep(CRAWL_DELAY)

        all_records = list(members_by_id.values())
        print(f" Total unique members: {len(all_records)}")

        # Store records
        for record in all_records:
            row_id = store_source_record(
                db, SOURCE_NAME, str(record.get("id", "")),
                directory_url, record, log_id,
            )
            if not row_id:
                skipped += 1
                continue
            db.execute(
                "UPDATE source_record SET normalized_data = ? WHERE id = ?",
                (json.dumps(to_normalized(record)), row_id),
            )
            new += 1

        db.commit()
        finish_crawl_log(db, log_id, found, new, 0, skipped)
        print(f"[{SOURCE_NAME}] Done: {found} found, {new} new, {skipped} skipped")

    except Exception as e:
        # Record the failure with the counters reached so far, then propagate.
        finish_crawl_log(db, log_id, found, new, 0, skipped, "failed", str(e))
        raise
    finally:
        db.close()

    return all_records
|
|
|
|
|
|
# Script entry point: run the crawl only when this file is executed directly.
if __name__ == "__main__":
    run()
|