Initial commit: funeral provider discovery pipeline
Python crawlers for VIC Register, Funerals Australia, and NFDA; n8n workflows for scheduled discovery and enrichment; SQLite schema and seeded dev database (1,463 providers); end-to-end process documentation in n8n/PROCESS.md.
This commit is contained in:
163
crawlers/crawl_nfda.py
Normal file
163
crawlers/crawl_nfda.py
Normal file
@@ -0,0 +1,163 @@
|
||||
"""Crawler for the NFDA (National Funeral Directors Association) directory.
|
||||
|
||||
Source: https://nfda.com.au/find-your-local-nfda-member/
|
||||
Method: WPSL JSON API (GET requests with lat/lng search)
|
||||
Fields: name, address, city, state, postcode, lat/lng, phone, email
|
||||
"""
|
||||
|
||||
import time
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from base import (
|
||||
fetch_json, get_db, start_crawl_log, finish_crawl_log,
|
||||
store_source_record, normalize_phone, normalize_state,
|
||||
generate_slug, to_intermediate, CRAWL_DELAY,
|
||||
)
|
||||
|
||||
# Source identifier passed to the crawl-log and source-record helpers.
SOURCE_NAME = "nfda"
# WordPress admin-ajax endpoint that serves the store-locator search.
API_URL = "https://nfda.com.au/wp-admin/admin-ajax.php"

# Search centroids covering Australia with large radius
SEARCH_POINTS = [
    {"name": label, "lat": latitude, "lng": longitude}
    for label, latitude, longitude in (
        ("Sydney", -33.87, 151.21),
        ("Melbourne", -37.81, 144.96),
        ("Brisbane", -27.47, 153.03),
        ("Perth", -31.95, 115.86),
        ("Adelaide", -34.93, 138.60),
        ("Hobart", -42.88, 147.33),
        ("Darwin", -12.46, 130.85),
        ("Townsville", -19.26, 146.82),
        ("Central NSW", -30.0, 150.0),
        ("Inland QLD", -23.0, 145.0),
    )
]
|
||||
|
||||
|
||||
def fetch_members(lat: float, lng: float, max_results: int = 50,
                  radius: int = 5000) -> list[dict]:
    """Query the NFDA store-locator endpoint around one coordinate.

    Args:
        lat: Latitude of the search centre.
        lng: Longitude of the search centre.
        max_results: Value sent as the ``max_results`` query parameter.
        radius: Value sent as the ``search_radius`` query parameter
            (units are defined by the remote API -- TODO confirm).

    Returns:
        The decoded response when it is a list, otherwise an empty list.
    """
    # Every value is sent as a string; key order matches the original request.
    query = dict(
        action="store_search",
        lat=str(lat),
        lng=str(lng),
        max_results=str(max_results),
        search_radius=str(radius),
        autoload="1",
    )
    response = fetch_json(API_URL, method="GET", data=query)
    # Anything that is not a list is treated as "no members".
    return response if isinstance(response, list) else []
|
||||
|
||||
|
||||
def _clean_text(value) -> str:
    """Return *value* as a stripped string, mapping ``None`` to ``""``."""
    if value is None:
        return ""
    return str(value).strip()


def _parse_coordinate(value):
    """Return *value* as a float, or ``None`` when it is missing or blank.

    Raises ``ValueError`` or ``TypeError`` when ``float()`` cannot parse it.
    """
    if value is None or value == "":
        return None
    return float(value)


def to_normalized(record: dict) -> dict:
    """Convert an NFDA record to intermediate format.

    Text fields are read defensively: a key that is present with a ``None``
    (JSON ``null``) or non-string value no longer raises ``AttributeError``,
    because ``dict.get``'s default only applies when the key is missing.

    Args:
        record: One raw member dict as returned by the NFDA search API.
            Reads the keys ``store``, ``phone``, ``email``, ``url``,
            ``address``, ``city``, ``state``, ``zip``, ``lat``, ``lng``
            and ``id``.

    Returns:
        Whatever ``to_intermediate`` builds from one business dict and a
        single-element locations list.
    """
    state = normalize_state(record.get("state") or "")
    # The same number is used for the business and its only location.
    phone = normalize_phone(record.get("phone"))

    business = {
        "name": _clean_text(record.get("store")),
        "abn": None,
        "phone": phone,
        "email": _clean_text(record.get("email")) or None,
        "website": _clean_text(record.get("url")) or None,
        "description": None,
    }

    # If either coordinate is unparseable, both are discarded; half a
    # coordinate pair from a malformed record is not trustworthy.
    try:
        lat_val = _parse_coordinate(record.get("lat"))
        lng_val = _parse_coordinate(record.get("lng"))
    except (ValueError, TypeError):
        lat_val = lng_val = None

    city = _clean_text(record.get("city"))
    # Normalize city casing (some are ALL CAPS)
    if city and city == city.upper():
        city = city.title()

    locations = [{
        "address": _clean_text(record.get("address")),
        "suburb": city,
        "state": state,
        "postcode": _clean_text(record.get("zip")),
        "lat": lat_val,
        "lng": lng_val,
        "phone": phone,
    }]

    # Same expression run() uses for the source ID, so the two always agree.
    source_id = str(record.get("id", ""))
    return to_intermediate(
        source=SOURCE_NAME,
        source_id=source_id,
        source_url="https://nfda.com.au/find-your-local-nfda-member/",
        business=business,
        locations=locations,
    )
|
||||
|
||||
|
||||
def run():
    """Run the full NFDA crawl.

    Searches around every entry in ``SEARCH_POINTS``, de-duplicates members
    by their ``id``, stores each unique record through
    ``store_source_record`` and attaches its normalized form, then closes
    the crawl log.

    Returns:
        The list of unique raw member dicts collected during the crawl.

    Raises:
        Exception: Any error raised while crawling or storing is re-raised
            after the crawl log has been marked ``"failed"``.
    """
    source_url = "https://nfda.com.au/find-your-local-nfda-member/"

    db = get_db()
    try:
        # Inside the outer try so the connection is closed even if the
        # crawl log cannot be created.
        log_id = start_crawl_log(db, SOURCE_NAME)
        print(f"[{SOURCE_NAME}] Starting crawl (log_id={log_id})")

        seen_ids = set()
        all_records = []
        found = 0
        new = 0
        skipped = 0

        try:
            last_index = len(SEARCH_POINTS) - 1
            for index, point in enumerate(SEARCH_POINTS):
                print(f" Searching near {point['name']}...", end=" ", flush=True)
                members = fetch_members(point["lat"], point["lng"])
                new_count = 0

                for member in members:
                    member_id = str(member.get("id", ""))
                    if member_id in seen_ids:
                        continue
                    seen_ids.add(member_id)
                    all_records.append(member)
                    new_count += 1

                print(f"{len(members)} results, {new_count} new unique")
                # NOTE: counts raw results per search, so members returned by
                # overlapping searches are counted more than once.
                found += len(members)
                # Only pause between requests; no request follows the last one.
                if index < last_index:
                    time.sleep(CRAWL_DELAY)

            print(f" Total unique members: {len(all_records)}")

            # Store records
            for record in all_records:
                source_id = str(record.get("id", ""))
                row_id = store_source_record(
                    db, SOURCE_NAME, source_id,
                    source_url,
                    record, log_id
                )
                if row_id:
                    # A truthy row id is counted as a newly stored record.
                    normalized = to_normalized(record)
                    db.execute(
                        "UPDATE source_record SET normalized_data = ? WHERE id = ?",
                        (json.dumps(normalized), row_id)
                    )
                    new += 1
                else:
                    skipped += 1

            db.commit()
            finish_crawl_log(db, log_id, found, new, 0, skipped)
            print(f"[{SOURCE_NAME}] Done: {found} found, {new} new, {skipped} skipped")

        except Exception as e:
            # Record the failure in the crawl log, then let the error propagate.
            finish_crawl_log(db, log_id, found, new, 0, skipped, "failed", str(e))
            raise
    finally:
        db.close()

    return all_records
|
||||
|
||||
|
||||
# Script entry point: perform one full crawl when executed directly.
if __name__ == "__main__":
    run()
|
||||
Reference in New Issue
Block a user