Python crawlers for VIC Register, Funerals Australia and NFDA; n8n workflows for scheduled discovery and enrichment; SQLite schema and seeded dev database (1,463 providers); end-to-end process documentation in n8n/PROCESS.md.
363 lines
11 KiB
Python
363 lines
11 KiB
Python
"""Crawler for Gathered Here funeral director directory.
|
|
|
|
Source: https://www.gatheredhere.com.au
|
|
Method: XML sitemap → fetch individual profile pages → parse HTML
|
|
Fields: name, address, coords, phone, email, website, description, pricing, reviews
|
|
"""
|
|
|
|
import re
|
|
import time
|
|
import json
|
|
import xml.etree.ElementTree as ET
|
|
from html.parser import HTMLParser
|
|
from pathlib import Path
|
|
|
|
from base import (
|
|
fetch_url, get_db, start_crawl_log, finish_crawl_log,
|
|
store_source_record, normalize_phone, normalize_state,
|
|
generate_slug, to_intermediate, CRAWL_DELAY,
|
|
)
|
|
|
|
# Label passed to the crawl-log and source-record helpers for this crawler.
SOURCE_NAME = "gathered_here"

# Site root; the sitemap location below is derived from it.
BASE_URL = "https://www.gatheredhere.com.au"

# XML sitemap that lists the funeral listing pages.
SITEMAP_URL = f"{BASE_URL}/sitemap/sitemap-funerals-listings-0.xml"
|
|
|
|
|
|
def fetch_all_listing_urls() -> list[str]:
    """Return the individual funeral-director profile URLs found in the sitemap.

    The sitemap at ``SITEMAP_URL`` is downloaded with ``fetch_url`` and parsed
    as XML. Only ``<loc>`` values containing the singular ``/funeral-director/``
    path (and not the plural ``/funeral-directors/``) are kept, in document
    order.
    """
    sitemap_ns = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
    document = ET.fromstring(fetch_url(SITEMAP_URL))

    def is_profile(candidate: str) -> bool:
        # Singular path = one profile page; plural path = directory page.
        return (
            "/funeral-director/" in candidate
            and "/funeral-directors/" not in candidate
        )

    loc_nodes = (
        entry.find("sm:loc", sitemap_ns)
        for entry in document.findall("sm:url", sitemap_ns)
    )
    links = (
        node.text.strip()
        for node in loc_nodes
        if node is not None and node.text
    )
    return [link for link in links if is_profile(link)]
|
|
|
|
|
|
def extract_next_data(html_text: str) -> dict | None:
|
|
"""Extract __NEXT_DATA__ JSON from a Next.js page."""
|
|
pattern = r'<script\s+id="__NEXT_DATA__"\s+type="application/json">(.*?)</script>'
|
|
match = re.search(pattern, html_text, re.DOTALL)
|
|
if match:
|
|
try:
|
|
return json.loads(match.group(1))
|
|
except json.JSONDecodeError:
|
|
return None
|
|
return None
|
|
|
|
|
|
def extract_from_next_data(next_data: dict) -> dict | None:
|
|
"""Extract listing data from __NEXT_DATA__ props."""
|
|
try:
|
|
props = next_data.get("props", {}).get("pageProps", {})
|
|
|
|
# Structure: singleListing.listing contains the actual data
|
|
single = props.get("singleListing", {})
|
|
if single:
|
|
listing = single.get("listing")
|
|
if listing and isinstance(listing, dict):
|
|
return listing
|
|
|
|
# Fallback paths
|
|
listing = props.get("listing") or props.get("post") or props.get("data")
|
|
return listing
|
|
except (KeyError, TypeError):
|
|
return None
|
|
|
|
|
|
def extract_from_html(html_text: str, url: str) -> dict:
|
|
"""Extract listing data from page HTML using regex patterns as fallback."""
|
|
data = {"url": url}
|
|
|
|
# Title
|
|
title_match = re.search(r'<h1[^>]*>(.*?)</h1>', html_text, re.DOTALL)
|
|
if title_match:
|
|
data["title"] = re.sub(r'<[^>]+>', '', title_match.group(1)).strip()
|
|
|
|
# Phone
|
|
phone_match = re.search(r'href="tel:([^"]+)"', html_text)
|
|
if phone_match:
|
|
data["phone"] = phone_match.group(1).strip()
|
|
|
|
# Email
|
|
email_match = re.search(r'href="mailto:([^"]+)"', html_text)
|
|
if email_match:
|
|
data["email"] = email_match.group(1).strip()
|
|
|
|
# Website
|
|
website_match = re.search(
|
|
r'<a[^>]*class="[^"]*website[^"]*"[^>]*href="([^"]+)"', html_text
|
|
)
|
|
if website_match:
|
|
data["website"] = website_match.group(1).strip()
|
|
|
|
# Address from structured data
|
|
addr_match = re.search(
|
|
r'"streetAddress"\s*:\s*"([^"]*)"', html_text
|
|
)
|
|
if addr_match:
|
|
data["address"] = addr_match.group(1)
|
|
|
|
locality_match = re.search(r'"addressLocality"\s*:\s*"([^"]*)"', html_text)
|
|
if locality_match:
|
|
data["suburb"] = locality_match.group(1)
|
|
|
|
region_match = re.search(r'"addressRegion"\s*:\s*"([^"]*)"', html_text)
|
|
if region_match:
|
|
data["state"] = region_match.group(1)
|
|
|
|
postcode_match = re.search(r'"postalCode"\s*:\s*"([^"]*)"', html_text)
|
|
if postcode_match:
|
|
data["postcode"] = postcode_match.group(1)
|
|
|
|
# Coordinates
|
|
lat_match = re.search(r'"latitude"\s*:\s*"?(-?[\d.]+)"?', html_text)
|
|
lng_match = re.search(r'"longitude"\s*:\s*"?(-?[\d.]+)"?', html_text)
|
|
if lat_match:
|
|
data["lat"] = float(lat_match.group(1))
|
|
if lng_match:
|
|
data["lng"] = float(lng_match.group(1))
|
|
|
|
return data
|
|
|
|
|
|
def extract_pricing(listing_data: dict) -> dict:
    """Extract pricing from listing meta fields.

    Args:
        listing_data: Listing dict; prices are read from its ``"meta"`` dict.

    Returns:
        Mapping of normalized price label to price as ``float``. Empty when
        ``meta`` is missing, empty, or not a dict. Meta values that are falsy
        or contain no parseable number are skipped.
    """
    meta = listing_data.get("meta")
    # ``meta`` is only usable as a mapping; a list or string would fail on
    # ``.get`` below.
    if not meta or not isinstance(meta, dict):
        return {}

    price_fields = {
        # With viewing prices
        "cremation_no_service_viewY": "cremation_no_service_with_viewing",
        "cremation_single_viewY": "cremation_single_service_with_viewing",
        "cremation_dual_viewY": "cremation_dual_service_with_viewing",
        "cremation_graveside_viewY": "cremation_graveside_with_viewing",
        "burial_single_viewY": "burial_single_service_with_viewing",
        "burial_dual_viewY": "burial_dual_service_with_viewing",
        "burial_graveside_viewY": "burial_graveside_with_viewing",
        "burial_no_service_viewY": "burial_no_service_with_viewing",
        # Without viewing prices
        "cremation_no_service_viewN": "cremation_no_service",
        "cremation_single_viewN": "cremation_single_service",
        "cremation_dual_viewN": "cremation_dual_service",
        "cremation_graveside_viewN": "cremation_graveside",
        "burial_single_viewN": "burial_single_service",
        "burial_dual_viewN": "burial_dual_service",
        "burial_graveside_viewN": "burial_graveside",
        "burial_no_service_viewN": "burial_no_service",
    }

    pricing = {}
    for meta_key, label in price_fields.items():
        val = meta.get(meta_key, "")
        if not val:
            continue
        # Parse price string like "$2,299" to float by dropping everything
        # except digits and the decimal point.
        cleaned = re.sub(r'[^\d.]', '', str(val))
        if not cleaned:
            continue
        try:
            pricing[label] = float(cleaned)
        except ValueError:
            # e.g. "1.2.3" or "." after cleaning: not a price, leave it out.
            continue

    return pricing
|
|
|
|
|
|
def pricing_to_packages(pricing: dict) -> list[dict]:
    """Turn the flat pricing mapping into a list of package dicts.

    Only the pricing keys listed below produce a package; other keys in
    ``pricing`` are ignored. Packages come out in the order of this table,
    not in the order of ``pricing``.
    """
    # Pricing key -> funeral type shown on the package.
    funeral_type_by_key = {
        "cremation_no_service": "Cremation Only",
        "cremation_single_service": "Service & Cremation",
        "cremation_single_service_with_viewing": "Service & Cremation",
        "burial_single_service": "Service & Burial",
        "burial_graveside": "Graveside Burial",
    }

    return [
        {
            "name": key.replace("_", " ").title(),
            "funeralType": funeral_type,
            "price": pricing[key],
            # Not available from Gathered Here listing pages
            "inclusions": [],
        }
        for key, funeral_type in funeral_type_by_key.items()
        if key in pricing
    ]
|
|
|
|
|
|
def to_normalized(listing_data: dict, url: str) -> dict:
    """Convert Gathered Here listing data to intermediate format.

    Values are read from the listing's ``meta`` dict first. Where ``meta``
    has no value, the flat top-level keys that ``extract_from_html`` writes
    (``email``, ``address``, ``suburb``, ``state``, ``postcode``, ``lat``,
    ``lng``) are used, so records produced by the HTML fallback keep their
    contact and location data.

    Args:
        listing_data: Listing dict as returned by ``crawl_profile``.
        url: Profile URL, passed through as ``source_url``.

    Returns:
        Whatever ``to_intermediate`` returns for the assembled business,
        locations and packages.
    """
    raw_meta = listing_data.get("meta")
    meta = raw_meta if isinstance(raw_meta, dict) else {}

    def first(*candidates, default=""):
        # First truthy candidate, so that None / "" fall through to the next
        # source instead of masking it.
        for candidate in candidates:
            if candidate:
                return candidate
        return default

    name = str(first(listing_data.get("title"), listing_data.get("name"))).strip()
    slug = first(listing_data.get("slug"))

    # Extract location
    suburb = first(meta.get("geolocation_city"), listing_data.get("suburb"))
    state = normalize_state(
        first(meta.get("geolocation_state_short"), listing_data.get("state"))
    )
    postcode = first(meta.get("geolocation_postcode"), listing_data.get("postcode"))
    address = first(
        meta.get("geolocation_formatted_address"), listing_data.get("address")
    )
    lat = first(meta.get("geolocation_lat"), listing_data.get("lat"), default=None)
    lng = first(meta.get("geolocation_long"), listing_data.get("lng"), default=None)

    try:
        lat = float(lat) if lat else None
        lng = float(lng) if lng else None
    except (ValueError, TypeError):
        # One unusable coordinate makes the pair meaningless.
        lat = lng = None

    email = str(
        first(meta.get("email"), meta.get("_application"), listing_data.get("email"))
    ).strip()
    phone = first(meta.get("phone"), listing_data.get("phone"))

    # Description: prefer the excerpt, fall back to the content when the
    # excerpt is missing or empty.
    description = first(listing_data.get("excerpt"), listing_data.get("content"))
    if isinstance(description, str):
        description = re.sub(r'<[^>]+>', '', description).strip()
        if len(description) > 500:
            description = description[:497] + "..."
    else:
        description = None

    # Website
    website = first(listing_data.get("website"), meta.get("website"), default=None)

    # Pricing
    pricing = extract_pricing(listing_data)
    packages = pricing_to_packages(pricing)

    normalized_phone = normalize_phone(phone)

    business = {
        "name": name,
        "abn": None,
        "phone": normalized_phone,
        "email": email or None,
        "website": website,
        "description": description or None,
    }

    locations = [{
        "address": address,
        "suburb": suburb,
        "state": state,
        "postcode": postcode,
        "lat": lat,
        "lng": lng,
        "phone": normalized_phone,
    }]

    source_id = slug or generate_slug(name)
    return to_intermediate(
        source=SOURCE_NAME,
        source_id=source_id,
        source_url=url,
        business=business,
        locations=locations,
        packages=packages,
    )
|
|
|
|
|
|
def crawl_profile(url: str) -> dict | None:
    """Fetch one Gathered Here profile page and return its listing data.

    The structured ``__NEXT_DATA__`` payload is preferred; when it yields no
    listing, the page HTML is scraped with regexes instead. The returned dict
    carries a ``"_source"`` key of ``"next_data"`` or ``"html_fallback"``
    recording which path produced it. Returns ``None`` when the fetch raises
    (the error is printed).
    """
    try:
        page = fetch_url(url)
    except Exception as exc:
        print(f" Error fetching {url}: {exc}")
        return None

    # Structured path first.
    payload = extract_next_data(page)
    structured = extract_from_next_data(payload) if payload else None
    if structured:
        structured["_source"] = "next_data"
        return structured

    # Nothing usable in __NEXT_DATA__: scrape the markup.
    scraped = extract_from_html(page, url)
    scraped["_source"] = "html_fallback"
    return scraped
|
|
|
|
|
|
def run(limit: int | None = None):
    """Run the full Gathered Here crawl.

    Fetches the profile URLs from the sitemap, crawls each profile, stores the
    raw record and, for rows ``store_source_record`` reports as stored, writes
    the normalized form into ``source_record.normalized_data``. Progress is
    committed every 10 URLs and once more at the end.

    Args:
        limit: If set, only crawl this many profiles (for testing).

    Raises:
        Exception: Any error raised during the crawl is recorded through
            ``finish_crawl_log`` with status ``"failed"`` and re-raised.
    """
    db = get_db()
    log_id = start_crawl_log(db, SOURCE_NAME)
    print(f"[{SOURCE_NAME}] Starting crawl (log_id={log_id})")

    found = 0
    new = 0
    skipped = 0
    errors = 0

    try:
        # Step 1: Get all profile URLs from sitemap
        print(" Fetching sitemap...", end=" ", flush=True)
        urls = fetch_all_listing_urls()
        print(f"{len(urls)} profile URLs found")

        if limit:
            urls = urls[:limit]
            print(f" (limited to {limit} for testing)")

        # Step 2: Crawl each profile
        total = len(urls)
        for position, url in enumerate(urls, start=1):
            slug = url.rstrip("/").split("/")[-1]

            if position == 1 or position % 50 == 0:
                print(f" Crawling {position}/{total}: {slug}")

            listing_data = crawl_profile(url)
            found += 1

            if listing_data:
                row_id = store_source_record(
                    db, SOURCE_NAME, slug, url, listing_data, log_id
                )
                if row_id:
                    normalized = to_normalized(listing_data, url)
                    db.execute(
                        "UPDATE source_record SET normalized_data = ? WHERE id = ?",
                        (json.dumps(normalized), row_id)
                    )
                    new += 1
                else:
                    # Falsy row id: nothing was stored for this profile.
                    skipped += 1
            else:
                errors += 1

            # The commit cadence and the politeness delay apply to every URL,
            # including failed fetches, so a run of errors cannot turn into
            # back-to-back requests against the site.
            if position % 10 == 0:
                db.commit()  # periodic commit

            time.sleep(CRAWL_DELAY)

        db.commit()
        # NOTE(review): the literal 0 is the fifth positional argument of
        # finish_crawl_log; its meaning is not visible here - confirm.
        finish_crawl_log(db, log_id, found, new, 0, skipped)
        print(f"[{SOURCE_NAME}] Done: {found} found, {new} new, "
              f"{skipped} skipped, {errors} errors")

    except Exception as e:
        finish_crawl_log(db, log_id, found, new, 0, skipped, "failed", str(e))
        raise
    finally:
        db.close()
|
|
|
|
|
|
if __name__ == "__main__":
    import sys

    # Optional first command-line argument: maximum number of profiles to crawl.
    cli_args = sys.argv[1:]
    limit = int(cli_args[0]) if cli_args else None
    run(limit=limit)
|