Initial commit: funeral provider discovery pipeline

Python crawlers for VIC Register, Funerals Australia, NFDA; n8n workflows for scheduled discovery and enrichment; SQLite schema and seeded dev database (1,463 providers); end-to-end process documentation in n8n/PROCESS.md.
2026-04-24 10:27:08 +10:00
commit cc91427789
30 changed files with 4706 additions and 0 deletions
--- a/crawlers/crawl_gathered_here.py
+++ b/crawlers/crawl_gathered_here.py
@@ -0,0 +1,362 @@
+"""Crawler for Gathered Here funeral director directory.
+
+Source: https://www.gatheredhere.com.au
+Method: XML sitemap → fetch individual profile pages → parse embedded __NEXT_DATA__ JSON (regex HTML fallback)
+Fields: name, address, coords, phone, email, website, description, pricing, reviews
+"""
+
+import re
+import time
+import json
+import xml.etree.ElementTree as ET
+from html.parser import HTMLParser
+from pathlib import Path
+
+from base import (
+    fetch_url, get_db, start_crawl_log, finish_crawl_log,
+    store_source_record, normalize_phone, normalize_state,
+    generate_slug, to_intermediate, CRAWL_DELAY,
+)
+
+SOURCE_NAME = "gathered_here"  # source identifier passed to the crawl-log, source-record and to_intermediate helpers
+SITEMAP_URL = "https://www.gatheredhere.com.au/sitemap/sitemap-funerals-listings-0.xml"  # sitemap read by fetch_all_listing_urls()
+BASE_URL = "https://www.gatheredhere.com.au"  # site root; not referenced elsewhere in this file
+
+
+def fetch_all_listing_urls() -> list[str]:
+    """Return every individual funeral-director profile URL in the sitemap.
+
+    Downloads ``SITEMAP_URL``, reads each ``<url><loc>`` entry and keeps only
+    links containing the singular ``/funeral-director/`` path segment; links
+    that also contain ``/funeral-directors/`` are dropped.
+
+    Returns:
+        Profile URLs in sitemap order, with surrounding whitespace removed.
+    """
+    namespaces = {"sm": "http://www.sitemaps.org/schemas/sitemap/0.9"}
+    sitemap_root = ET.fromstring(fetch_url(SITEMAP_URL))
+
+    loc_nodes = (
+        entry.find("sm:loc", namespaces)
+        for entry in sitemap_root.findall("sm:url", namespaces)
+    )
+    candidates = (
+        node.text.strip() for node in loc_nodes if node is not None and node.text
+    )
+    # Only individual profile pages use the singular /funeral-director/ path.
+    return [
+        link
+        for link in candidates
+        if "/funeral-director/" in link and "/funeral-directors/" not in link
+    ]
+
+
+def extract_next_data(html_text: str) -> dict | None:
+    """Extract the ``__NEXT_DATA__`` JSON payload from a Next.js page.
+
+    Args:
+        html_text: Raw HTML of the page.
+
+    Returns:
+        The decoded JSON object, or None when the script tag is absent, its
+        contents are not valid JSON, or the top-level JSON value is not an
+        object.
+    """
+    # Match on the id alone so extra attributes (nonce, crossorigin) or a
+    # different attribute order do not hide the payload.
+    pattern = r'<script\b[^>]*\bid="__NEXT_DATA__"[^>]*>(.*?)</script>'
+    match = re.search(pattern, html_text, re.DOTALL)
+    if match is None:
+        return None
+    try:
+        payload = json.loads(match.group(1))
+    except json.JSONDecodeError:
+        return None
+    # The declared return type is dict | None; anything else is unusable here.
+    return payload if isinstance(payload, dict) else None
+
+
+def extract_from_next_data(next_data: dict) -> dict | None:
+    """Extract the listing dict from decoded ``__NEXT_DATA__`` JSON.
+
+    Looks under ``props.pageProps`` for ``singleListing.listing`` first, then
+    falls back to the ``listing``, ``post`` and ``data`` keys, in that order.
+
+    Args:
+        next_data: Decoded ``__NEXT_DATA__`` payload.
+
+    Returns:
+        The first non-empty dict found, or None. Null or non-dict values at
+        any level (e.g. ``"props": null``) yield None instead of raising.
+    """
+    if not isinstance(next_data, dict):
+        return None
+
+    props = next_data.get("props")
+    page_props = props.get("pageProps") if isinstance(props, dict) else None
+    if not isinstance(page_props, dict):
+        return None
+
+    # Primary structure: singleListing.listing contains the actual data.
+    single = page_props.get("singleListing")
+    if isinstance(single, dict):
+        listing = single.get("listing")
+        if listing and isinstance(listing, dict):
+            return listing
+
+    # Fallback paths; only dicts are accepted because the caller indexes
+    # into the returned listing.
+    for key in ("listing", "post", "data"):
+        candidate = page_props.get(key)
+        if candidate and isinstance(candidate, dict):
+            return candidate
+
+    return None
+
+
+def extract_from_html(html_text: str, url: str) -> dict:
+    """Extract listing data from page HTML using regex patterns as fallback.
+
+    Args:
+        html_text: Raw HTML of the profile page.
+        url: URL the page was fetched from; stored under ``"url"``.
+
+    Returns:
+        A flat dict that always contains ``url`` and, when the matching
+        markup is present, ``title``, ``phone``, ``email``, ``website``,
+        ``address``, ``suburb``, ``state``, ``postcode``, ``lat`` and ``lng``.
+        Only the first occurrence of each pattern is used.
+    """
+    from html import unescape  # local import: decodes entities such as &amp;
+
+    data = {"url": url}
+
+    # Title: first <h1>, with nested tags removed and entities decoded.
+    title_match = re.search(r'<h1[^>]*>(.*?)</h1>', html_text, re.DOTALL)
+    if title_match:
+        title_text = re.sub(r'<[^>]+>', '', title_match.group(1))
+        data["title"] = unescape(title_text).strip()
+
+    # Phone
+    phone_match = re.search(r'href="tel:([^"]+)"', html_text)
+    if phone_match:
+        data["phone"] = phone_match.group(1).strip()
+
+    # Email: drop any "?subject=..." query so only the address is kept.
+    email_match = re.search(r'href="mailto:([^"]+)"', html_text)
+    if email_match:
+        address_part = unescape(email_match.group(1)).split("?", 1)[0].strip()
+        if address_part:
+            data["email"] = address_part
+
+    # Website: first anchor whose class attribute mentions "website".
+    website_match = re.search(
+        r'<a[^>]*class="[^"]*website[^"]*"[^>]*href="([^"]+)"', html_text
+    )
+    if website_match:
+        data["website"] = unescape(website_match.group(1)).strip()
+
+    # Address parts from structured data embedded in the page.
+    text_fields = (
+        ("streetAddress", "address"),
+        ("addressLocality", "suburb"),
+        ("addressRegion", "state"),
+        ("postalCode", "postcode"),
+    )
+    for json_key, out_key in text_fields:
+        field_match = re.search(rf'"{json_key}"\s*:\s*"([^"]*)"', html_text)
+        if field_match:
+            data[out_key] = field_match.group(1)
+
+    # Coordinates: the pattern also matches strings like "1.2.3" or ".",
+    # so a value that float() rejects is skipped rather than raised.
+    for json_key, out_key in (("latitude", "lat"), ("longitude", "lng")):
+        coord_match = re.search(rf'"{json_key}"\s*:\s*"?(-?[\d.]+)"?', html_text)
+        if coord_match:
+            try:
+                data[out_key] = float(coord_match.group(1))
+            except ValueError:
+                pass
+
+    return data
+
+
+def extract_pricing(listing_data: dict) -> dict:
+    """Extract pricing from listing meta fields.
+
+    Args:
+        listing_data: Listing dict; prices are read from its ``meta`` mapping.
+
+    Returns:
+        Dict mapping a descriptive label (e.g. ``cremation_no_service``) to
+        the price as a float. Fields that are missing, empty or contain no
+        number are omitted. Returns ``{}`` when ``meta`` is missing, empty or
+        not a dict.
+    """
+    meta = listing_data.get("meta")
+    if not isinstance(meta, dict) or not meta:
+        return {}
+
+    price_fields = {
+        # With viewing prices
+        "cremation_no_service_viewY": "cremation_no_service_with_viewing",
+        "cremation_single_viewY": "cremation_single_service_with_viewing",
+        "cremation_dual_viewY": "cremation_dual_service_with_viewing",
+        "cremation_graveside_viewY": "cremation_graveside_with_viewing",
+        "burial_single_viewY": "burial_single_service_with_viewing",
+        "burial_dual_viewY": "burial_dual_service_with_viewing",
+        "burial_graveside_viewY": "burial_graveside_with_viewing",
+        "burial_no_service_viewY": "burial_no_service_with_viewing",
+        # Without viewing prices
+        "cremation_no_service_viewN": "cremation_no_service",
+        "cremation_single_viewN": "cremation_single_service",
+        "cremation_dual_viewN": "cremation_dual_service",
+        "cremation_graveside_viewN": "cremation_graveside",
+        "burial_single_viewN": "burial_single_service",
+        "burial_dual_viewN": "burial_dual_service",
+        "burial_graveside_viewN": "burial_graveside",
+        "burial_no_service_viewN": "burial_no_service",
+    }
+
+    # First number in the value, allowing thousands separators ("$2,299").
+    # Taking only the first number keeps a range such as "$4,000 - $5,500"
+    # from being glued together into 40005500.
+    number_pattern = re.compile(r'\d[\d,]*(?:\.\d+)?|\.\d+')
+
+    pricing = {}
+    for meta_key, label in price_fields.items():
+        raw_value = meta.get(meta_key)
+        if not raw_value:
+            continue
+        number = number_pattern.search(str(raw_value))
+        if number is None:
+            continue
+        pricing[label] = float(number.group(0).replace(",", ""))
+
+    return pricing
+
+
+def pricing_to_packages(pricing: dict) -> list[dict]:
+    """Build package records from the flat pricing dict.
+
+    Only the pricing keys listed below become packages; any other key in
+    ``pricing`` is ignored. Packages are emitted in the order of the table.
+    """
+    # (pricing key, funeral type) pairs that are turned into packages.
+    funeral_type_by_key = (
+        ("cremation_no_service", "Cremation Only"),
+        ("cremation_single_service", "Service & Cremation"),
+        ("cremation_single_service_with_viewing", "Service & Cremation"),
+        ("burial_single_service", "Service & Burial"),
+        ("burial_graveside", "Graveside Burial"),
+    )
+
+    return [
+        {
+            # Display name is the key itself, e.g. "Cremation No Service".
+            "name": key.replace("_", " ").title(),
+            "funeralType": label,
+            "price": pricing[key],
+            "inclusions": [],  # Not available from Gathered Here listing pages
+        }
+        for key, label in funeral_type_by_key
+        if key in pricing
+    ]
+
+
+def to_normalized(listing_data: dict, url: str) -> dict:
+    """Convert Gathered Here listing data to intermediate format.
+
+    Handles both shapes returned by ``crawl_profile``: the ``__NEXT_DATA__``
+    listing, whose details live under ``meta``, and the flat dict built by
+    ``extract_from_html`` (``address``, ``suburb``, ``state``, ``postcode``,
+    ``lat``, ``lng``, ``email`` at the top level). ``meta`` values take
+    precedence; the top-level keys are fallbacks so HTML-fallback records
+    keep their location and contact details.
+
+    Args:
+        listing_data: Raw listing dict for one provider.
+        url: Profile URL the listing was crawled from.
+
+    Returns:
+        The value produced by ``to_intermediate`` for this listing.
+    """
+    raw_meta = listing_data.get("meta")
+    meta = raw_meta if isinstance(raw_meta, dict) else {}
+
+    def first_present(*candidates):
+        # First truthy candidate, or "" when all are missing, None or empty.
+        return next((value for value in candidates if value), "")
+
+    # None-safe: a key that is present but null must not break .strip().
+    name = str(
+        first_present(listing_data.get("title"), listing_data.get("name"))
+    ).strip()
+    slug = listing_data.get("slug") or ""
+
+    # Extract location
+    address = first_present(
+        meta.get("geolocation_formatted_address"), listing_data.get("address")
+    )
+    suburb = first_present(meta.get("geolocation_city"), listing_data.get("suburb"))
+    state = normalize_state(
+        first_present(meta.get("geolocation_state_short"), listing_data.get("state"))
+    )
+    postcode = first_present(
+        meta.get("geolocation_postcode"), listing_data.get("postcode")
+    )
+
+    lat = meta.get("geolocation_lat")
+    if lat in (None, ""):
+        lat = listing_data.get("lat")
+    lng = meta.get("geolocation_long")
+    if lng in (None, ""):
+        lng = listing_data.get("lng")
+
+    # Compare against None/"" rather than truthiness so 0.0 is kept.
+    # If either value is unparseable, both are discarded.
+    try:
+        lat = float(lat) if lat not in (None, "") else None
+        lng = float(lng) if lng not in (None, "") else None
+    except (ValueError, TypeError):
+        lat = lng = None
+
+    email = str(
+        first_present(
+            meta.get("email"), meta.get("_application"), listing_data.get("email")
+        )
+    ).strip()
+    phone = normalize_phone(
+        first_present(meta.get("phone"), listing_data.get("phone"))
+    )
+
+    # Description: prefer the excerpt, fall back to the full content when the
+    # excerpt is missing or empty; strip tags and cap at 500 characters.
+    description = first_present(
+        listing_data.get("excerpt"), listing_data.get("content")
+    )
+    if isinstance(description, str):
+        description = re.sub(r'<[^>]+>', '', description).strip()
+        if len(description) > 500:
+            description = description[:497] + "..."
+    else:
+        description = ""
+
+    # Website
+    website = listing_data.get("website") or meta.get("website") or None
+
+    # Pricing
+    packages = pricing_to_packages(extract_pricing(listing_data))
+
+    business = {
+        "name": name,
+        "abn": None,
+        "phone": phone,
+        "email": email or None,
+        "website": website,
+        "description": description or None,
+    }
+
+    locations = [{
+        "address": address,
+        "suburb": suburb,
+        "state": state,
+        "postcode": postcode,
+        "lat": lat,
+        "lng": lng,
+        "phone": phone,
+    }]
+
+    # Prefer the listing's own slug; otherwise derive one from the name.
+    source_id = slug or generate_slug(name)
+    return to_intermediate(
+        source=SOURCE_NAME,
+        source_id=source_id,
+        source_url=url,
+        business=business,
+        locations=locations,
+        packages=packages,
+    )
+
+
+def crawl_profile(url: str) -> dict | None:
+    """Crawl a single Gathered Here profile page.
+
+    Args:
+        url: Profile page URL.
+
+    Returns:
+        The listing dict, tagged with ``"_source"`` set to ``"next_data"``
+        when it came from the embedded ``__NEXT_DATA__`` JSON or
+        ``"html_fallback"`` when it was built by regex HTML parsing.
+        Returns None if the page could not be fetched.
+    """
+    # Broad catch is deliberate: one failed page must not abort the crawl.
+    try:
+        html_text = fetch_url(url)
+    except Exception as e:
+        print(f"    Error fetching {url}: {e}")
+        return None
+
+    # Try __NEXT_DATA__ first (structured). Both the payload and the listing
+    # must be dicts: the helper navigates the payload with .get() and the
+    # "_source" tag below is an item assignment on the listing.
+    next_data = extract_next_data(html_text)
+    if isinstance(next_data, dict):
+        listing = extract_from_next_data(next_data)
+        if listing and isinstance(listing, dict):
+            listing["_source"] = "next_data"
+            return listing
+
+    # Fallback to HTML parsing
+    data = extract_from_html(html_text, url)
+    data["_source"] = "html_fallback"
+    return data
+
+
+def run(limit: int | None = None):
+    """Run the full Gathered Here crawl.
+
+    Fetches the sitemap, crawls each profile, stores the raw record and its
+    normalized form, and records the outcome in the crawl log. The database
+    connection is always closed on exit.
+
+    Args:
+        limit: If set, only crawl this many profiles (for testing).
+
+    Raises:
+        Exception: Any error outside the per-profile fetch is logged to the
+            crawl log with status "failed" and then re-raised.
+    """
+    db = get_db()
+    log_id = start_crawl_log(db, SOURCE_NAME)
+    print(f"[{SOURCE_NAME}] Starting crawl (log_id={log_id})")
+
+    found = 0
+    new = 0
+    skipped = 0
+    errors = 0
+
+    try:
+        # Step 1: Get all profile URLs from sitemap
+        print("  Fetching sitemap...", end=" ", flush=True)
+        urls = fetch_all_listing_urls()
+        print(f"{len(urls)} profile URLs found")
+
+        if limit:
+            urls = urls[:limit]
+            print(f"  (limited to {limit} for testing)")
+
+        # Step 2: Crawl each profile
+        for i, url in enumerate(urls):
+            # The last path segment of the profile URL is the source id.
+            slug = url.rstrip("/").split("/")[-1]
+
+            if (i + 1) % 50 == 0 or i == 0:
+                print(f"  Crawling {i+1}/{len(urls)}: {slug}")
+
+            listing_data = crawl_profile(url)
+            found += 1
+
+            if not listing_data:
+                errors += 1
+            else:
+                source_id = slug
+                row_id = store_source_record(
+                    db, SOURCE_NAME, source_id, url, listing_data, log_id
+                )
+
+                # NOTE(review): a falsy row_id is counted as "skipped";
+                # presumably the record was already stored - confirm in base.
+                if row_id:
+                    normalized = to_normalized(listing_data, url)
+                    db.execute(
+                        "UPDATE source_record SET normalized_data = ? WHERE id = ?",
+                        (json.dumps(normalized), row_id)
+                    )
+                    new += 1
+                else:
+                    skipped += 1
+
+            # The commit and the delay run for failed fetches too, so an
+            # error never leads to back-to-back requests against the site.
+            if (i + 1) % 10 == 0:
+                db.commit()  # periodic commit
+
+            time.sleep(CRAWL_DELAY)
+
+        db.commit()
+        # NOTE(review): the literal 0 is a count this crawler never tracks,
+        # and `errors` is only printed - confirm finish_crawl_log's signature.
+        finish_crawl_log(db, log_id, found, new, 0, skipped)
+        print(f"[{SOURCE_NAME}] Done: {found} found, {new} new, "
+              f"{skipped} skipped, {errors} errors")
+
+    except Exception as e:
+        finish_crawl_log(db, log_id, found, new, 0, skipped, "failed", str(e))
+        raise
+    finally:
+        db.close()
+
+
+if __name__ == "__main__":
+    import sys
+
+    # Optional first CLI argument caps how many profiles are crawled.
+    cli_args = sys.argv[1:]
+    run(limit=int(cli_args[0]) if cli_args else None)