Initial commit: funeral provider discovery pipeline

- Python crawlers for VIC Register, Funerals Australia, NFDA
- n8n workflows for scheduled discovery and enrichment
- SQLite schema and seeded dev database (1,463 providers)
- End-to-end process documentation in n8n/PROCESS.md
2026-04-24 10:27:08 +10:00
commit cc91427789
30 changed files with 4706 additions and 0 deletions
--- a/crawlers/enrich_websites.py
+++ b/crawlers/enrich_websites.py
@@ -0,0 +1,393 @@
+"""Website enrichment module.
+
+For each provider with a website but no packages yet, crawls their site
+to find pricing/packages pages and extracts structured data.
+
+Two extraction modes:
+1. Direct HTML parsing (for sites with clear pricing structure)
+2. AI extraction via API call (for complex/varied layouts)
+
+This module handles the crawling and page discovery.
+AI extraction is delegated to the N8N workflow (Claude Haiku node).
+"""
+
+import json
+import re
+import time
+import urllib.parse
+import urllib.error
+from pathlib import Path
+
+from base import fetch_url, get_db, CRAWL_DELAY
+
+# Common URL patterns for pricing/packages pages.
+# find_pricing_page() appends each path to the provider's base URL (trailing
+# slash removed) and probes them in list order; the first page that passes
+# its content check wins, so order expresses priority.
+PRICING_PATHS = [
+    "/pricing",
+    "/prices",
+    "/our-prices",
+    "/packages",
+    "/funeral-packages",
+    "/services",
+    "/our-services",
+    "/funeral-costs",
+    "/funeral-services",
+    "/service-options",
+    "/price-list",
+    "/transparency",
+    "/funeral-pricing",
+    "/costs",
+    "/cremation",
+    "/cremation-packages",
+    "/burial",
+    "/plan-a-funeral",
+    "/arrange",
+]
+
+# Keywords that suggest a link leads to pricing.
+# Matched as plain lowercase substrings against homepage link text and link
+# URLs, which is why stems such as "pric" and "packag" are used: they cover
+# price/prices/pricing and package/packages in one entry.
+PRICING_KEYWORDS = [
+    "pric", "cost", "packag", "service", "plan",
+    "cremation", "burial", "funeral",
+    "transparency", "disclosure",
+]
+
+
+def find_pricing_page(base_url: str, homepage_html: str) -> str | None:
+    """Try to find the pricing/packages page URL.
+
+    Strategy:
+    1. Try common URL patterns (PRICING_PATHS) and accept the first page
+       that is reasonably large and mentions "$" or "price".
+    2. Parse homepage links, keep same-site links whose text or URL path
+       contains a PRICING_KEYWORDS entry, and accept the first one that
+       returns a non-trivial page.
+
+    Args:
+        base_url: Provider website URL; a trailing slash is ignored.
+        homepage_html: Homepage HTML already fetched by the caller.
+
+    Returns:
+        URL of the first candidate that passed its check, or None when no
+        candidate did.
+    """
+    base = base_url.rstrip("/")
+    # Hosts are compared without a leading "www." so that example.com and
+    # www.example.com count as the same site.
+    site_host = urllib.parse.urlparse(base).netloc.lower().removeprefix("www.")
+
+    # Strategy 1: Try common paths
+    for path in PRICING_PATHS:
+        test_url = base + path
+        try:
+            html = fetch_url(test_url, timeout=10)
+            # Verify it's not a 404 soft-redirect (check for pricing content)
+            if len(html) > 1000 and ("$" in html or "price" in html.lower()):
+                return test_url
+        except Exception:
+            # Best-effort probe: a missing or erroring path only means
+            # "try the next candidate".
+            pass
+        # Pause after every probe, failed ones included, so a site that
+        # rejects every path is not hit with a burst of requests.
+        time.sleep(0.3)
+
+    # Strategy 2: Parse homepage links
+    link_pattern = re.compile(
+        r'<a[^>]*href="([^"]*)"[^>]*>(.*?)</a>',
+        re.IGNORECASE | re.DOTALL
+    )
+
+    tried = set()  # resolved URLs already fetched (nav links often repeat)
+    for match in link_pattern.finditer(homepage_html):
+        href = match.group(1).strip()
+        text = re.sub(r"<[^>]+>", "", match.group(2)).lower().strip()
+
+        # Resolve the href the way a browser would: handles root-relative,
+        # page-relative and protocol-relative links, and drops "#fragment".
+        try:
+            full_url = urllib.parse.urldefrag(
+                urllib.parse.urljoin(base + "/", href)
+            ).url
+            parsed = urllib.parse.urlparse(full_url)
+        except ValueError:
+            continue  # malformed href
+
+        # mailto:, tel:, javascript: etc. are not fetchable pages.
+        if parsed.scheme not in ("http", "https"):
+            continue
+        # Only follow links to the same domain
+        if parsed.netloc.lower().removeprefix("www.") != site_host:
+            continue
+        # A link back to the homepage (e.g. "#services") is not a new page.
+        if full_url.rstrip("/") == base or full_url in tried:
+            continue
+
+        # Match keywords against the link text and the URL path/query only;
+        # including the host would make every absolute link on e.g.
+        # "smithfunerals.example" match the "funeral" keyword.
+        link_target = f"{parsed.path}?{parsed.query}".lower()
+        if not any(kw in text or kw in link_target for kw in PRICING_KEYWORDS):
+            continue
+
+        tried.add(full_url)
+        try:
+            html = fetch_url(full_url, timeout=10)
+            if len(html) > 500:
+                return full_url
+        except Exception:
+            # Best-effort: an unreachable link is skipped.
+            pass
+        time.sleep(0.3)
+
+    return None
+
+
+def extract_description(html: str) -> str | None:
+    """Extract a business description from homepage HTML.
+
+    Looks at <meta> tags only. The standard ``name="description"`` tag is
+    preferred; ``property="og:description"`` is the fallback. Attributes may
+    appear in any order, with single or double quotes, alongside other
+    attributes. HTML entities in the value are decoded.
+
+    Args:
+        html: Raw homepage HTML.
+
+    Returns:
+        The trimmed description if one longer than 20 characters is found,
+        otherwise None.
+    """
+    # Imported locally under its own name because the parameter `html`
+    # shadows the stdlib module of the same name inside this function.
+    from html import unescape
+
+    # A whole <meta ...> tag; quoted values may themselves contain ">".
+    meta_tag = re.compile(r"""<meta\b(?:[^>"']|"[^"]*"|'[^']*')*>""",
+                          re.IGNORECASE)
+    attribute = re.compile(r"""([\w:-]+)\s*=\s*(?:"([^"]*)"|'([^']*)')""")
+
+    # First occurrence of each description flavour wins.
+    found = {}
+    for tag in meta_tag.findall(html):
+        attrs = {}
+        for key, double_quoted, single_quoted in attribute.findall(tag):
+            attrs.setdefault(key.lower(), double_quoted or single_quoted)
+        label = (attrs.get("name") or attrs.get("property") or "").lower()
+        if label in ("description", "og:description") and label not in found:
+            found[label] = attrs.get("content", "")
+
+    # Try meta description first, then OG description
+    for label in ("description", "og:description"):
+        desc = unescape(found.get(label, "")).strip()
+        # Very short values are usually placeholders, not real descriptions.
+        if len(desc) > 20:
+            return desc
+
+    return None
+
+
+def extract_contact_info(html: str) -> dict:
+    """Extract contact details from HTML.
+
+    Args:
+        html: Raw page HTML.
+
+    Returns:
+        A dict containing whichever of these keys could be found:
+        ``phone`` (first ``tel:`` link), ``email`` (first ``mailto:`` link,
+        without any ``?subject=...`` query) and ``address`` (first
+        ``streetAddress`` value, as used in JSON-LD). Missing items are
+        simply absent from the dict.
+    """
+    info = {}
+
+    # Phone. Link targets are URIs, so percent-escapes (e.g. %20) are decoded.
+    phone_match = re.search(r'href="tel:([^"]+)"', html, re.IGNORECASE)
+    if phone_match:
+        info["phone"] = urllib.parse.unquote(phone_match.group(1)).strip()
+
+    # Email. The address stops at "?"; the trailing [^"]* consumes an optional
+    # query string so links like mailto:a@b.c?subject=Hi still match.
+    email_match = re.search(r'href="mailto:([^"?]+)[^"]*"', html, re.IGNORECASE)
+    if email_match:
+        info["email"] = urllib.parse.unquote(email_match.group(1)).strip()
+
+    # Address from JSON-LD
+    addr_match = re.search(r'"streetAddress"\s*:\s*"([^"]*)"', html)
+    if addr_match:
+        info["address"] = addr_match.group(1)
+
+    return info
+
+
+def check_has_pricing(html: str) -> bool:
+    """Report whether the page shows at least one dollar amount of $100 or more.
+
+    Smaller amounts are ignored because they are unlikely to be funeral
+    pricing.
+    """
+    for token in re.finditer(r'\$[\d,]+(?:\.\d{2})?', html):
+        # Drop the currency sign and thousands separators before parsing.
+        digits = token.group(0)[1:].replace(",", "").strip()
+        if not digits:
+            # e.g. "$," — a dollar sign followed only by commas
+            continue
+        try:
+            value = float(digits)
+        except ValueError:
+            continue
+        if value >= 100:
+            return True
+    return False
+
+
+def prepare_for_ai_extraction(html: str, max_chars: int = 8000) -> str:
+    """Clean HTML for AI extraction — remove noise, keep content.
+
+    Removes script/style blocks, comments and nav/header/footer elements,
+    strips the remaining tags, decodes HTML entities, collapses whitespace
+    and truncates the result.
+
+    Args:
+        html: Raw page HTML.
+        max_chars: Maximum length of the returned text before the "..."
+            truncation marker is appended. Defaults to 8000.
+
+    Returns:
+        Plain text, at most ``max_chars`` characters plus "..." when
+        truncated.
+    """
+    # Imported locally under its own name because the parameter `html`
+    # shadows the stdlib module of the same name inside this function.
+    from html import unescape
+
+    def drop_element(markup: str, tag: str) -> str:
+        # (?:\s[^>]*)? requires the tag name to end right there, so removing
+        # <nav> does not also start a match at <navbar> or <nav-menu>.
+        return re.sub(
+            rf"<{tag}(?:\s[^>]*)?>.*?</{tag}\s*>", "", markup,
+            flags=re.DOTALL | re.IGNORECASE
+        )
+
+    # Remove script and style tags
+    cleaned = drop_element(html, "script")
+    cleaned = drop_element(cleaned, "style")
+
+    # Remove HTML comments
+    cleaned = re.sub(r"<!--.*?-->", "", cleaned, flags=re.DOTALL)
+
+    # Remove nav, header, footer elements
+    for tag in ("nav", "header", "footer"):
+        cleaned = drop_element(cleaned, tag)
+
+    # Strip remaining tags but keep text
+    text = re.sub(r"<[^>]+>", " ", cleaned)
+    # Decode entities only after tags are gone, so literal "&lt;b&gt;" in the
+    # copy is kept as text rather than stripped as markup. &nbsp; becomes a
+    # non-breaking space, which the whitespace collapse below normalises.
+    text = unescape(text)
+    # Collapse whitespace
+    text = re.sub(r"\s+", " ", text).strip()
+
+    # Truncate (the default of ~8000 chars fits well within Haiku context)
+    if len(text) > max_chars:
+        text = text[:max_chars] + "..."
+
+    return text
+
+
+def enrich_provider(provider_id: int, website: str, db) -> dict:
+    """Crawl a provider's website and extract enrichment data.
+
+    Args:
+        provider_id: Brand id of the provider. Not used by the crawl itself;
+            kept for the caller's interface.
+        website: Homepage URL of the provider.
+        db: Database handle. Not used here; nothing is written by this
+            function.
+
+    Returns:
+        A dict with what was found: ``homepage_fetched``, ``description``,
+        ``contact_info``, ``pricing_page_url``, ``has_pricing``,
+        ``pricing_page_text`` and ``pdf_links``. An ``error`` key (message
+        truncated to 200 chars) is added when the homepage or the pricing
+        page could not be fetched or processed.
+    """
+    result = {
+        "homepage_fetched": False,
+        "description": None,
+        "contact_info": {},
+        "pricing_page_url": None,
+        "has_pricing": False,
+        "pricing_page_text": None,  # cleaned text for AI extraction
+        "pdf_links": [],
+    }
+
+    # Step 1: Fetch homepage
+    try:
+        homepage = fetch_url(website, timeout=15)
+        result["homepage_fetched"] = True
+    except Exception as e:
+        result["error"] = str(e)[:200]
+        return result
+
+    # Step 2: Extract description and contact info
+    result["description"] = extract_description(homepage)
+    result["contact_info"] = extract_contact_info(homepage)
+
+    # Step 3: Find pricing page
+    time.sleep(CRAWL_DELAY)
+    pricing_url = find_pricing_page(website, homepage)
+
+    if not pricing_url:
+        # Check homepage itself for pricing
+        if check_has_pricing(homepage):
+            result["has_pricing"] = True
+            result["pricing_page_url"] = website
+            result["pricing_page_text"] = prepare_for_ai_extraction(homepage)
+        return result
+
+    result["pricing_page_url"] = pricing_url
+    try:
+        pricing_html = fetch_url(pricing_url, timeout=15)
+        result["has_pricing"] = check_has_pricing(pricing_html)
+        result["pricing_page_text"] = prepare_for_ai_extraction(pricing_html)
+
+        # Check for PDF links. They are resolved against the pricing page
+        # they were found on (not the site root), which is how a browser
+        # would interpret relative and protocol-relative hrefs.
+        seen = set()
+        for pdf_href in re.findall(
+            r'href="([^"]*\.pdf[^"]*)"', pricing_html, re.IGNORECASE
+        ):
+            absolute = urllib.parse.urljoin(pricing_url, pdf_href.strip())
+            if absolute not in seen:
+                seen.add(absolute)
+                result["pdf_links"].append(absolute)
+    except Exception as e:
+        # Still best-effort (homepage data is returned regardless), but the
+        # failure is recorded instead of being silently dropped.
+        result["error"] = str(e)[:200]
+
+    return result
+
+
+def _save_enrichment(db, brand_id: int, result: dict) -> None:
+    """Write one successful enrich_provider() result for a brand to the DB."""
+    brand = db.execute(
+        "SELECT * FROM funeral_brand WHERE id = ?", (brand_id,)
+    ).fetchone()
+
+    # Only fill fields that are currently empty; existing values are never
+    # overwritten by crawled data.
+    updates = {}
+    if result["description"] and not brand["description"]:
+        updates["description"] = result["description"]
+
+    contact = result["contact_info"]
+    if contact.get("email") and not brand["email"]:
+        updates["email"] = contact["email"]
+    if contact.get("phone") and not brand["phone"]:
+        updates["phone"] = contact["phone"]
+
+    # The status is 'partial' whether or not pricing was found: with pricing
+    # the text still needs AI extraction, without it only the homepage was
+    # enriched. The has_pricing flag itself is stored in source_record below.
+    updates["enrichment_status"] = "partial"
+
+    # Column names come from the fixed keys above, never from crawled data,
+    # so building the SET clause by string formatting is safe here.
+    set_parts = [f"{k} = ?" for k in updates]
+    db.execute(
+        f"UPDATE funeral_brand SET {', '.join(set_parts)}, "
+        f"updated_at = datetime('now') WHERE id = ?",
+        [*updates.values(), brand_id]
+    )
+
+    # Store pricing page text for later AI extraction
+    if result["pricing_page_text"]:
+        db.execute(
+            """INSERT OR REPLACE INTO source_record
+               (source_name, source_id, source_url, raw_data,
+                matched_brand_id, match_type)
+               VALUES ('website_crawl', ?, ?, ?, ?, 'enrichment')""",
+            (
+                f"brand_{brand_id}",
+                result["pricing_page_url"],
+                json.dumps({
+                    "pricing_text": result["pricing_page_text"],
+                    "pdf_links": result["pdf_links"],
+                    "has_pricing": result["has_pricing"],
+                }),
+                brand_id,
+            )
+        )
+
+
+def run(limit: int | None = None, state_filter: str | None = None):
+    """Enrich all providers that have a website but no packages.
+
+    Selects unverified brands (``verified = 0``) with a website and no
+    package rows, crawls each one, fills in missing description/email/phone,
+    stores the cleaned pricing text in ``source_record`` and sets
+    ``enrichment_status`` ('failed' when the homepage could not be fetched,
+    'partial' otherwise). Commits every 10 providers and once at the end.
+
+    Args:
+        limit: Maximum number of providers to process. None or 0 means no
+            limit.
+        state_filter: When given, only providers whose ``business_state``
+            equals this value are processed.
+    """
+    db = get_db()
+    try:
+        query = """
+            SELECT fb.id, fb.title, fb.website, fb.business_state
+            FROM funeral_brand fb
+            LEFT JOIN package p ON p.brand_id = fb.id
+            WHERE fb.website IS NOT NULL
+              AND fb.verified = 0
+              AND p.id IS NULL
+        """
+        params = []
+
+        if state_filter:
+            query += " AND fb.business_state = ?"
+            params.append(state_filter)
+
+        query += " ORDER BY fb.id"
+
+        if limit:
+            # Bound as a parameter rather than formatted into the SQL text.
+            query += " LIMIT ?"
+            params.append(int(limit))
+
+        providers = db.execute(query, params).fetchall()
+        print(f"Providers to enrich: {len(providers)}")
+
+        enriched = 0
+        pricing_found = 0
+        failed = 0
+
+        for i, prov in enumerate(providers, start=1):
+            # Progress line for the first provider and every fifth after it.
+            if i == 1 or i % 5 == 0:
+                print(f"  [{i}/{len(providers)}] {prov['title']}")
+
+            result = enrich_provider(prov["id"], prov["website"], db)
+            fetched = result["homepage_fetched"]
+
+            if fetched:
+                enriched += 1
+                if result["has_pricing"]:
+                    pricing_found += 1
+                _save_enrichment(db, prov["id"], result)
+            else:
+                failed += 1
+                db.execute(
+                    """UPDATE funeral_brand
+                       SET enrichment_status = 'failed', updated_at = datetime('now')
+                       WHERE id = ?""",
+                    (prov["id"],)
+                )
+
+            # Periodic commit runs for failed providers too, so a failure on
+            # a multiple of 10 does not postpone the checkpoint.
+            if i % 10 == 0:
+                db.commit()
+
+            # The politeness delay is only needed after a site was reached.
+            if fetched:
+                time.sleep(CRAWL_DELAY)
+
+        db.commit()
+        print(f"\nDone: {enriched} enriched, {pricing_found} with pricing, {failed} failed")
+    finally:
+        # Release the connection even if a crawl or query raised.
+        db.close()
+
+
+if __name__ == "__main__":
+    import sys
+
+    # Accepted arguments: --state=XX, --limit=N, or a bare integer N
+    # (shorthand for --limit). Anything else is ignored.
+    limit, state = None, None
+
+    for token in sys.argv[1:]:
+        if token.startswith("--state="):
+            state = token.split("=")[1]
+            continue
+        if token.startswith("--limit="):
+            limit = int(token.split("=")[1])
+            continue
+        try:
+            limit = int(token)
+        except ValueError:
+            # Neither a recognised flag nor a number: skip it.
+            pass
+
+    run(limit=limit, state_filter=state)