Initial commit: funeral provider discovery pipeline

Python crawlers for VIC Register, Funerals Australia and NFDA; n8n workflows for scheduled discovery and enrichment; SQLite schema and seeded dev database (1,463 providers); end-to-end process documentation in n8n/PROCESS.md.
2026-04-24 10:27:08 +10:00
commit cc91427789
30 changed files with 4706 additions and 0 deletions
--- a/crawlers/discover_websites.py
+++ b/crawlers/discover_websites.py
@@ -0,0 +1,320 @@
+"""Website discovery module.
+
+For each provider without a website URL, attempts to find their website
+using multiple strategies (tried in order):
+
+1. Serper.dev (2,500 free Google searches, no CC needed)
+2. DuckDuckGo lite (free fallback, rate-limited)
+3. URL pattern guessing (businessname.com.au)
+
+Also validates discovered URLs to confirm they belong to the business.
+
+Configuration:
+  Set the SERPER_API_KEY env var, or "serper_api_key" in config.json, to
+  enable Serper.dev. Without it, searches fall back to DuckDuckGo.
+"""
+
+import json
+import os
+import re
+import time
+import urllib.parse
+import urllib.request
+import urllib.error
+from pathlib import Path
+
+from base import (
+    fetch_url, get_db, normalize_phone, CRAWL_DELAY,
+)
+
+# Serper.dev API key: the SERPER_API_KEY env var wins; config.json is the fallback.
+SERPER_API_KEY = os.environ.get("SERPER_API_KEY")
+if not SERPER_API_KEY:
+    config_path = Path(__file__).parent / "config.json"
+    if config_path.exists():
+        # JSON is UTF-8 by spec; don't depend on the platform's default encoding.
+        config = json.loads(config_path.read_text(encoding="utf-8"))
+        SERPER_API_KEY = config.get("serper_api_key")
+
+# Domain fragments of non-provider sites; search hits containing one are skipped
+SKIP_DOMAINS = [
+    "yellowpages", "whitepages", "truelocal", "yelp", "cylex",
+    "australia247", "showmelocal", "hotfrog", "localsearch",
+    "facebook.com", "linkedin.com", "instagram.com", "twitter.com",  # social media
+    "gatheredhere", "ezifunerals", "funeralocity", "funeraldirectory",
+    "deathsandfunerals", "mytributes", "obits.com",
+    "duckduckgo.com", "google.com", "bing.com",  # search engines
+    "nfda.com.au", "funeralsaustralia.org",
+    "wikipedia.org", "youtube.com",
+]
+
+
+def search_serper(query: str) -> list[str]:
+    """Return organic Google result links for ``query`` via Serper.dev.
+
+    Best-effort: returns ``[]`` when no API key is configured or the request
+    fails. Links whose host contains a SKIP_DOMAINS entry are dropped.
+    """
+    if not SERPER_API_KEY:
+        return []
+    payload = json.dumps({"q": query, "gl": "au", "num": 10}).encode("utf-8")
+    req = urllib.request.Request(
+        "https://google.serper.dev/search", data=payload,
+        headers={"X-API-KEY": SERPER_API_KEY, "Content-Type": "application/json"},
+    )
+    try:
+        with urllib.request.urlopen(req, timeout=15) as resp:
+            organic = json.loads(resp.read().decode("utf-8")).get("organic")
+    except Exception:  # deliberate: any failure means "no results"
+        return []
+
+    links = []
+    for item in organic if isinstance(organic, list) else []:
+        link = str(item.get("link") or "") if isinstance(item, dict) else ""
+        # Filter on the host only: a directory name that merely appears in the
+        # path or query string of a genuine site must not discard it.
+        host = re.match(r"https?://([^/?#]+)", link, re.IGNORECASE)
+        if host and not any(d in host.group(1).lower() for d in SKIP_DOMAINS):
+            links.append(link)
+    return links
+
+
+def search_ddg(query: str) -> list[str]:
+    """Return filtered result URLs for ``query`` from DuckDuckGo lite.
+
+    Best-effort: returns ``[]`` when the results page cannot be fetched. Ads
+    and links whose host contains a SKIP_DOMAINS entry are dropped.
+    """
+    search_url = "https://lite.duckduckgo.com/lite/?q=" + urllib.parse.quote(query)
+    try:
+        html = fetch_url(search_url)
+    except Exception:  # deliberate: any fetch failure means "no results"
+        return []
+
+    # DDG lite wraps each hit in a redirect: //duckduckgo.com/l/?uddg=<encoded URL>
+    targets = re.findall(r'href="//duckduckgo\.com/l/\?uddg=([^&"]+)', html)
+
+    links = []
+    for target in targets:
+        link = urllib.parse.unquote(target)
+        if "ad_domain" in link or "ad_provider" in link:
+            continue  # sponsored result
+        # Filter on the host only: a directory name that merely appears in the
+        # path or query string of a genuine site must not discard it.
+        host = re.match(r"https?://([^/?#]+)", link, re.IGNORECASE)
+        if host and not any(d in host.group(1).lower() for d in SKIP_DOMAINS):
+            links.append(link)
+
+    return links
+
+
+def validate_url(url: str, business_name: str) -> dict:
+    """Check that ``url`` serves a live page that plausibly belongs to the business.
+
+    Args:
+        url: Candidate website URL to fetch.
+        business_name: Provider name whose words are looked for in the page;
+            words of one or two characters never count as a match.
+
+    Returns:
+        ``{"valid": bool, "confidence": str, "reason": str}``; fetch failures
+        are reported as ``valid=False`` rather than raised.
+    """
+    try:
+        html = fetch_url(url, timeout=15)
+    except urllib.error.HTTPError as e:
+        return {"valid": False, "confidence": "none", "reason": f"HTTP {e.code}"}
+    except Exception as e:
+        return {"valid": False, "confidence": "none", "reason": str(e)[:100]}
+
+    html_lower = html.lower()
+    # Parked / for-sale placeholder pages are rejected outright.
+    parked_signals = ["domain is for sale", "buy this domain", "parked domain",
+                      "this domain", "godaddy", "domain parking"]
+    if any(s in html_lower for s in parked_signals):
+        return {"valid": False, "confidence": "none", "reason": "parked domain"}
+
+    # Require two name words on the page (one for a one-word name). A blank name
+    # gives a threshold of 0, which every page would meet, so it never confirms.
+    name_parts = (business_name or "").lower().split()
+    min_matches = min(2, len(name_parts))
+    matches = sum(1 for part in name_parts if len(part) > 2 and part in html_lower)
+    if min_matches and matches >= min_matches:
+        return {"valid": True, "confidence": "confirmed", "reason": "name found in page"}
+
+    # Weaker evidence: any name word in the <title>.
+    title_match = re.search(r"<title[^>]*>(.*?)</title>", html, re.IGNORECASE | re.DOTALL)
+    title = title_match.group(1).lower() if title_match else ""
+    if any(part in title for part in name_parts if len(part) > 2):
+        return {"valid": True, "confidence": "probable", "reason": "partial name in title"}
+
+    # Weakest evidence: the page is at least about funerals.
+    funeral_signals = ["funeral", "cremation", "burial", "memorial",
+                       "chapel", "obituar", "condolence"]
+    if any(s in html_lower for s in funeral_signals):
+        return {"valid": True, "confidence": "probable",
+                "reason": "funeral content found, name not confirmed"}
+
+    return {"valid": False, "confidence": "low", "reason": "business name not found on page"}
+
+
+def guess_urls(business_name: str) -> list[str]:
+    """Generate candidate ``.com.au`` URLs from a business name.
+
+    Apostrophes and company suffixes (pty, ltd, ...) are dropped; the rest is
+    tried squashed ("smithsons") and hyphenated ("smith-sons"), each with and
+    without ``www.``. Returns ``[]`` when nothing usable remains.
+    """
+    base = business_name.lower().strip()
+    # Curly apostrophes (U+2018/U+2019) are removed like straight ones, so a
+    # pasted "O'Brien" hyphenates as "obrien-...", not "o-brien-...".
+    base = re.sub(r"['\u2018\u2019`]", "", base)
+    base = re.sub(r"\b(pty|ltd|limited|proprietary|inc)\b", "", base)
+    slugs = [re.sub(r"[^a-z0-9]+", "", base),
+             re.sub(r"[^a-z0-9]+", "-", base).strip("-")]
+
+    candidates = []
+    # dict.fromkeys de-duplicates in order: a one-word name gives one slug.
+    for slug in dict.fromkeys(s for s in slugs if s):
+        candidates.append(f"https://www.{slug}.com.au")
+        candidates.append(f"https://{slug}.com.au")
+    return candidates
+
+
+def discover_website(name: str, suburb: str | None, state: str | None,
+                     phone: str | None = None) -> dict | None:
+    """Attempt to discover a business website.
+
+    Strategies, in order: (1) Serper.dev search, (2) DuckDuckGo search when
+    Serper returns nothing, (3) guessed ``.com.au`` URLs. The first candidate
+    accepted by ``validate_url`` wins.
+
+    Args:
+        name: Business name; a blank name yields ``None`` without any request.
+        suburb: Optional suburb appended to the search query.
+        state: Optional state appended to the search query.
+        phone: Currently unused; kept for interface compatibility.
+
+    Returns:
+        ``{"url", "confidence", "method", "validation"}`` where method is
+        "search" or "guess", or ``None`` when no candidate validates.
+    """
+    # A blank name would search on the suburb alone and leaves nothing to check
+    # a page against, so any "match" would be meaningless.
+    if not name or not name.strip():
+        return None
+
+    query = " ".join(part for part in (name, suburb, state) if part)
+
+    # Strategies 1 and 2: Serper.dev first, DuckDuckGo only as a fallback.
+    results = search_serper(query) or search_ddg(query)
+
+    # Strategy 3 (guessing) runs only after the top three search hits fail.
+    # Each entry: (method label, candidate URLs, pause after a rejected URL).
+    strategies = [("search", results[:3], 0.5), ("guess", guess_urls(name), 0.3)]
+    for method, candidates, pause in strategies:
+        for url in candidates:
+            try:
+                validation = validate_url(url, name)
+            except Exception:
+                # validate_url reports fetch errors itself; this guard only keeps
+                # one unexpected failure from aborting the whole provider.
+                continue
+            if validation["valid"]:
+                return {
+                    "url": url.rstrip("/"),
+                    "confidence": validation["confidence"],
+                    "method": method,
+                    "validation": validation,
+                }
+            time.sleep(pause)
+
+    return None
+
+
+def run(limit: int | None = None, state_filter: str | None = None):
+    """Discover websites for all unverified providers that have none.
+
+    Each discovered URL is written to ``funeral_brand.website``. Updates are
+    committed every 20 providers and once more on exit, including when the
+    run is interrupted.
+
+    Args:
+        limit: Max providers to process (for testing); ``None`` or 0 means all.
+        state_filter: Only process providers in this state.
+    """
+    db = get_db()
+    try:
+        query = """
+            SELECT id, title, business_suburb, business_state, phone
+            FROM funeral_brand
+            WHERE website IS NULL AND verified = 0
+        """
+        params = []
+        if state_filter:
+            query += " AND business_state = ?"
+            params.append(state_filter)
+        query += " ORDER BY id"
+        if limit:
+            # Bound as a parameter instead of being formatted into the SQL text.
+            query += " LIMIT ?"
+            params.append(int(limit))
+
+        providers = db.execute(query, params).fetchall()
+        total = len(providers)
+        print(f"Providers without websites: {total}")
+
+        found = not_found = 0
+        for i, prov in enumerate(providers, start=1):
+            name = prov["title"]
+            if i % 10 == 0 or i == 1:
+                print(f"  [{i}/{total}] Processing: {name}")
+
+            result = discover_website(name, prov["business_suburb"],
+                                      prov["business_state"], prov["phone"])
+            if result:
+                db.execute(
+                    """UPDATE funeral_brand
+                       SET website = ?, updated_at = datetime('now')
+                       WHERE id = ?""",
+                    (result["url"], prov["id"])
+                )
+                found += 1
+                if i <= 20 or result["confidence"] == "confirmed":
+                    print(f"    FOUND ({result['confidence']}, {result['method']}): "
+                          f"{result['url']}")
+            else:
+                not_found += 1
+
+            if i % 20 == 0:
+                db.commit()
+            # Pause between providers to rate-limit search and validation requests.
+            time.sleep(CRAWL_DELAY * 2)
+
+        print(f"\nDone: {found} websites found, {not_found} not found")
+        if total:
+            print(f"  Success rate: {found / total * 100:.1f}%")
+    finally:
+        # Also reached on errors and Ctrl-C: keep updates made since the last
+        # batch commit and release the connection.
+        db.commit()
+        db.close()
+
+
+if __name__ == "__main__":
+    import sys
+    # Usage: discover_websites.py [--state=STATE] [--limit=N | N]
+    limit = state = None
+    for arg in sys.argv[1:]:
+        if arg.startswith("--state="):
+            state = arg.split("=", 1)[1]
+        elif arg.startswith("--limit="):
+            limit = int(arg.split("=")[1])
+        else:
+            try:
+                limit = int(arg)  # bare number: positional limit
+            except ValueError:
+                # Warn: a typo such as --limt=5 must not silently run unlimited.
+                print(f"Ignoring unrecognised argument: {arg}", file=sys.stderr)
+
+    run(limit=limit, state_filter=state)