# Provider-Crawl/crawlers/discover_websites.py

"""Website discovery module.

For each provider without a website URL, attempts to find their website
using multiple strategies (tried in order):

1. Serper.dev (2,500 free Google searches, no credit card needed)
2. DuckDuckGo lite (free fallback, rate-limited)
3. URL pattern guessing (businessname.com.au)

Also validates discovered URLs to confirm they belong to the business.

Configuration:
  Set the SERPER_API_KEY environment variable, or the `serper_api_key` entry
  of a config.json placed next to this module, to enable Serper.dev.
  Without a key, searches fall back to DuckDuckGo.
"""

import json
import os
import re
import time
import urllib.parse
import urllib.request
import urllib.error
from pathlib import Path

from base import (
    fetch_url, get_db, normalize_phone, CRAWL_DELAY,
)

# Load the Serper API key. The SERPER_API_KEY environment variable wins;
# when it is unset or empty, fall back to the "serper_api_key" entry of a
# config.json sitting next to this module. If neither source provides a key
# the value stays falsy, and search_serper() then returns no results.
SERPER_API_KEY = os.environ.get("SERPER_API_KEY")
if not SERPER_API_KEY:
    config_path = Path(__file__).parent / "config.json"
    if config_path.exists():
        # JSON is UTF-8 by specification; do not depend on the platform's
        # locale encoding when reading it.
        with open(config_path, encoding="utf-8") as f:
            config = json.load(f)
        SERPER_API_KEY = config.get("serper_api_key")

# URL fragments identifying sites that are never a provider's own website.
# The search helpers in this module drop a result when any of these
# fragments occurs anywhere in its lower-cased URL (plain substring match).
SKIP_DOMAINS = [
    # General business directories / listing sites
    "yellowpages",
    "whitepages",
    "truelocal",
    "yelp",
    "cylex",
    "australia247",
    "showmelocal",
    "hotfrog",
    "localsearch",
    # Social networks
    "facebook.com",
    "linkedin.com",
    "instagram.com",
    "twitter.com",
    # Funeral comparison / directory / obituary sites
    "gatheredhere",
    "ezifunerals",
    "funeralocity",
    "funeraldirectory",
    "deathsandfunerals",
    "mytributes",
    "obits.com",
    # Search engines
    "duckduckgo.com",
    "google.com",
    "bing.com",
    # Presumably funeral industry bodies rather than individual providers
    "nfda.com.au",
    "funeralsaustralia.org",
    # Reference / video sites
    "wikipedia.org",
    "youtube.com",
]


def search_serper(query: str) -> list[str]:
    """Query Serper.dev (Google results as JSON) and return candidate URLs.

    The request is a JSON POST carrying the query with ``gl="au"`` and
    ``num=10``. The free Serper tier covers 2,500 queries.

    Args:
        query: Free-text search string.

    Returns:
        The ``link`` of every organic result, in result order, excluding
        empty links and links containing a SKIP_DOMAINS fragment. An empty
        list when no API key is configured or when the request or the
        decoding of its response fails for any reason.
    """
    if not SERPER_API_KEY:
        return []

    payload = json.dumps({"q": query, "gl": "au", "num": 10}).encode("utf-8")
    request = urllib.request.Request(
        "https://google.serper.dev/search",
        data=payload,
        headers={
            "X-API-KEY": SERPER_API_KEY,
            "Content-Type": "application/json",
        },
    )

    try:
        with urllib.request.urlopen(request, timeout=15) as response:
            body = json.loads(response.read().decode("utf-8"))
    except Exception:
        # Best effort: any failure simply means "no Serper results".
        return []

    links = (entry.get("link", "") for entry in body.get("organic", []))
    return [
        link for link in links
        if link and not any(skip in link.lower() for skip in SKIP_DOMAINS)
    ]


def search_ddg(query: str) -> list[str]:
    """Search DuckDuckGo lite and return the filtered result URLs.

    Args:
        query: Free-text search string; it is percent-encoded into the
            ``q`` parameter of the lite endpoint.

    Returns:
        Decoded target URLs in page order, without ad links and without
        links containing a SKIP_DOMAINS fragment. An empty list when the
        page cannot be fetched.
    """
    search_url = f"https://lite.duckduckgo.com/lite/?q={urllib.parse.quote(query)}"

    try:
        page = fetch_url(search_url)
    except Exception:
        return []

    # DDG lite wraps every hit in a redirect link whose ``uddg`` parameter
    # holds the percent-encoded destination.
    targets = (
        urllib.parse.unquote(encoded)
        for encoded in re.findall(
            r'href="//duckduckgo\.com/l/\?uddg=([^&"]+)', page
        )
    )

    return [
        target for target in targets
        # keep only non-ad links ...
        if "ad_domain" not in target
        and "ad_provider" not in target
        # ... that do not point at a directory/aggregator site
        and not any(skip in target.lower() for skip in SKIP_DOMAINS)
    ]


def validate_url(url: str, business_name: str) -> dict:
    """Validate that a URL is a real website belonging to this business.

    The page is fetched and judged by a chain of heuristics; the first one
    that applies decides the outcome:

    1. Fetch fails              -> invalid, confidence "none"
       (reason is "HTTP <code>" or the first 100 chars of the error text).
    2. Parked/for-sale phrases  -> invalid, confidence "none".
    3. Enough words of the business name appear in the page
                                -> valid, confidence "confirmed".
    4. Any name word (> 2 chars) appears in the <title>
                                -> valid, confidence "probable".
    5. Funeral-related keywords -> valid, confidence "probable".
    6. Otherwise                -> invalid, confidence "low".

    Args:
        url: Address of the candidate website.
        business_name: Name of the business the site should belong to.
            A blank or missing name skips the name-based checks (3 and 4).

    Returns:
        ``{"valid": bool, "confidence": str, "reason": str}``.
    """
    try:
        # NOTE(review): fetch_url comes from `base`; it is assumed to return
        # the page body as text - confirm.
        html = fetch_url(url, timeout=15)
    except urllib.error.HTTPError as e:
        return {"valid": False, "confidence": "none", "reason": f"HTTP {e.code}"}
    except Exception as e:
        return {"valid": False, "confidence": "none", "reason": str(e)[:100]}

    html_lower = html.lower()

    # Check if it's a parked/for-sale domain
    parked_signals = ["domain is for sale", "buy this domain",
                      "parked domain", "this domain", "godaddy",
                      "domain parking"]
    if any(s in html_lower for s in parked_signals):
        return {"valid": False, "confidence": "none", "reason": "parked domain"}

    # Check if the page mentions the business name
    name_parts = (business_name or "").lower().split()
    # Require at least 2 name parts to match (or all if name is 1-2 words).
    # Only parts longer than 2 characters are counted as matches.
    min_matches = min(2, len(name_parts))
    matches = sum(1 for part in name_parts
                  if len(part) > 2 and part in html_lower)

    # Guard on name_parts: with a blank name min_matches is 0, and
    # "0 >= 0" would otherwise confirm every page that is not parked.
    if name_parts and matches >= min_matches:
        return {"valid": True, "confidence": "confirmed", "reason": "name found in page"}

    # Check title tag
    title_match = re.search(r"<title[^>]*>(.*?)</title>", html, re.IGNORECASE | re.DOTALL)
    if title_match:
        title = title_match.group(1).lower()
        if any(part in title for part in name_parts if len(part) > 2):
            return {"valid": True, "confidence": "probable",
                    "reason": "partial name in title"}

    # Check for funeral-related content (it's at least a funeral business)
    funeral_signals = ["funeral", "cremation", "burial", "memorial",
                       "chapel", "obituar", "condolence"]
    if any(s in html_lower for s in funeral_signals):
        return {"valid": True, "confidence": "probable",
                "reason": "funeral content found, name not confirmed"}

    return {"valid": False, "confidence": "low",
            "reason": "business name not found on page"}


def guess_urls(business_name: str) -> list[str]:
    """Generate candidate ``.com.au`` URLs from a business name.

    The name is lower-cased, apostrophes and company suffixes
    (pty, ltd, limited, proprietary, inc) are removed, and two slugs are
    derived: one with all remaining non-alphanumerics dropped
    ("smithsons") and one with them collapsed to hyphens ("smith-sons").
    Each distinct, non-empty slug yields a ``www.`` and a bare-host URL.

    Args:
        business_name: Human-readable business name.

    Returns:
        Candidate URLs in preference order, without duplicates. Empty when
        nothing usable remains after cleaning.
    """
    cleaned = business_name.lower().strip()
    # Drop straight and typographic apostrophes and backticks so that both
    # "O'Brien" and "O\u2019Brien" become "obrien" rather than "o-brien".
    cleaned = re.sub(r"['\u2018\u2019`]", "", cleaned)
    cleaned = re.sub(r"\b(pty|ltd|limited|proprietary|inc)\b", "", cleaned)

    slug = re.sub(r"[^a-z0-9]+", "", cleaned)
    slug_hyphen = re.sub(r"[^a-z0-9]+", "-", cleaned).strip("-")

    candidates = []
    # dict.fromkeys de-duplicates while keeping order: for a single-word
    # name both slugs are identical and must not be tried twice.
    for s in dict.fromkeys([slug, slug_hyphen]):
        if s:
            candidates.append(f"https://www.{s}.com.au")
            candidates.append(f"https://{s}.com.au")

    return candidates


def discover_website(name: str, suburb: str | None, state: str | None,
                     phone: str | None = None) -> dict | None:
    """Try to find the website of a business.

    Strategies, in order:
      1. Serper.dev search for "<name> <suburb> <state>".
      2. DuckDuckGo search, used only when Serper produced no results.
      3. Guessing ``.com.au`` URLs from the business name.

    Only the first three search results are validated.

    Args:
        name: Business name; also used to validate candidate pages.
        suburb: Optional suburb appended to the search query.
        state: Optional state appended to the search query.
        phone: Accepted for interface compatibility; not used here.

    Returns:
        ``{url, confidence, method, validation}`` for the first candidate
        that validates (``method`` is "search" or "guess"), or None.
    """
    def _as_result(candidate: str, verdict: dict, method: str) -> dict:
        # Stored URL never carries a trailing slash.
        return {
            "url": candidate.rstrip("/"),
            "confidence": verdict["confidence"],
            "method": method,
            "validation": verdict,
        }

    # Name is always part of the query; suburb/state only when present.
    query = " ".join([name, *(extra for extra in (suburb, state) if extra)])

    # Strategies 1 and 2: Serper first, DuckDuckGo only if Serper is empty.
    hits = search_serper(query) or search_ddg(query)

    for candidate in hits[:3]:
        verdict = validate_url(candidate, name)
        if verdict["valid"]:
            return _as_result(candidate, verdict, "search")
        time.sleep(0.5)

    # Strategy 3: URL guessing
    for candidate in guess_urls(name):
        try:
            verdict = validate_url(candidate, name)
            if verdict["valid"]:
                return _as_result(candidate, verdict, "guess")
        except Exception:
            # Move straight on to the next guess (no pause).
            continue
        time.sleep(0.3)

    return None


def run(limit: int | None = None, state_filter: str | None = None):
    """Discover websites for all providers without one.

    Selects unverified ``funeral_brand`` rows whose ``website`` is NULL,
    runs discover_website() for each and stores every hit in the row.
    Progress is committed every 20 providers; whatever was found is also
    committed, and the connection closed, if the run is interrupted.

    Args:
        limit: Max providers to process (for testing). None means no
            limit; 0 processes nothing.
        state_filter: Only process providers in this state.
    """
    db = get_db()

    query = """
        SELECT id, title, business_suburb, business_state, phone
        FROM funeral_brand
        WHERE website IS NULL AND verified = 0
    """
    params = []

    if state_filter:
        query += " AND business_state = ?"
        params.append(state_filter)

    query += " ORDER BY id"

    # Bind the limit as a parameter instead of formatting it into the SQL,
    # and test against None so that an explicit 0 is honoured.
    if limit is not None:
        query += " LIMIT ?"
        params.append(int(limit))

    found = 0
    not_found = 0

    try:
        providers = db.execute(query, params).fetchall()
        print(f"Providers without websites: {len(providers)}")

        for i, prov in enumerate(providers):
            name = prov["title"]
            suburb = prov["business_suburb"]
            state = prov["business_state"]
            phone = prov["phone"]

            # A row without a title cannot be searched for; skip it rather
            # than letting one bad row abort the whole run.
            if not name:
                not_found += 1
                continue

            if (i + 1) % 10 == 0 or i == 0:
                print(f"  [{i+1}/{len(providers)}] Processing: {name}")

            result = discover_website(name, suburb, state, phone)

            if result:
                db.execute(
                    """UPDATE funeral_brand
                       SET website = ?, updated_at = datetime('now')
                       WHERE id = ?""",
                    (result["url"], prov["id"])
                )
                found += 1
                # Log every hit among the first 20 providers, afterwards
                # only the confirmed ones.
                if (i + 1) <= 20 or result["confidence"] == "confirmed":
                    print(f"    FOUND ({result['confidence']}, {result['method']}): "
                          f"{result['url']}")
            else:
                not_found += 1

            if (i + 1) % 20 == 0:
                db.commit()

            # Rate limit: ~2s between providers (DDG + validation requests)
            time.sleep(CRAWL_DELAY * 2)
    finally:
        # Each UPDATE is complete on its own, so keep partial progress even
        # when the loop is interrupted, and always release the connection.
        try:
            db.commit()
        finally:
            db.close()

    print(f"\nDone: {found} websites found, {not_found} not found")
    total = found + not_found
    if total > 0:
        print(f"  Success rate: {found / total * 100:.1f}%")


if __name__ == "__main__":
    # Command-line entry point. Recognised arguments:
    #   --state=XX   only process providers in that state
    #   --limit=N    process at most N providers
    #   N            bare integer, same as --limit=N
    # Any other argument is ignored. When several limits are given the
    # last one wins.
    import sys

    limit, state = None, None

    for arg in sys.argv[1:]:
        if arg.startswith("--state="):
            state = arg.split("=")[1]
            continue
        if arg.startswith("--limit="):
            limit = int(arg.split("=")[1])
            continue
        # Neither flag matched: treat a bare integer as the limit.
        try:
            limit = int(arg)
        except ValueError:
            pass

    run(limit=limit, state_filter=state)