"""Website discovery module. For each provider without a website URL, attempts to find their website using multiple strategies (tried in order): 1. Serper.dev (2,500 free Google searches, no CC needed) 2. DuckDuckGo lite (free fallback, rate-limited) 3. URL pattern guessing (businessname.com.au) Also validates discovered URLs to confirm they belong to the business. Configuration: Set SERPER_API_KEY env var or in config.json to enable Serper.dev. Without it, falls back to DuckDuckGo. """ import json import os import re import time import urllib.parse import urllib.request import urllib.error from pathlib import Path from base import ( fetch_url, get_db, normalize_phone, CRAWL_DELAY, ) # Load Serper API key from env or config SERPER_API_KEY = os.environ.get("SERPER_API_KEY") if not SERPER_API_KEY: config_path = Path(__file__).parent / "config.json" if config_path.exists(): with open(config_path) as f: config = json.load(f) SERPER_API_KEY = config.get("serper_api_key") # Domains to skip when extracting search results SKIP_DOMAINS = [ "yellowpages", "whitepages", "truelocal", "yelp", "cylex", "australia247", "showmelocal", "hotfrog", "localsearch", "facebook.com", "linkedin.com", "instagram.com", "twitter.com", "gatheredhere", "ezifunerals", "funeralocity", "funeraldirectory", "deathsandfunerals", "mytributes", "obits.com", "duckduckgo.com", "google.com", "bing.com", "nfda.com.au", "funeralsaustralia.org", "wikipedia.org", "youtube.com", ] def search_serper(query: str) -> list[str]: """Search via Serper.dev (Google results as JSON). 2,500 free queries.""" if not SERPER_API_KEY: return [] url = "https://google.serper.dev/search" data = json.dumps({"q": query, "gl": "au", "num": 10}).encode("utf-8") req = urllib.request.Request(url, data=data, headers={ "X-API-KEY": SERPER_API_KEY, "Content-Type": "application/json", }) try: with urllib.request.urlopen(req, timeout=15) as resp: result = json.loads(resp.read().decode("utf-8")) except Exception: return [] results = [] for item in result.get("organic", []): link = item.get("link", "") if not link: continue if any(d in link.lower() for d in SKIP_DOMAINS): continue results.append(link) return results def search_ddg(query: str) -> list[str]: """Search DuckDuckGo lite and return result URLs (filtered).""" encoded = urllib.parse.quote(query) url = f"https://lite.duckduckgo.com/lite/?q={encoded}" try: html = fetch_url(url) except Exception: return [] # Extract redirect URLs from DDG lite format raw_links = re.findall( r'href="//duckduckgo\.com/l/\?uddg=([^&"]+)', html ) results = [] for link in raw_links: decoded = urllib.parse.unquote(link) # Skip ads if "ad_domain" in decoded or "ad_provider" in decoded: continue # Skip directory/aggregator sites if any(d in decoded.lower() for d in SKIP_DOMAINS): continue results.append(decoded) return results def validate_url(url: str, business_name: str) -> dict: """Validate that a URL is a real website belonging to this business. Returns: {valid: bool, confidence: str, reason: str} """ try: html = fetch_url(url, timeout=15) except urllib.error.HTTPError as e: return {"valid": False, "confidence": "none", "reason": f"HTTP {e.code}"} except Exception as e: return {"valid": False, "confidence": "none", "reason": str(e)[:100]} html_lower = html.lower() # Check if it's a parked/for-sale domain parked_signals = ["domain is for sale", "buy this domain", "parked domain", "this domain", "godaddy", "domain parking"] if any(s in html_lower for s in parked_signals): return {"valid": False, "confidence": "none", "reason": "parked domain"} # Check if the page mentions the business name name_parts = business_name.lower().split() # Require at least 2 name parts to match (or all if name is 1-2 words) min_matches = min(2, len(name_parts)) matches = sum(1 for part in name_parts if len(part) > 2 and part in html_lower) if matches >= min_matches: return {"valid": True, "confidence": "confirmed", "reason": "name found in page"} # Check title tag title_match = re.search(r"]*>(.*?)", html, re.IGNORECASE | re.DOTALL) if title_match: title = title_match.group(1).lower() if any(part in title for part in name_parts if len(part) > 2): return {"valid": True, "confidence": "probable", "reason": "partial name in title"} # Check for funeral-related content (it's at least a funeral business) funeral_signals = ["funeral", "cremation", "burial", "memorial", "chapel", "obituar", "condolence"] if any(s in html_lower for s in funeral_signals): return {"valid": True, "confidence": "probable", "reason": "funeral content found, name not confirmed"} return {"valid": False, "confidence": "low", "reason": "business name not found on page"} def guess_urls(business_name: str) -> list[str]: """Generate candidate URLs from a business name.""" # Clean name for domain guessing slug = business_name.lower().strip() slug = re.sub(r"[''`]", "", slug) slug = re.sub(r"\b(pty|ltd|limited|proprietary|inc)\b", "", slug) slug = re.sub(r"[^a-z0-9]+", "", slug) # Also try hyphenated version slug_hyphen = business_name.lower().strip() slug_hyphen = re.sub(r"[''`]", "", slug_hyphen) slug_hyphen = re.sub(r"\b(pty|ltd|limited|proprietary|inc)\b", "", slug_hyphen) slug_hyphen = re.sub(r"[^a-z0-9]+", "-", slug_hyphen).strip("-") candidates = [] for s in [slug, slug_hyphen]: if s: candidates.append(f"https://www.{s}.com.au") candidates.append(f"https://{s}.com.au") return candidates def discover_website(name: str, suburb: str | None, state: str | None, phone: str | None = None) -> dict | None: """Attempt to discover a business website. Returns: {url, confidence, method, validation} or None. """ # Build search query query_parts = [name] if suburb: query_parts.append(suburb) if state: query_parts.append(state) query = " ".join(query_parts) # Strategy 1: Serper.dev (Google results, 2500 free) results = search_serper(query) # Strategy 2: DuckDuckGo fallback if not results: results = search_ddg(query) for url in results[:3]: validation = validate_url(url, name) if validation["valid"]: return { "url": url.rstrip("/"), "confidence": validation["confidence"], "method": "search", "validation": validation, } time.sleep(0.5) # Strategy 2: URL guessing candidates = guess_urls(name) for url in candidates: try: validation = validate_url(url, name) if validation["valid"]: return { "url": url.rstrip("/"), "confidence": validation["confidence"], "method": "guess", "validation": validation, } except Exception: continue time.sleep(0.3) return None def run(limit: int | None = None, state_filter: str | None = None): """Discover websites for all providers without one. Args: limit: Max providers to process (for testing). state_filter: Only process providers in this state. """ db = get_db() query = """ SELECT id, title, business_suburb, business_state, phone FROM funeral_brand WHERE website IS NULL AND verified = 0 """ params = [] if state_filter: query += " AND business_state = ?" params.append(state_filter) query += " ORDER BY id" if limit: query += f" LIMIT {limit}" providers = db.execute(query, params).fetchall() print(f"Providers without websites: {len(providers)}") found = 0 not_found = 0 for i, prov in enumerate(providers): name = prov["title"] suburb = prov["business_suburb"] state = prov["business_state"] phone = prov["phone"] if (i + 1) % 10 == 0 or i == 0: print(f" [{i+1}/{len(providers)}] Processing: {name}") result = discover_website(name, suburb, state, phone) if result: db.execute( """UPDATE funeral_brand SET website = ?, updated_at = datetime('now') WHERE id = ?""", (result["url"], prov["id"]) ) found += 1 if (i + 1) <= 20 or result["confidence"] == "confirmed": print(f" FOUND ({result['confidence']}, {result['method']}): " f"{result['url']}") else: not_found += 1 if (i + 1) % 20 == 0: db.commit() # Rate limit: ~2s between providers (DDG + validation requests) time.sleep(CRAWL_DELAY * 2) db.commit() print(f"\nDone: {found} websites found, {not_found} not found") print(f" Success rate: {found/(found+not_found)*100:.1f}%" if found + not_found > 0 else "") db.close() if __name__ == "__main__": import sys limit = None state = None for arg in sys.argv[1:]: if arg.startswith("--state="): state = arg.split("=")[1] elif arg.startswith("--limit="): limit = int(arg.split("=")[1]) else: try: limit = int(arg) except ValueError: pass run(limit=limit, state_filter=state)