Files
Provider-Crawl/crawlers/discover_websites.py
Richie cc91427789 Initial commit: funeral provider discovery pipeline
Python crawlers for VIC Register, Funerals Australia, NFDA
n8n workflows for scheduled discovery and enrichment
SQLite schema and seeded dev database (1,463 providers)
End-to-end process documentation in n8n/PROCESS.md
2026-04-24 10:27:08 +10:00

321 lines
9.9 KiB
Python

"""Website discovery module.
For each provider without a website URL, attempts to find their website
using multiple strategies (tried in order):
1. Serper.dev (2,500 free Google searches, no credit card needed)
2. DuckDuckGo lite (free fallback, rate-limited)
3. URL pattern guessing (businessname.com.au)
Also validates discovered URLs to confirm they belong to the business.
Configuration:
Set the SERPER_API_KEY environment variable, or the 'serper_api_key' entry in
config.json, to enable Serper.dev. Without it, searches fall back to DuckDuckGo.
"""
import json
import os
import re
import time
import urllib.parse
import urllib.request
import urllib.error
from pathlib import Path
from base import (
fetch_url, get_db, normalize_phone, CRAWL_DELAY,
)
# Resolve the Serper.dev API key. The SERPER_API_KEY environment variable
# wins; when it is unset or empty, fall back to the "serper_api_key" entry of
# a config.json file located next to this module. The key stays None/empty
# when neither source provides one, which disables the Serper strategy.
SERPER_API_KEY = os.environ.get("SERPER_API_KEY")
if not SERPER_API_KEY:
    config_path = Path(__file__).parent / "config.json"
    if config_path.exists():
        # JSON is UTF-8 by specification; decode it explicitly instead of
        # relying on the platform's locale encoding.
        with open(config_path, encoding="utf-8") as f:
            config = json.load(f)
        SERPER_API_KEY = config.get("serper_api_key")
# Substrings that disqualify a search result. A result URL containing any of
# these (case-insensitively) is treated as a directory, aggregator, social
# network or search engine rather than the business's own website.
SKIP_DOMAINS = [
    # Business directories
    "yellowpages",
    "whitepages",
    "truelocal",
    "yelp",
    "cylex",
    "australia247",
    "showmelocal",
    "hotfrog",
    "localsearch",
    # Social networks
    "facebook.com",
    "linkedin.com",
    "instagram.com",
    "twitter.com",
    # Funeral comparison / listing sites
    "gatheredhere",
    "ezifunerals",
    "funeralocity",
    "funeraldirectory",
    # Obituary and tribute sites
    "deathsandfunerals",
    "mytributes",
    "obits.com",
    # Search engines
    "duckduckgo.com",
    "google.com",
    "bing.com",
    # Industry associations
    "nfda.com.au",
    "funeralsaustralia.org",
    # General reference / media
    "wikipedia.org",
    "youtube.com",
]
def search_serper(query: str) -> list[str]:
    """Run a Google search through Serper.dev and return the usable links.

    Args:
        query: Free-text search query.

    Returns:
        Links of the organic results, in result order, excluding entries
        without a link and links that contain any SKIP_DOMAINS substring.
        An empty list when no API key is configured or when the request or
        response decoding fails for any reason.
    """
    if not SERPER_API_KEY:
        return []

    # Supplying a request body makes urllib issue a POST.
    payload = json.dumps({"q": query, "gl": "au", "num": 10}).encode("utf-8")
    request = urllib.request.Request(
        "https://google.serper.dev/search",
        data=payload,
        headers={
            "X-API-KEY": SERPER_API_KEY,
            "Content-Type": "application/json",
        },
    )

    try:
        with urllib.request.urlopen(request, timeout=15) as response:
            body = json.loads(response.read().decode("utf-8"))
    except Exception:
        # Best effort: any failure simply yields no results so the caller
        # can move on to its next strategy.
        return []

    links = (entry.get("link", "") for entry in body.get("organic", []))
    return [
        link
        for link in links
        if link and not any(domain in link.lower() for domain in SKIP_DOMAINS)
    ]
def search_ddg(query: str) -> list[str]:
    """Query DuckDuckGo lite and return the filtered result URLs.

    Args:
        query: Free-text search query.

    Returns:
        Decoded target URLs in page order, excluding ad links and URLs that
        contain any SKIP_DOMAINS substring. An empty list when the results
        page cannot be fetched.
    """
    search_url = "https://lite.duckduckgo.com/lite/?q=" + urllib.parse.quote(query)
    try:
        page = fetch_url(search_url)
    except Exception:
        return []

    # DDG lite wraps every result in a redirect link; the real destination
    # is the percent-encoded value of the "uddg" query parameter.
    encoded_targets = re.findall(
        r'href="//duckduckgo\.com/l/\?uddg=([^&"]+)', page
    )
    targets = (urllib.parse.unquote(raw) for raw in encoded_targets)

    return [
        target
        for target in targets
        # Drop sponsored results ...
        if "ad_domain" not in target
        and "ad_provider" not in target
        # ... and directory/aggregator sites.
        and not any(domain in target.lower() for domain in SKIP_DOMAINS)
    ]
def _classify_page(html: str, business_name: str) -> dict:
    """Classify already-fetched page HTML against a business name.

    Returns the same {valid, confidence, reason} dict as validate_url().
    """
    html_lower = html.lower()

    # Parked / for-sale domains are rejected outright.
    # NOTE(review): these are matched as substrings anywhere in the page, so
    # broad entries such as "this domain" and "godaddy" can also reject
    # legitimate sites that merely mention them.
    parked_signals = ["domain is for sale", "buy this domain",
                      "parked domain", "this domain", "godaddy",
                      "domain parking"]
    if any(signal in html_lower for signal in parked_signals):
        return {"valid": False, "confidence": "none", "reason": "parked domain"}

    # Only words longer than two characters are meaningful for matching.
    # The threshold is derived from those words (not from every word in the
    # name) so that an empty name can never be "confirmed" and a name with
    # short tokens ("Jo Smith") is still able to reach the threshold.
    name_parts = [part for part in business_name.lower().split()
                  if len(part) > 2]
    required = min(2, len(name_parts))
    matches = sum(1 for part in name_parts if part in html_lower)
    if name_parts and matches >= required:
        return {"valid": True, "confidence": "confirmed",
                "reason": "name found in page"}

    # Weaker evidence: any significant name word inside the <title> tag.
    title_match = re.search(r"<title[^>]*>(.*?)</title>", html,
                            re.IGNORECASE | re.DOTALL)
    if title_match:
        title = title_match.group(1).lower()
        if any(part in title for part in name_parts):
            return {"valid": True, "confidence": "probable",
                    "reason": "partial name in title"}

    # Weakest evidence: the page is at least about funerals.
    funeral_signals = ["funeral", "cremation", "burial", "memorial",
                       "chapel", "obituar", "condolence"]
    if any(signal in html_lower for signal in funeral_signals):
        return {"valid": True, "confidence": "probable",
                "reason": "funeral content found, name not confirmed"}

    return {"valid": False, "confidence": "low",
            "reason": "business name not found on page"}


def validate_url(url: str, business_name: str) -> dict:
    """Validate that a URL is a real website belonging to this business.

    The page is fetched with fetch_url() (15 second timeout; assumed to
    return the page body as text) and then checked, in order, for
    parked-domain wording, the business name in the page, a name word in the
    <title>, and generic funeral-related wording.

    Args:
        url: Candidate website URL.
        business_name: Name of the business the site should belong to.

    Returns:
        A dict {valid: bool, confidence: str, reason: str}. confidence is
        "confirmed" or "probable" for valid pages and "none" or "low" for
        rejected ones. Fetch failures are reported as invalid with the HTTP
        status code or the first 100 characters of the error as the reason;
        they are never raised.
    """
    try:
        html = fetch_url(url, timeout=15)
    except urllib.error.HTTPError as e:
        return {"valid": False, "confidence": "none", "reason": f"HTTP {e.code}"}
    except Exception as e:
        return {"valid": False, "confidence": "none", "reason": str(e)[:100]}
    return _classify_page(html, business_name)
def guess_urls(business_name: str) -> list[str]:
    """Generate candidate .com.au URLs from a business name.

    The name is lower-cased, apostrophes/backticks and company suffixes
    (pty, ltd, limited, proprietary, inc) are removed, and two slugs are
    derived: one with all remaining non-alphanumerics dropped
    ("smithfunerals") and one with them collapsed to hyphens
    ("smith-funerals"). Each distinct, non-empty slug yields a "www." and a
    bare-host HTTPS URL.

    Args:
        business_name: Business name as stored for the provider.

    Returns:
        Candidate URLs in probe order, without duplicates. Empty when the
        name contains nothing usable for a domain.
    """
    cleaned = business_name.lower().strip()
    # Apostrophes are dropped rather than turned into separators so that
    # "O'Brien" becomes "obrien" in both slugs; typographic (curly) quotes
    # are handled the same way as the straight one.
    cleaned = re.sub(r"['\u2018\u2019`]", "", cleaned)
    cleaned = re.sub(r"\b(pty|ltd|limited|proprietary|inc)\b", "", cleaned)

    compact = re.sub(r"[^a-z0-9]+", "", cleaned)
    hyphenated = re.sub(r"[^a-z0-9]+", "-", cleaned).strip("-")

    candidates = []
    # dict.fromkeys keeps order while removing the duplicate slug that
    # single-word names produce, so no URL is probed twice.
    for slug in dict.fromkeys((compact, hyphenated)):
        if slug:
            candidates.append(f"https://www.{slug}.com.au")
            candidates.append(f"https://{slug}.com.au")
    return candidates
def discover_website(name: str, suburb: str | None, state: str | None,
                     phone: str | None = None) -> dict | None:
    """Try to locate a business website by searching, then by URL guessing.

    Search phase: the query is the name followed by the suburb and state
    when given. Serper.dev is asked first and DuckDuckGo only when Serper
    returns nothing; the first three results are validated in order.

    Guess phase: every candidate from guess_urls() is validated in order.

    Args:
        name: Business name.
        suburb: Suburb to narrow the search, if known.
        state: State to narrow the search, if known.
        phone: Accepted for interface compatibility; not used here.

    Returns:
        For the first URL that validates, a dict with keys "url" (trailing
        slashes stripped), "confidence", "method" ("search" or "guess") and
        "validation" (the full validate_url() result); None when nothing
        validates.
    """
    def _found(url: str, verdict: dict, method: str) -> dict:
        # Shape of a successful discovery result.
        return {
            "url": url.rstrip("/"),
            "confidence": verdict["confidence"],
            "method": method,
            "validation": verdict,
        }

    terms = [name, *(extra for extra in (suburb, state) if extra)]
    search_query = " ".join(terms)

    # Strategy 1: Serper.dev; Strategy 2: DuckDuckGo when Serper is empty.
    hits = search_serper(search_query) or search_ddg(search_query)
    for candidate in hits[:3]:
        verdict = validate_url(candidate, name)
        if verdict["valid"]:
            return _found(candidate, verdict, "search")
        time.sleep(0.5)  # pause before trying the next search result

    # Strategy 3: URL pattern guessing.
    for candidate in guess_urls(name):
        try:
            verdict = validate_url(candidate, name)
            if verdict["valid"]:
                return _found(candidate, verdict, "guess")
        except Exception:
            # A guess that blows up is just a miss; try the next one.
            continue
        time.sleep(0.3)

    return None
def run(limit: int | None = None, state_filter: str | None = None):
    """Discover websites for all unverified providers that have none.

    Selects funeral_brand rows with a NULL website and verified = 0, runs
    discover_website() for each, and stores every URL found. Updates are
    committed every 20 providers and once more at the end; the final commit
    and the connection close also happen when the run is interrupted by an
    exception, so already-discovered URLs are not lost.

    Args:
        limit: Max providers to process (for testing). None or 0 means no
            limit.
        state_filter: Only process providers in this state.
    """
    db = get_db()
    try:
        query = """
            SELECT id, title, business_suburb, business_state, phone
            FROM funeral_brand
            WHERE website IS NULL AND verified = 0
        """
        params: list = []
        if state_filter:
            query += " AND business_state = ?"
            params.append(state_filter)
        query += " ORDER BY id"
        if limit:
            # Bind the limit instead of formatting it into the SQL text.
            query += " LIMIT ?"
            params.append(int(limit))

        providers = db.execute(query, params).fetchall()
        print(f"Providers without websites: {len(providers)}")

        found = 0
        not_found = 0
        for i, prov in enumerate(providers):
            name = prov["title"]
            suburb = prov["business_suburb"]
            state = prov["business_state"]
            phone = prov["phone"]

            # Progress line for the first provider and every tenth one.
            if (i + 1) % 10 == 0 or i == 0:
                print(f"  [{i+1}/{len(providers)}] Processing: {name}")

            result = discover_website(name, suburb, state, phone)
            if result:
                db.execute(
                    """UPDATE funeral_brand
                       SET website = ?, updated_at = datetime('now')
                       WHERE id = ?""",
                    (result["url"], prov["id"])
                )
                found += 1
                # Keep the log short: report every hit among the first 20
                # providers, afterwards only confirmed ones.
                if (i + 1) <= 20 or result["confidence"] == "confirmed":
                    print(f"    FOUND ({result['confidence']}, {result['method']}): "
                          f"{result['url']}")
            else:
                not_found += 1

            if (i + 1) % 20 == 0:
                db.commit()

            # Rate limit: ~2s between providers (DDG + validation requests)
            time.sleep(CRAWL_DELAY * 2)

        print(f"\nDone: {found} websites found, {not_found} not found")
        total = found + not_found
        if total > 0:
            print(f"  Success rate: {found/total*100:.1f}%")
        else:
            print("")
    finally:
        # Persist whatever was discovered and always release the connection,
        # even when the loop was aborted part-way through.
        try:
            db.commit()
        finally:
            db.close()
if __name__ == "__main__":
    # Command line: [--state=STATE] [--limit=N] [N]
    # A bare integer argument is shorthand for --limit; anything else that
    # is not recognised is ignored. Later arguments override earlier ones.
    import sys

    state = None
    limit = None
    for token in sys.argv[1:]:
        pieces = token.split("=")
        if token.startswith("--state="):
            state = pieces[1]
        elif token.startswith("--limit="):
            limit = int(pieces[1])
        else:
            try:
                limit = int(token)
            except ValueError:
                pass

    run(limit=limit, state_filter=state)