Initial commit: funeral provider discovery pipeline
Python crawlers for VIC Register, Funerals Australia, and NFDA; n8n workflows for scheduled discovery and enrichment; SQLite schema and seeded dev database (1,463 providers); end-to-end process documentation in n8n/PROCESS.md.
This commit is contained in:
320
crawlers/discover_websites.py
Normal file
320
crawlers/discover_websites.py
Normal file
@@ -0,0 +1,320 @@
|
||||
"""Website discovery module.
|
||||
|
||||
For each provider without a website URL, attempts to find their website
|
||||
using multiple strategies (tried in order):
|
||||
|
||||
1. Serper.dev (2,500 free Google searches, no CC needed)
|
||||
2. DuckDuckGo lite (free fallback, rate-limited)
|
||||
3. URL pattern guessing (businessname.com.au)
|
||||
|
||||
Also validates discovered URLs to confirm they belong to the business.
|
||||
|
||||
Configuration:
|
||||
Set the SERPER_API_KEY env var (or "serper_api_key" in config.json) to enable Serper.dev.
|
||||
Without it, falls back to DuckDuckGo.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import urllib.parse
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
from pathlib import Path
|
||||
|
||||
from base import (
|
||||
fetch_url, get_db, normalize_phone, CRAWL_DELAY,
|
||||
)
|
||||
|
||||
# Load Serper API key from env or config
|
||||
SERPER_API_KEY = os.environ.get("SERPER_API_KEY")
|
||||
if not SERPER_API_KEY:
|
||||
config_path = Path(__file__).parent / "config.json"
|
||||
if config_path.exists():
|
||||
with open(config_path) as f:
|
||||
config = json.load(f)
|
||||
SERPER_API_KEY = config.get("serper_api_key")
|
||||
|
||||
# Domains to skip when extracting search results
|
||||
SKIP_DOMAINS = [
|
||||
"yellowpages", "whitepages", "truelocal", "yelp", "cylex",
|
||||
"australia247", "showmelocal", "hotfrog", "localsearch",
|
||||
"facebook.com", "linkedin.com", "instagram.com", "twitter.com",
|
||||
"gatheredhere", "ezifunerals", "funeralocity", "funeraldirectory",
|
||||
"deathsandfunerals", "mytributes", "obits.com",
|
||||
"duckduckgo.com", "google.com", "bing.com",
|
||||
"nfda.com.au", "funeralsaustralia.org",
|
||||
"wikipedia.org", "youtube.com",
|
||||
]
|
||||
|
||||
|
||||
def search_serper(query: str) -> list[str]:
    """Search via Serper.dev (Google results as JSON). 2,500 free queries.

    Args:
        query: Free-text search query.

    Returns:
        The "link" of each organic result, in response order, excluding
        links whose host contains a SKIP_DOMAINS entry. Returns an empty
        list when no API key is configured or when the request or response
        could not be processed.
    """
    if not SERPER_API_KEY:
        return []

    payload = json.dumps({"q": query, "gl": "au", "num": 10}).encode("utf-8")
    req = urllib.request.Request(
        "https://google.serper.dev/search",
        data=payload,
        headers={
            "X-API-KEY": SERPER_API_KEY,
            "Content-Type": "application/json",
        },
    )

    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            result = json.loads(resp.read().decode("utf-8"))
    except Exception:
        # Deliberate best-effort: any network, HTTP or decoding failure is
        # reported as "no results" so the caller can use another strategy.
        return []

    if not isinstance(result, dict):
        return []

    links = []
    for item in result.get("organic") or []:
        link = item.get("link") if isinstance(item, dict) else None
        if not link or not isinstance(link, str):
            continue

        # Match skip markers against the host only, so a marker that merely
        # appears in a path or query string (e.g. "?ref=google.com") does
        # not discard a legitimate business site.
        try:
            host = urllib.parse.urlsplit(link).netloc.lower()
        except ValueError:
            host = ""
        haystack = host or link.lower()
        if any(d in haystack for d in SKIP_DOMAINS):
            continue

        links.append(link)

    return links
|
||||
|
||||
|
||||
def search_ddg(query: str) -> list[str]:
    """Search DuckDuckGo lite and return result URLs (filtered).

    Args:
        query: Free-text search query.

    Returns:
        Decoded target URLs taken from DuckDuckGo's redirect links, in page
        order, excluding ads and URLs whose host contains a SKIP_DOMAINS
        entry. Returns an empty list when the results page cannot be
        fetched.
    """
    encoded = urllib.parse.quote(query)
    url = f"https://lite.duckduckgo.com/lite/?q={encoded}"

    try:
        html = fetch_url(url)
    except Exception:
        # Best-effort: a failed search is reported as "no results".
        return []

    # Extract redirect URLs from DDG lite format: each result is linked as
    # //duckduckgo.com/l/?uddg=<percent-encoded target>&...
    raw_links = re.findall(
        r'href="//duckduckgo\.com/l/\?uddg=([^&"]+)', html
    )

    results = []
    for link in raw_links:
        decoded = urllib.parse.unquote(link)

        # Skip ads
        if "ad_domain" in decoded or "ad_provider" in decoded:
            continue

        # Skip directory/aggregator sites. Match against the host only so a
        # marker that merely appears in a path or query string does not
        # discard a legitimate business site.
        try:
            host = urllib.parse.urlsplit(decoded).netloc.lower()
        except ValueError:
            host = ""
        haystack = host or decoded.lower()
        if any(d in haystack for d in SKIP_DOMAINS):
            continue

        results.append(decoded)

    return results
|
||||
|
||||
|
||||
def validate_url(url: str, business_name: str) -> dict:
    """Validate that a URL is a real website belonging to this business.

    The checks run in order and the first one that decides wins:

    1. The page cannot be fetched            -> invalid ("none").
    2. The page contains a parked-domain cue -> invalid ("none").
    3. Enough significant name words appear anywhere in the page
                                             -> valid ("confirmed").
    4. Any significant name word appears in the <title>
                                             -> valid ("probable").
    5. The page contains funeral-related vocabulary
                                             -> valid ("probable").
    6. Otherwise                             -> invalid ("low").

    A "significant" word is a whitespace-separated token of the business
    name longer than two characters.

    Args:
        url: Candidate website URL.
        business_name: Name of the business the site should belong to.

    Returns: {valid: bool, confidence: str, reason: str}
    """
    try:
        html = fetch_url(url, timeout=15)
    except urllib.error.HTTPError as e:
        return {"valid": False, "confidence": "none", "reason": f"HTTP {e.code}"}
    except Exception as e:
        return {"valid": False, "confidence": "none", "reason": str(e)[:100]}

    html_lower = html.lower()

    # Check if it's a parked/for-sale domain
    # NOTE(review): "this domain" and "godaddy" are broad cues and may also
    # reject live sites that merely mention them - confirm against real data.
    parked_signals = ["domain is for sale", "buy this domain",
                      "parked domain", "this domain", "godaddy",
                      "domain parking"]
    if any(s in html_lower for s in parked_signals):
        return {"valid": False, "confidence": "none", "reason": "parked domain"}

    # Check if the page mentions the business name.
    # Only words longer than two characters can match, so the threshold is
    # derived from those words. This keeps an empty name from "confirming"
    # every page (threshold 0) and lets names like "Al Smith" be confirmed.
    significant = [part for part in business_name.lower().split()
                   if len(part) > 2]

    if significant:
        # Require at least 2 significant words (or the only one there is)
        min_matches = min(2, len(significant))
        matches = sum(1 for part in significant if part in html_lower)
        if matches >= min_matches:
            return {"valid": True, "confidence": "confirmed",
                    "reason": "name found in page"}

    # Check title tag
    title_match = re.search(r"<title[^>]*>(.*?)</title>", html,
                            re.IGNORECASE | re.DOTALL)
    if title_match:
        title = title_match.group(1).lower()
        if any(part in title for part in significant):
            return {"valid": True, "confidence": "probable",
                    "reason": "partial name in title"}

    # Check for funeral-related content (it's at least a funeral business)
    funeral_signals = ["funeral", "cremation", "burial", "memorial",
                       "chapel", "obituar", "condolence"]
    if any(s in html_lower for s in funeral_signals):
        return {"valid": True, "confidence": "probable",
                "reason": "funeral content found, name not confirmed"}

    return {"valid": False, "confidence": "low",
            "reason": "business name not found on page"}
|
||||
|
||||
|
||||
def guess_urls(business_name: str) -> list[str]:
    """Generate candidate URLs from a business name.

    The name is lower-cased, apostrophes and company suffixes (pty, ltd,
    limited, proprietary, inc) are removed, and two slugs are derived: one
    with all remaining non-alphanumerics dropped ("smithfunerals") and one
    with them collapsed to hyphens ("smith-funerals"). Each distinct,
    non-empty slug yields a www and a bare .com.au URL.

    Args:
        business_name: Business name as stored for the provider.

    Returns:
        Candidate https URLs without duplicates, compact slug first. Empty
        when nothing usable remains of the name.
    """
    cleaned = business_name.lower().strip()
    # Drop straight and typographic apostrophes and backticks so that
    # "O'Brien" and "O\u2019Brien" both become "obrien" in either slug form.
    cleaned = re.sub(r"['\u2018\u2019`]", "", cleaned)
    cleaned = re.sub(r"\b(pty|ltd|limited|proprietary|inc)\b", "", cleaned)

    compact = re.sub(r"[^a-z0-9]+", "", cleaned)
    # Also try hyphenated version
    hyphenated = re.sub(r"[^a-z0-9]+", "-", cleaned).strip("-")

    candidates = []
    # dict.fromkeys de-duplicates while keeping order: single-word names
    # give the same slug twice and should not be fetched twice.
    for slug in dict.fromkeys((compact, hyphenated)):
        if slug:
            candidates.append(f"https://www.{slug}.com.au")
            candidates.append(f"https://{slug}.com.au")

    return candidates
|
||||
|
||||
|
||||
def _first_valid_candidate(urls, name: str, method: str, pause: float,
                           tried: set) -> dict | None:
    """Return a result dict for the first URL in `urls` that validates.

    URLs already present in `tried` are skipped, and every URL attempted is
    added to it. Sleeps `pause` seconds after each rejected candidate.
    """
    for url in urls:
        if url in tried:
            continue
        tried.add(url)

        try:
            validation = validate_url(url, name)
        except Exception:
            # Best-effort: an unexpected failure on one candidate must not
            # abort discovery for this provider.
            validation = None

        if validation and validation["valid"]:
            return {
                "url": url.rstrip("/"),
                "confidence": validation["confidence"],
                "method": method,
                "validation": validation,
            }
        time.sleep(pause)

    return None


def discover_website(name: str, suburb: str | None, state: str | None,
                     phone: str | None = None) -> dict | None:
    """Attempt to discover a business website.

    Args:
        name: Business name; used in the search query, for URL guessing
            and for validating candidate pages.
        suburb: Optional suburb appended to the search query.
        state: Optional state appended to the search query.
        phone: Accepted for interface compatibility; currently unused.

    Returns: {url, confidence, method, validation} or None.
    """
    # Build search query
    query_parts = [name]
    if suburb:
        query_parts.append(suburb)
    if state:
        query_parts.append(state)
    query = " ".join(query_parts)

    # Strategy 1: Serper.dev (Google results, 2500 free)
    results = search_serper(query)

    # Strategy 2: DuckDuckGo fallback (only when Serper returned nothing)
    if not results:
        results = search_ddg(query)

    tried = set()

    # Validate only the top three distinct search hits.
    top_results = list(dict.fromkeys(results))[:3]
    found = _first_valid_candidate(top_results, name, "search", 0.5, tried)
    if found:
        return found

    # Strategy 3: URL guessing
    return _first_valid_candidate(guess_urls(name), name, "guess", 0.3, tried)
|
||||
|
||||
|
||||
def run(limit: int | None = None, state_filter: str | None = None):
    """Discover websites for all providers without one.

    Selects funeral_brand rows with no website and verified = 0, tries to
    discover a website for each, and stores every hit in the row's website
    column. Progress is printed to stdout.

    Args:
        limit: Max providers to process (for testing).
        state_filter: Only process providers in this state.
    """
    db = get_db()

    try:
        query = """
            SELECT id, title, business_suburb, business_state, phone
            FROM funeral_brand
            WHERE website IS NULL AND verified = 0
        """
        params = []

        if state_filter:
            query += " AND business_state = ?"
            params.append(state_filter)

        query += " ORDER BY id"

        if limit:
            # Bind the limit rather than formatting it into the SQL text.
            query += " LIMIT ?"
            params.append(limit)

        providers = db.execute(query, params).fetchall()
        print(f"Providers without websites: {len(providers)}")

        found = 0
        not_found = 0

        try:
            for i, prov in enumerate(providers):
                name = prov["title"]
                suburb = prov["business_suburb"]
                state = prov["business_state"]
                phone = prov["phone"]

                if (i + 1) % 10 == 0 or i == 0:
                    print(f"  [{i+1}/{len(providers)}] Processing: {name}")

                result = discover_website(name, suburb, state, phone)

                if result:
                    db.execute(
                        """UPDATE funeral_brand
                           SET website = ?, updated_at = datetime('now')
                           WHERE id = ?""",
                        (result["url"], prov["id"])
                    )
                    found += 1
                    # Log every hit among the first 20 providers, then only
                    # confirmed ones, to keep long runs readable.
                    if (i + 1) <= 20 or result["confidence"] == "confirmed":
                        print(f"    FOUND ({result['confidence']}, {result['method']}): "
                              f"{result['url']}")
                else:
                    not_found += 1

                # Commit in batches of 20 providers.
                if (i + 1) % 20 == 0:
                    db.commit()

                # Rate limit between providers (search + validation requests)
                time.sleep(CRAWL_DELAY * 2)
        finally:
            # Keep websites found so far even if the loop is interrupted
            # (error or Ctrl-C) between batch commits.
            db.commit()

        print(f"\nDone: {found} websites found, {not_found} not found")
        total = found + not_found
        if total > 0:
            print(f"  Success rate: {found/total*100:.1f}%")
    finally:
        db.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Usage: discover_websites.py [LIMIT] [--limit=N] [--state=STATE]
    import sys

    limit = None
    state = None

    for arg in sys.argv[1:]:
        if arg.startswith("--state="):
            # partition keeps everything after the first "=", even if the
            # value itself contains "=".
            state = arg.partition("=")[2]
        elif arg.startswith("--limit="):
            value = arg.partition("=")[2]
            try:
                limit = int(value)
            except ValueError:
                sys.exit(f"Invalid --limit value: {value!r} (expected an integer)")
        else:
            # A bare integer is accepted as the limit; anything else is
            # ignored, but reported so that typos do not go unnoticed.
            try:
                limit = int(arg)
            except ValueError:
                print(f"Ignoring unrecognised argument: {arg}", file=sys.stderr)

    run(limit=limit, state_filter=state)
|
||||
Reference in New Issue
Block a user