Python crawlers for VIC Register, Funerals Australia, and NFDA; n8n workflows for scheduled discovery and enrichment; SQLite schema and seeded dev database (1,463 providers); end-to-end process documentation in n8n/PROCESS.md.
321 lines
9.9 KiB
Python
321 lines
9.9 KiB
Python
"""Website discovery module.
|
|
|
|
For each provider without a website URL, attempts to find their website
|
|
using multiple strategies (tried in order):
|
|
|
|
1. Serper.dev (2,500 free Google searches, no CC needed)
|
|
2. DuckDuckGo lite (free fallback, rate-limited)
|
|
3. URL pattern guessing (businessname.com.au)
|
|
|
|
Also validates discovered URLs to confirm they belong to the business.
|
|
|
|
Configuration:
|
|
Set SERPER_API_KEY env var or in config.json to enable Serper.dev.
|
|
Without it, falls back to DuckDuckGo.
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import time
|
|
import urllib.parse
|
|
import urllib.request
|
|
import urllib.error
|
|
from pathlib import Path
|
|
|
|
from base import (
|
|
fetch_url, get_db, normalize_phone, CRAWL_DELAY,
|
|
)
|
|
|
|
# Load the Serper API key. The SERPER_API_KEY environment variable wins;
# otherwise fall back to the "serper_api_key" entry of a config.json located
# next to this module. If neither is set the key stays falsy.
SERPER_API_KEY = os.environ.get("SERPER_API_KEY")
if not SERPER_API_KEY:
    config_path = Path(__file__).parent / "config.json"
    if config_path.exists():
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(config_path, encoding="utf-8") as f:
            config = json.load(f)
        # Tolerate a config file whose top level is not a JSON object.
        SERPER_API_KEY = (
            config.get("serper_api_key") if isinstance(config, dict) else None
        )
|
|
|
|
# Substrings identifying search results that are never a provider's own site.
# A result is dropped when any of these substrings matches it.
SKIP_DOMAINS = [
    # Business directories
    "yellowpages",
    "whitepages",
    "truelocal",
    "yelp",
    "cylex",
    "australia247",
    "showmelocal",
    "hotfrog",
    "localsearch",
    # Social networks
    "facebook.com",
    "linkedin.com",
    "instagram.com",
    "twitter.com",
    # Funeral aggregators and obituary sites
    "gatheredhere",
    "ezifunerals",
    "funeralocity",
    "funeraldirectory",
    "deathsandfunerals",
    "mytributes",
    "obits.com",
    # Search engines
    "duckduckgo.com",
    "google.com",
    "bing.com",
    # Industry associations
    "nfda.com.au",
    "funeralsaustralia.org",
    # General reference / video
    "wikipedia.org",
    "youtube.com",
]
|
|
|
|
|
|
def search_serper(query: str) -> list[str]:
    """Search via Serper.dev (Google results as JSON). 2,500 free queries.

    Args:
        query: Free-text search query.

    Returns:
        Links from the "organic" results, in response order, with entries
        whose hostname matches a SKIP_DOMAINS substring removed. Returns an
        empty list when no API key is configured or the request fails.
    """
    if not SERPER_API_KEY:
        return []

    payload = json.dumps({"q": query, "gl": "au", "num": 10}).encode("utf-8")
    req = urllib.request.Request(
        "https://google.serper.dev/search",
        data=payload,
        headers={
            "X-API-KEY": SERPER_API_KEY,
            "Content-Type": "application/json",
        },
    )

    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            result = json.loads(resp.read().decode("utf-8"))
    except Exception:
        # Best-effort lookup: any failure just means "no results" so the
        # caller can move on to its next strategy.
        return []

    if not isinstance(result, dict):
        return []

    results = []
    for item in result.get("organic") or []:
        link = item.get("link", "")
        if not link:
            continue
        # Match skip tokens against the hostname only, so a path or query
        # string that merely mentions e.g. "facebook.com" does not discard a
        # legitimate site. Fall back to the whole link if no host is parsed.
        try:
            host = urllib.parse.urlsplit(link).netloc.lower()
        except ValueError:
            host = ""
        haystack = host or link.lower()
        if any(d in haystack for d in SKIP_DOMAINS):
            continue
        results.append(link)

    return results
|
|
|
|
|
|
def search_ddg(query: str) -> list[str]:
    """Search DuckDuckGo lite and return result URLs (filtered).

    Args:
        query: Free-text search query.

    Returns:
        Decoded target URLs extracted from DuckDuckGo redirect links, in page
        order, excluding ads and entries whose hostname matches a
        SKIP_DOMAINS substring. Returns an empty list if the fetch fails.
    """
    encoded = urllib.parse.quote(query)
    url = f"https://lite.duckduckgo.com/lite/?q={encoded}"

    try:
        html = fetch_url(url)
    except Exception:
        # Best-effort lookup: a failed fetch simply yields no results.
        return []

    # Extract redirect URLs from DDG lite format; the real target is carried
    # percent-encoded in the "uddg" query parameter.
    raw_links = re.findall(
        r'href="//duckduckgo\.com/l/\?uddg=([^&"]+)', html
    )

    results = []
    for link in raw_links:
        decoded = urllib.parse.unquote(link)
        # Skip ads
        if "ad_domain" in decoded or "ad_provider" in decoded:
            continue
        # Skip directory/aggregator sites. Match against the hostname only, so
        # a path or query string that merely mentions a skipped domain does
        # not discard a legitimate site; fall back to the whole URL if no host
        # can be parsed.
        try:
            host = urllib.parse.urlsplit(decoded).netloc.lower()
        except ValueError:
            host = ""
        haystack = host or decoded.lower()
        if any(d in haystack for d in SKIP_DOMAINS):
            continue
        results.append(decoded)

    return results
|
|
|
|
|
|
def validate_url(url: str, business_name: str) -> dict:
    """Validate that a URL is a real website belonging to this business.

    Checks are applied in order and the first decisive one wins:
    fetch failure, parked-domain wording, business name in the page body,
    partial name in the <title>, and finally generic funeral-related content.

    Args:
        url: Candidate website URL to fetch.
        business_name: Name of the business the site should belong to.

    Returns: {valid: bool, confidence: str, reason: str}
        confidence is one of "confirmed", "probable", "low" or "none".
    """
    try:
        html = fetch_url(url, timeout=15)
    except urllib.error.HTTPError as e:
        return {"valid": False, "confidence": "none", "reason": f"HTTP {e.code}"}
    except Exception as e:
        return {"valid": False, "confidence": "none", "reason": str(e)[:100]}

    html_lower = html.lower()

    # Check if it's a parked/for-sale domain
    # NOTE(review): "this domain" and "godaddy" are broad and may also match
    # legitimate sites - confirm whether that is intended.
    parked_signals = ["domain is for sale", "buy this domain",
                      "parked domain", "this domain", "godaddy",
                      "domain parking"]
    if any(s in html_lower for s in parked_signals):
        return {"valid": False, "confidence": "none", "reason": "parked domain"}

    # Check if the page mentions the business name
    name_parts = (business_name or "").lower().split()
    # Require at least 2 name parts to match (or all if name is 1-2 words)
    min_matches = min(2, len(name_parts))
    # Only words longer than two characters count as a match.
    matches = sum(1 for part in name_parts
                  if len(part) > 2 and part in html_lower)

    # An empty name gives min_matches == 0, which would otherwise "confirm"
    # every page; require at least one name part before trusting the count.
    if name_parts and matches >= min_matches:
        return {"valid": True, "confidence": "confirmed", "reason": "name found in page"}

    # Check title tag
    title_match = re.search(r"<title[^>]*>(.*?)</title>", html, re.IGNORECASE | re.DOTALL)
    if title_match:
        title = title_match.group(1).lower()
        if any(part in title for part in name_parts if len(part) > 2):
            return {"valid": True, "confidence": "probable",
                    "reason": "partial name in title"}

    # Check for funeral-related content (it's at least a funeral business)
    funeral_signals = ["funeral", "cremation", "burial", "memorial",
                       "chapel", "obituar", "condolence"]
    if any(s in html_lower for s in funeral_signals):
        return {"valid": True, "confidence": "probable",
                "reason": "funeral content found, name not confirmed"}

    return {"valid": False, "confidence": "low",
            "reason": "business name not found on page"}
|
|
|
|
|
|
def guess_urls(business_name: str) -> list[str]:
    """Generate candidate URLs from a business name.

    The name is lower-cased, apostrophes/backticks and company suffixes
    (pty, ltd, limited, proprietary, inc) are removed, and two slugs are
    built: one with all other characters dropped and one with them replaced
    by hyphens. Each distinct, non-empty slug yields a "www." and a bare
    ".com.au" URL.

    Args:
        business_name: Business name to derive domain guesses from.

    Returns:
        Candidate URLs in preference order without duplicates; an empty list
        when the name contains no usable characters.
    """
    # Shared clean-up for both slug variants.
    cleaned = (business_name or "").lower().strip()
    # Drop straight and typographic apostrophes so "O'Brien" -> "obrien".
    cleaned = re.sub(r"['\u2018\u2019`]", "", cleaned)
    cleaned = re.sub(r"\b(pty|ltd|limited|proprietary|inc)\b", "", cleaned)

    slug = re.sub(r"[^a-z0-9]+", "", cleaned)
    slug_hyphen = re.sub(r"[^a-z0-9]+", "-", cleaned).strip("-")

    candidates = []
    # dict.fromkeys de-duplicates while keeping order: for single-word names
    # both slugs are identical and must not be probed twice.
    for s in dict.fromkeys((slug, slug_hyphen)):
        if s:
            candidates.append(f"https://www.{s}.com.au")
            candidates.append(f"https://{s}.com.au")

    return candidates
|
|
|
|
|
|
def discover_website(name: str, suburb: str | None, state: str | None,
                     phone: str | None = None) -> dict | None:
    """Attempt to discover a business website.

    Strategies, in order: Serper.dev search, DuckDuckGo search (only when
    Serper returned nothing), then guessed ".com.au" URLs. The top three
    search results are validated; a "confirmed" match is returned
    immediately, otherwise the first other valid match is used.

    Args:
        name: Business name; also used to validate candidate pages.
        suburb: Optional suburb appended to the search query.
        state: Optional state appended to the search query.
        phone: Currently unused; accepted for interface compatibility.

    Returns: {url, confidence, method, validation} or None.
    """
    # Nothing to search for or validate against without a name.
    if not name or not name.strip():
        return None

    # Build search query
    query = " ".join(part for part in (name, suburb, state) if part)

    # Strategy 1: Serper.dev (Google results, 2500 free)
    results = search_serper(query)

    # Strategy 2: DuckDuckGo fallback
    if not results:
        results = search_ddg(query)

    # Prefer a "confirmed" hit among the top results over an earlier,
    # weaker one; remember the first weaker hit as a fallback.
    fallback = None
    for url in results[:3]:
        validation = validate_url(url, name)
        if validation["valid"]:
            hit = {
                "url": url.rstrip("/"),
                "confidence": validation["confidence"],
                "method": "search",
                "validation": validation,
            }
            if validation["confidence"] == "confirmed":
                return hit
            if fallback is None:
                fallback = hit
        time.sleep(0.5)

    if fallback is not None:
        return fallback

    # Strategy 3: URL guessing
    for url in guess_urls(name):
        try:
            validation = validate_url(url, name)
        except Exception:
            # A guessed URL that cannot be validated is simply skipped.
            continue
        if validation["valid"]:
            return {
                "url": url.rstrip("/"),
                "confidence": validation["confidence"],
                "method": "guess",
                "validation": validation,
            }
        time.sleep(0.3)

    return None
|
|
|
|
|
|
def run(limit: int | None = None, state_filter: str | None = None):
    """Discover websites for all providers without one.

    Selects unverified funeral_brand rows whose website is NULL, runs
    discover_website() on each, and stores any URL found. Progress is
    committed every 20 providers; pending updates are also committed and the
    connection closed if the run fails or is interrupted.

    Args:
        limit: Max providers to process (for testing).
        state_filter: Only process providers in this state.
    """
    db = get_db()

    query = """
        SELECT id, title, business_suburb, business_state, phone
        FROM funeral_brand
        WHERE website IS NULL AND verified = 0
    """
    params = []

    if state_filter:
        query += " AND business_state = ?"
        params.append(state_filter)

    query += " ORDER BY id"

    if limit:
        # Bind the limit as a parameter instead of formatting it into the SQL.
        query += " LIMIT ?"
        params.append(limit)

    found = 0
    not_found = 0

    try:
        providers = db.execute(query, params).fetchall()
        print(f"Providers without websites: {len(providers)}")

        for count, prov in enumerate(providers, start=1):
            name = prov["title"]
            suburb = prov["business_suburb"]
            state = prov["business_state"]
            phone = prov["phone"]

            if count % 10 == 0 or count == 1:
                print(f" [{count}/{len(providers)}] Processing: {name}")

            if not name:
                # No title means nothing to search for; no request is made,
                # so the rate-limit pause is skipped too.
                not_found += 1
                continue

            result = discover_website(name, suburb, state, phone)

            if result:
                db.execute(
                    """UPDATE funeral_brand
                       SET website = ?, updated_at = datetime('now')
                       WHERE id = ?""",
                    (result["url"], prov["id"])
                )
                found += 1
                # Log the first 20 providers in full, then confirmed hits only.
                if count <= 20 or result["confidence"] == "confirmed":
                    print(f" FOUND ({result['confidence']}, {result['method']}): "
                          f"{result['url']}")
            else:
                not_found += 1

            if count % 20 == 0:
                db.commit()

            # Rate limit: ~2s between providers (DDG + validation requests)
            time.sleep(CRAWL_DELAY * 2)
    finally:
        # Keep updates made since the last periodic commit even when the loop
        # raised or was interrupted, and always release the connection.
        try:
            db.commit()
        finally:
            db.close()

    print(f"\nDone: {found} websites found, {not_found} not found")
    total = found + not_found
    if total > 0:
        print(f" Success rate: {found/total*100:.1f}%")
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point.
    #   --state=XX   only process providers in that state
    #   --limit=N    process at most N providers
    #   N            bare integer, same as --limit=N
    # Any other argument is ignored.
    import sys

    limit = None
    state = None

    for arg in sys.argv[1:]:
        if arg.startswith("--state="):
            # maxsplit=1 keeps values that themselves contain "=" intact.
            state = arg.split("=", 1)[1]
        elif arg.startswith("--limit="):
            value = arg.split("=", 1)[1]
            try:
                limit = int(value)
            except ValueError:
                # Exit with a readable message instead of a raw traceback.
                sys.exit(f"Invalid --limit value: {value!r} (expected an integer)")
        else:
            try:
                limit = int(arg)
            except ValueError:
                pass

    run(limit=limit, state_filter=state)
|