Initial commit: funeral provider discovery pipeline
- Python crawlers for VIC Register, Funerals Australia, NFDA
- n8n workflows for scheduled discovery and enrichment
- SQLite schema and seeded dev database (1,463 providers)
- End-to-end process documentation in n8n/PROCESS.md
This commit is contained in:
393
crawlers/enrich_websites.py
Normal file
393
crawlers/enrich_websites.py
Normal file
@@ -0,0 +1,393 @@
|
||||
"""Website enrichment module.
|
||||
|
||||
For each provider with a website but no packages yet, crawls their site
|
||||
to find pricing/packages pages and extracts structured data.
|
||||
|
||||
Two extraction modes:
|
||||
1. Direct HTML parsing (for sites with clear pricing structure)
|
||||
2. AI extraction via API call (for complex/varied layouts)
|
||||
|
||||
This module handles the crawling and page discovery.
|
||||
AI extraction is delegated to the N8N workflow (Claude Haiku node).
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
import urllib.parse
|
||||
import urllib.error
|
||||
from pathlib import Path
|
||||
|
||||
from base import fetch_url, get_db, CRAWL_DELAY
|
||||
|
||||
# Site-relative paths where providers commonly publish prices or packages.
# The list is ordered: find_pricing_page() probes entries first to last and
# stops at the first one that looks like a pricing page.
PRICING_PATHS = [
    "/pricing", "/prices", "/our-prices",
    "/packages", "/funeral-packages",
    "/services", "/our-services",
    "/funeral-costs", "/funeral-services", "/service-options",
    "/price-list", "/transparency", "/funeral-pricing", "/costs",
    "/cremation", "/cremation-packages", "/burial",
    "/plan-a-funeral", "/arrange",
]
|
||||
|
||||
# Lower-case substrings that mark a homepage link as a pricing candidate.
# Stems such as "pric" and "packag" deliberately cover several word forms
# (price/pricing, package/packages).
PRICING_KEYWORDS = [
    "pric", "cost", "packag", "service", "plan",   # commercial terms
    "cremation", "burial", "funeral",              # service types
    "transparency", "disclosure",                  # price-disclosure pages
]
|
||||
|
||||
|
||||
# Anchor tags carrying a quoted href.
# Groups: 1 = double-quoted href, 2 = single-quoted href, 3 = inner HTML.
_ANCHOR_RE = re.compile(
    r"""<a\s(?:[^>]*?\s)?href\s*=\s*(?:"([^"]*)"|'([^']*)')[^>]*>(.*?)</a>""",
    re.IGNORECASE | re.DOTALL,
)


def _host_key(netloc: str) -> str:
    """Normalise a network location for same-site comparison.

    Lower-cases the value and drops a leading ``www.`` so that
    ``www.example.com`` and ``example.com`` compare equal.
    """
    return netloc.lower().removeprefix("www.")


def find_pricing_page(base_url: str, homepage_html: str) -> str | None:
    """Try to find the pricing/packages page URL for a provider site.

    Strategy:
    1. Probe each entry of ``PRICING_PATHS`` appended to ``base_url`` and
       accept the first response that is longer than 1000 characters and
       mentions ``$`` or "price".
    2. Otherwise scan the homepage's ``<a>`` tags, follow same-site links
       whose text or path/query contains one of ``PRICING_KEYWORDS``, and
       accept the first response longer than 500 characters.

    Args:
        base_url: Site root URL; a trailing slash is ignored.
        homepage_html: HTML of the already-fetched homepage.

    Returns:
        The URL of the first page accepted by either strategy, or ``None``.

    Fetch failures are treated as "not this candidate" (best-effort crawl).
    A 0.3s pause follows every unsuccessful candidate, including ones whose
    fetch raised, so a site is never hit with a burst of back-to-back
    requests.
    """
    # Local import keeps this block self-contained without a file-level change.
    from html import unescape

    base = base_url.rstrip("/")

    # Strategy 1: try common paths.
    for path in PRICING_PATHS:
        test_url = base + path
        try:
            body = fetch_url(test_url, timeout=10)
            # Guard against soft-404s: require some pricing-like content.
            if len(body) > 1000 and ("$" in body or "price" in body.lower()):
                return test_url
        except Exception:
            # Missing page / network error: fall through to the delay.
            pass
        time.sleep(0.3)

    # Strategy 2: parse homepage links.
    site_host = _host_key(urllib.parse.urlparse(base).netloc)
    tried: set[str] = set()

    for match in _ANCHOR_RE.finditer(homepage_html):
        href = unescape(match.group(1) or match.group(2) or "").strip()
        text = re.sub(r"<[^>]+>", "", match.group(3)).lower().strip()

        # Resolve relative/absolute/protocol-relative hrefs properly and
        # drop any #fragment so the same page is not fetched twice.
        try:
            joined = urllib.parse.urljoin(base + "/", href)
            full_url = urllib.parse.urldefrag(joined).url
            parts = urllib.parse.urlparse(full_url)
        except ValueError:
            continue  # malformed URL (e.g. broken IPv6 literal)

        # Skip mailto:, tel:, javascript: and other non-web schemes.
        if parts.scheme not in ("http", "https"):
            continue
        # Only follow links on the provider's own host (exact match, not a
        # substring test, so off-site URLs mentioning the host are rejected).
        if _host_key(parts.netloc) != site_host:
            continue
        # A link back to the homepage is never the pricing page.
        if parts.path in ("", "/") and not parts.query:
            continue

        # Match keywords against the link text and the path/query only;
        # the host name is excluded because provider domains routinely
        # contain words like "funeral".
        target = f"{parts.path}?{parts.query}".lower()
        if not any(kw in text or kw in target for kw in PRICING_KEYWORDS):
            continue

        if full_url in tried:
            continue
        tried.add(full_url)

        try:
            body = fetch_url(full_url, timeout=10)
            if len(body) > 500:
                return full_url
        except Exception:
            pass
        time.sleep(0.3)

    return None
|
||||
|
||||
|
||||
def extract_description(html: str) -> str | None:
|
||||
"""Extract a business description from homepage HTML."""
|
||||
# Try meta description first
|
||||
meta_match = re.search(
|
||||
r'<meta\s+(?:name="description"\s+content="([^"]+)"|content="([^"]+)"\s+name="description")',
|
||||
html, re.IGNORECASE
|
||||
)
|
||||
if meta_match:
|
||||
desc = meta_match.group(1) or meta_match.group(2)
|
||||
if desc and len(desc) > 20:
|
||||
return desc.strip()
|
||||
|
||||
# Try OG description
|
||||
og_match = re.search(
|
||||
r'<meta\s+property="og:description"\s+content="([^"]+)"',
|
||||
html, re.IGNORECASE
|
||||
)
|
||||
if og_match and len(og_match.group(1)) > 20:
|
||||
return og_match.group(1).strip()
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def _href_targets(html_text: str, scheme: str):
    """Yield the raw text after ``scheme:`` for each quoted href using it."""
    pattern = rf"""href\s*=\s*(?:"{scheme}:([^"]+)"|'{scheme}:([^']+)')"""
    for match in re.finditer(pattern, html_text, re.IGNORECASE):
        yield match.group(1) or match.group(2)


def extract_contact_info(html: str) -> dict:
    """Extract contact details from HTML.

    Args:
        html: Raw HTML of the page.

    Returns:
        A dict containing any of these keys that could be found:
        ``"phone"`` from the first non-empty ``tel:`` link, ``"email"`` from
        the first non-empty ``mailto:`` link (query string such as
        ``?subject=...`` removed), and ``"address"`` from the first
        ``"streetAddress"`` JSON string in the page. Keys with no value are
        omitted.
    """
    info = {}

    # Phone: percent-decoded value of the first usable tel: link.
    for raw in _href_targets(html, "tel"):
        phone = urllib.parse.unquote(raw).strip()
        if phone:
            info["phone"] = phone
            break

    # Email: strip any ?subject=/?body= part before decoding, so links with
    # a query string still yield the bare address.
    for raw in _href_targets(html, "mailto"):
        email = urllib.parse.unquote(raw.split("?", 1)[0]).strip()
        if email:
            info["email"] = email
            break

    # Address from JSON-LD. The value is a JSON string literal, so allow
    # backslash escapes inside it and decode them (\/ , \" , \uXXXX).
    addr_match = re.search(r'"streetAddress"\s*:\s*"((?:[^"\\]|\\.)*)"', html)
    if addr_match:
        raw_address = addr_match.group(1)
        try:
            address = json.loads(f'"{raw_address}"')
        except ValueError:
            # Not valid JSON (e.g. raw control characters): keep as found.
            address = raw_address
        address = address.strip()
        if address:
            info["address"] = address

    return info
|
||||
|
||||
|
||||
def check_has_pricing(html: str) -> bool:
    """Quick check whether a page contains pricing information.

    HTML entities are decoded first so that dollar signs written as
    ``&#36;`` or ``&dollar;`` are recognised. The page is then scanned for
    ``$`` immediately followed by digits/commas (optionally with two decimal
    places).

    Args:
        html: Raw HTML (or text) of the page.

    Returns:
        ``True`` if at least one dollar amount of 100 or more is present;
        smaller amounts are ignored as unlikely to be funeral pricing.
    """
    # Imported locally because the ``html`` parameter shadows the module name.
    from html import unescape

    for token in re.findall(r"\$[\d,]+(?:\.\d{2})?", unescape(html)):
        digits = token.replace("$", "").replace(",", "").strip()
        if not digits:
            # e.g. "$," matched the pattern but holds no number.
            continue
        try:
            amount = float(digits)
        except ValueError:
            continue
        if amount >= 100:
            return True

    return False
|
||||
|
||||
|
||||
def prepare_for_ai_extraction(html: str) -> str:
    """Clean HTML for AI extraction: remove noise, keep readable content.

    Steps, in order: drop ``<script>`` and ``<style>`` elements, HTML
    comments, and ``<nav>``/``<header>``/``<footer>`` elements; replace the
    remaining tags with spaces; decode HTML entities (``&nbsp;``, ``&amp;``,
    ``&#36;`` ...); collapse whitespace; truncate.

    Args:
        html: Raw HTML of the page.

    Returns:
        Plain text of at most 8000 characters, with ``"..."`` appended when
        the text was truncated.
    """
    # Imported locally because the ``html`` parameter shadows the module name.
    from html import unescape

    element_flags = re.DOTALL | re.IGNORECASE
    cleaned = html

    # Non-content elements first, so their bodies never reach the text.
    # ``\b`` stops "<nav" from also matching tags such as "<navbar-x>".
    for tag in ("script", "style"):
        cleaned = re.sub(rf"<{tag}\b[^>]*>.*?</{tag}\s*>", "", cleaned,
                         flags=element_flags)

    # HTML comments.
    cleaned = re.sub(r"<!--.*?-->", "", cleaned, flags=re.DOTALL)

    # Site chrome that repeats on every page.
    for tag in ("nav", "header", "footer"):
        cleaned = re.sub(rf"<{tag}\b[^>]*>.*?</{tag}\s*>", "", cleaned,
                         flags=element_flags)

    # Strip remaining tags but keep their text.
    text = re.sub(r"<[^>]+>", " ", cleaned)
    # Decode entities only after tag stripping, so an encoded "&lt;b&gt;"
    # stays literal text instead of being removed as a tag.
    text = unescape(text)
    # Collapse whitespace (this also folds non-breaking spaces).
    text = re.sub(r"\s+", " ", text).strip()

    # Truncate to ~8000 chars (fits well within Haiku context).
    if len(text) > 8000:
        text = text[:8000] + "..."

    return text
|
||||
|
||||
|
||||
def enrich_provider(provider_id: int, website: str, db) -> dict:
    """Crawl a provider's website and extract enrichment data.

    Args:
        provider_id: Provider/brand id. Not used by this function; kept for
            interface compatibility.
        website: Homepage URL of the provider.
        db: Database handle. Not used by this function; kept for interface
            compatibility.

    Returns:
        A dict with the keys ``homepage_fetched``, ``description``,
        ``contact_info``, ``pricing_page_url``, ``has_pricing``,
        ``pricing_page_text`` and ``pdf_links``. ``error`` is added when the
        homepage could not be fetched, and ``pricing_error`` when a pricing
        page was located but could not be fetched for extraction.
    """
    result = {
        "homepage_fetched": False,
        "description": None,
        "contact_info": {},
        "pricing_page_url": None,
        "has_pricing": False,
        "pricing_page_text": None,  # cleaned text for AI extraction
        "pdf_links": [],
    }

    # Step 1: fetch the homepage; nothing else is possible without it.
    try:
        homepage = fetch_url(website, timeout=15)
    except Exception as exc:
        result["error"] = str(exc)[:200]
        return result
    result["homepage_fetched"] = True

    # Step 2: description and contact details come from the homepage.
    result["description"] = extract_description(homepage)
    result["contact_info"] = extract_contact_info(homepage)

    # Step 3: locate the pricing page.
    time.sleep(CRAWL_DELAY)
    pricing_url = find_pricing_page(website, homepage)

    if not pricing_url:
        # No dedicated page found: the homepage itself may list prices.
        if check_has_pricing(homepage):
            result["has_pricing"] = True
            result["pricing_page_url"] = website
            result["pricing_page_text"] = prepare_for_ai_extraction(homepage)
        return result

    result["pricing_page_url"] = pricing_url
    try:
        pricing_html = fetch_url(pricing_url, timeout=15)
    except Exception as exc:
        # Keep the partial result, but record why pricing text is missing
        # instead of discarding the failure silently.
        result["pricing_error"] = str(exc)[:200]
        return result

    result["has_pricing"] = check_has_pricing(pricing_html)
    result["pricing_page_text"] = prepare_for_ai_extraction(pricing_html)

    # Collect PDF links (price lists are often published as PDFs). Relative
    # hrefs are resolved against the pricing page they appeared on.
    for pdf_href in re.findall(
        r'href="([^"]*\.pdf[^"]*)"', pricing_html, re.IGNORECASE
    ):
        try:
            pdf_url = urllib.parse.urljoin(pricing_url, pdf_href.strip())
        except ValueError:
            continue  # malformed URL
        if pdf_url not in result["pdf_links"]:
            result["pdf_links"].append(pdf_url)

    return result
|
||||
|
||||
|
||||
def _store_enrichment(db, brand_id: int, result: dict) -> None:
    """Write one provider's successful crawl result to the database.

    Fills ``description``, ``email`` and ``phone`` on ``funeral_brand`` only
    where the stored value is empty, sets ``enrichment_status`` to
    ``'partial'``, and, when pricing text was captured, upserts a
    ``source_record`` row holding that text for later AI extraction.

    NOTE(review): rows are indexed by column name, so ``db`` is assumed to
    return mapping-style rows (e.g. ``sqlite3.Row``); confirm against
    ``get_db``.
    """
    brand = db.execute(
        "SELECT description, email, phone FROM funeral_brand WHERE id = ?",
        (brand_id,),
    ).fetchone()

    # Status is 'partial' whether or not pricing was found; the has_pricing
    # flag stored in source_record distinguishes the two cases.
    updates = {"enrichment_status": "partial"}
    if result["description"] and not brand["description"]:
        updates["description"] = result["description"]
    contact = result["contact_info"]
    for field in ("email", "phone"):
        if contact.get(field) and not brand[field]:
            updates[field] = contact[field]

    # Column names come from the fixed keys above, never from crawled data;
    # all values are bound as parameters.
    set_clause = ", ".join(f"{column} = ?" for column in updates)
    db.execute(
        f"UPDATE funeral_brand SET {set_clause}, "
        f"updated_at = datetime('now') WHERE id = ?",
        [*updates.values(), brand_id],
    )

    # Store pricing page text for later AI extraction.
    if result["pricing_page_text"]:
        db.execute(
            """INSERT OR REPLACE INTO source_record
               (source_name, source_id, source_url, raw_data,
                matched_brand_id, match_type)
               VALUES ('website_crawl', ?, ?, ?, ?, 'enrichment')""",
            (
                f"brand_{brand_id}",
                result["pricing_page_url"],
                json.dumps({
                    "pricing_text": result["pricing_page_text"],
                    "pdf_links": result["pdf_links"],
                    "has_pricing": result["has_pricing"],
                }),
                brand_id,
            ),
        )


def run(limit: int | None = None, state_filter: str | None = None):
    """Enrich all unverified providers that have a website but no packages.

    Selects ``funeral_brand`` rows with a non-NULL website, ``verified = 0``
    and no related ``package`` row, crawls each with ``enrich_provider`` and
    stores the outcome. Progress and a final summary are printed.

    Args:
        limit: Maximum number of providers to process. ``None`` or ``0``
            means no limit.
        state_filter: If given (non-empty), only providers whose
            ``business_state`` equals this value are processed.

    Work is committed every 10 providers and once at the end; the database
    connection is closed even if processing raises.
    """
    db = get_db()
    try:
        query = """
            SELECT fb.id, fb.title, fb.website, fb.business_state
            FROM funeral_brand fb
            LEFT JOIN package p ON p.brand_id = fb.id
            WHERE fb.website IS NOT NULL
              AND fb.verified = 0
              AND p.id IS NULL
        """
        params = []

        if state_filter:
            query += " AND fb.business_state = ?"
            params.append(state_filter)

        query += " ORDER BY fb.id"

        if limit:
            # Bound as a parameter rather than formatted into the SQL text.
            query += " LIMIT ?"
            params.append(int(limit))

        providers = db.execute(query, params).fetchall()
        total = len(providers)
        print(f"Providers to enrich: {total}")

        enriched = 0
        pricing_found = 0
        failed = 0

        for i, prov in enumerate(providers):
            if (i + 1) % 5 == 0 or i == 0:
                print(f"  [{i+1}/{total}] {prov['title']}")

            result = enrich_provider(prov["id"], prov["website"], db)

            if result["homepage_fetched"]:
                enriched += 1
                if result["has_pricing"]:
                    pricing_found += 1
                _store_enrichment(db, prov["id"], result)
            else:
                failed += 1
                db.execute(
                    """UPDATE funeral_brand
                       SET enrichment_status = 'failed', updated_at = datetime('now')
                       WHERE id = ?""",
                    (prov["id"],)
                )

            # Checkpoint regardless of this provider's outcome, so a failure
            # on the 10th row does not postpone the commit.
            if (i + 1) % 10 == 0:
                db.commit()

            # Pause only after a successful crawl; a failed homepage fetch
            # moves straight on to the next provider.
            if result["homepage_fetched"]:
                time.sleep(CRAWL_DELAY)

        db.commit()
        print(f"\nDone: {enriched} enriched, {pricing_found} with pricing, {failed} failed")
    finally:
        db.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
limit = None
|
||||
state = None
|
||||
|
||||
for arg in sys.argv[1:]:
|
||||
if arg.startswith("--state="):
|
||||
state = arg.split("=")[1]
|
||||
elif arg.startswith("--limit="):
|
||||
limit = int(arg.split("=")[1])
|
||||
else:
|
||||
try:
|
||||
limit = int(arg)
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
run(limit=limit, state_filter=state)
|
||||
Reference in New Issue
Block a user