# Repository overview: Python crawlers for VIC Register, Funerals Australia and
# NFDA; n8n workflows for scheduled discovery and enrichment; SQLite schema and
# seeded dev database (1,463 providers); end-to-end process documentation in
# n8n/PROCESS.md.
# File size as listed by the repository viewer: 394 lines, 12 KiB, Python.
"""Website enrichment module.
|
|
|
|
For each provider with a website but no packages yet, crawls their site
|
|
to find pricing/packages pages and extracts structured data.
|
|
|
|
Two extraction modes:
|
|
1. Direct HTML parsing (for sites with clear pricing structure)
|
|
2. AI extraction via API call (for complex/varied layouts)
|
|
|
|
This module handles the crawling and page discovery.
|
|
AI extraction is delegated to the N8N workflow (Claude Haiku node).
|
|
"""
|
|
|
|
import json
import re
import time
import urllib.error
import urllib.parse
from html import unescape
from pathlib import Path

from base import fetch_url, get_db, CRAWL_DELAY


# Common URL paths for pricing/packages pages.
# find_pricing_page() appends each entry to the site root and probes them in
# list order, so every entry must start with "/".
PRICING_PATHS = [
    "/pricing",
    "/prices",
    "/our-prices",
    "/packages",
    "/funeral-packages",
    "/services",
    "/our-services",
    "/funeral-costs",
    "/funeral-services",
    "/service-options",
    "/price-list",
    "/transparency",
    "/funeral-pricing",
    "/costs",
    "/cremation",
    "/cremation-packages",
    "/burial",
    "/plan-a-funeral",
    "/arrange",
]

# Keywords that suggest a link leads to pricing.
# These are lowercase substrings (e.g. "pric" covers "price", "prices" and
# "pricing"); find_pricing_page() tests them against lowercased link text
# and link URLs.
PRICING_KEYWORDS = [
    "pric", "cost", "packag", "service", "plan",
    "cremation", "burial", "funeral",
    "transparency", "disclosure",
]


def find_pricing_page(base_url: str, homepage_html: str) -> str | None:
    """Try to find the pricing/packages page URL for a provider site.

    Strategy:
      1. Probe the common paths in ``PRICING_PATHS`` under ``base_url``.
      2. Scan the homepage's ``<a href="...">`` links and follow same-site
         links whose text or URL path contains one of ``PRICING_KEYWORDS``.

    Args:
        base_url: Site root, e.g. ``https://example.com`` (a trailing slash
            is ignored).
        homepage_html: HTML of the homepage, already fetched by the caller.

    Returns:
        The first candidate URL that could be fetched and passed the size /
        content heuristic, or ``None`` when no candidate qualified.
    """
    base = base_url.rstrip("/")
    # Compare hosts exactly (ignoring a leading "www.") instead of testing
    # whether the host appears anywhere inside the link.
    base_host = urllib.parse.urlparse(base).netloc.lower().removeprefix("www.")

    # Strategy 1: try common paths.
    for path in PRICING_PATHS:
        test_url = base + path
        try:
            html = fetch_url(test_url, timeout=10)
        except Exception:
            # Any fetch failure simply means "this path is not available".
            html = ""
        # Verify it is not a soft-404: require a reasonably sized body that
        # actually mentions prices.
        if html and len(html) > 1000 and ("$" in html or "price" in html.lower()):
            return test_url
        # Throttle after every probe, including failed ones, so a site that
        # answers 404 for most paths is not hit with back-to-back requests.
        time.sleep(0.3)

    # Strategy 2: parse homepage links.
    link_pattern = re.compile(
        r'<a[^>]*href="([^"]*)"[^>]*>(.*?)</a>',
        re.IGNORECASE | re.DOTALL,
    )
    tried: set[str] = set()

    for match in link_pattern.finditer(homepage_html or ""):
        href = match.group(1).strip()
        text = re.sub(r"<[^>]+>", "", match.group(2)).lower().strip()

        # Resolve the link against the site root and drop any #fragment.
        # urljoin copes with "/abs", "rel", "//host/path" and absolute URLs.
        full_url, _fragment = urllib.parse.urldefrag(
            urllib.parse.urljoin(base + "/", href)
        )
        parts = urllib.parse.urlparse(full_url)

        # Skip mailto:, tel:, javascript: and anything else we cannot fetch.
        if parts.scheme not in ("http", "https"):
            continue
        # Only follow links to the same site.
        if parts.netloc.lower().removeprefix("www.") != base_host:
            continue
        # Skip links back to the homepage itself and links already tried.
        if full_url.rstrip("/") == base or full_url in tried:
            continue

        # Match keywords against the link text and the URL path/query only;
        # matching the host would make every link on e.g.
        # "smithfunerals.example" look like a pricing link.
        target = f"{parts.path}?{parts.query}".lower()
        if not any(kw in text or kw in target for kw in PRICING_KEYWORDS):
            continue

        tried.add(full_url)
        try:
            html = fetch_url(full_url, timeout=10)
        except Exception:
            html = ""
        if html and len(html) > 500:
            return full_url
        time.sleep(0.3)

    return None


def extract_description(html: str) -> str | None:
|
|
"""Extract a business description from homepage HTML."""
|
|
# Try meta description first
|
|
meta_match = re.search(
|
|
r'<meta\s+(?:name="description"\s+content="([^"]+)"|content="([^"]+)"\s+name="description")',
|
|
html, re.IGNORECASE
|
|
)
|
|
if meta_match:
|
|
desc = meta_match.group(1) or meta_match.group(2)
|
|
if desc and len(desc) > 20:
|
|
return desc.strip()
|
|
|
|
# Try OG description
|
|
og_match = re.search(
|
|
r'<meta\s+property="og:description"\s+content="([^"]+)"',
|
|
html, re.IGNORECASE
|
|
)
|
|
if og_match and len(og_match.group(1)) > 20:
|
|
return og_match.group(1).strip()
|
|
|
|
return None
|
|
|
|
|
|
def extract_contact_info(html: str) -> dict:
    """Extract contact details from HTML.

    Args:
        html: Raw page HTML. ``None`` or an empty string is accepted.

    Returns:
        A dict containing any of the keys ``"phone"`` (first ``tel:`` link),
        ``"email"`` (first ``mailto:`` link, without its query string) and
        ``"address"`` (first JSON-LD ``streetAddress`` value). Keys whose
        value could not be found, or is empty, are omitted.
    """
    info: dict[str, str] = {}
    if not html:
        return info

    # Phone: first tel: link, with either quote style and any attribute case.
    phone_match = re.search(
        r"""href\s*=\s*["']tel:([^"']+)["']""", html, re.IGNORECASE
    )
    if phone_match:
        # tel: links are sometimes percent-encoded ("%20" for spaces).
        phone = urllib.parse.unquote(phone_match.group(1)).strip()
        if phone:
            info["phone"] = phone

    # Email: capture stops at "?" so "?subject=..." is dropped. The closing
    # quote is deliberately not required after the capture; requiring it
    # makes links that carry a query string fail to match.
    email_match = re.search(
        r"""href\s*=\s*["']mailto:([^"'?]+)""", html, re.IGNORECASE
    )
    if email_match:
        email = urllib.parse.unquote(email_match.group(1)).strip()
        if email:
            info["email"] = email

    # Address from JSON-LD; the pattern allows backslash escapes in the value.
    addr_match = re.search(r'"streetAddress"\s*:\s*"((?:[^"\\]|\\.)*)"', html)
    if addr_match:
        raw_address = addr_match.group(1)
        try:
            # Decode JSON string escapes such as \u0027 or \/.
            address = json.loads(f'"{raw_address}"')
        except ValueError:
            # Not valid JSON string content; keep the raw text.
            address = raw_address
        address = address.strip()
        if address:
            info["address"] = address

    return info


def check_has_pricing(html: str, min_amount: float = 100) -> bool:
    """Quick check whether a page contains pricing information.

    Scans for dollar amounts such as ``$2,450``, ``$2450.00`` or ``$ 2,450``
    and reports whether at least one of them is large enough to count.

    Args:
        html: Page HTML or text. ``None`` or an empty string is accepted.
        min_amount: Smallest dollar amount that counts as pricing. Smaller
            amounts are ignored as unlikely to be funeral pricing.

    Returns:
        ``True`` if at least one amount of ``min_amount`` or more is present.
    """
    if not html:
        return False

    # "$", an optional single whitespace character, digits with optional
    # thousands separators, optional cents.
    for match in re.finditer(r"\$\s?(\d[\d,]*(?:\.\d{2})?)", html):
        amount = float(match.group(1).replace(",", ""))
        if amount >= min_amount:
            # One significant price is enough; stop scanning.
            return True

    return False


def prepare_for_ai_extraction(html: str, max_chars: int = 8000) -> str:
    """Clean HTML for AI extraction: remove noise, keep readable content.

    Removes ``<script>``/``<style>`` blocks, HTML comments and
    ``<nav>``/``<header>``/``<footer>`` sections, strips the remaining tags,
    decodes HTML entities and collapses whitespace.

    Args:
        html: Raw page HTML. ``None`` or an empty string is accepted.
        max_chars: Maximum length of the returned text before the ``"..."``
            truncation marker is appended.

    Returns:
        Plain text, truncated to ``max_chars`` characters plus ``"..."``
        when the cleaned text is longer than that.
    """
    if not html:
        return ""

    cleaned = html

    # Remove script and style blocks together with their contents.
    for tag in ("script", "style"):
        cleaned = re.sub(
            rf"<{tag}\b[^>]*>.*?</{tag}\s*>", "", cleaned,
            flags=re.DOTALL | re.IGNORECASE,
        )

    # Remove HTML comments.
    cleaned = re.sub(r"<!--.*?-->", "", cleaned, flags=re.DOTALL)

    # Remove page chrome. The \b stops "<nav" from also matching tags such
    # as "<navbar>".
    for tag in ("nav", "header", "footer"):
        cleaned = re.sub(
            rf"<{tag}\b[^>]*>.*?</{tag}\s*>", "", cleaned,
            flags=re.DOTALL | re.IGNORECASE,
        )

    # Strip remaining tags but keep their text.
    text = re.sub(r"<[^>]+>", " ", cleaned)

    # Decode entities only after the tags are gone, so an escaped "&lt;p&gt;"
    # stays literal text instead of being stripped as markup.
    text = unescape(text)

    # Collapse whitespace (this also normalises decoded non-breaking spaces).
    text = re.sub(r"\s+", " ", text).strip()

    # Truncate so the text fits comfortably in the model context.
    if len(text) > max_chars:
        text = text[:max_chars] + "..."

    return text


def enrich_provider(provider_id: int, website: str, db) -> dict:
    """Crawl a provider's website and extract enrichment data.

    Args:
        provider_id: Provider id. Not used by this function; kept so the
            signature stays stable for callers.
        website: URL of the provider's homepage.
        db: Database handle. Not used by this function; kept so the
            signature stays stable for callers.

    Returns:
        A dict describing what was found, with the keys:
        ``homepage_fetched`` (bool), ``description`` (str or None),
        ``contact_info`` (dict), ``pricing_page_url`` (str or None),
        ``has_pricing`` (bool), ``pricing_page_text`` (cleaned text for AI
        extraction, or None) and ``pdf_links`` (list of absolute URLs).
        ``error`` is added when the homepage could not be fetched, and
        ``pricing_error`` when processing the pricing page failed.
    """
    result = {
        "homepage_fetched": False,
        "description": None,
        "contact_info": {},
        "pricing_page_url": None,
        "has_pricing": False,
        "pricing_page_text": None,  # cleaned text for AI extraction
        "pdf_links": [],
    }

    # Step 1: fetch the homepage; nothing else is possible without it.
    try:
        homepage = fetch_url(website, timeout=15)
    except Exception as exc:
        result["error"] = str(exc)[:200]
        return result
    result["homepage_fetched"] = True

    # Step 2: extract description and contact info.
    result["description"] = extract_description(homepage)
    result["contact_info"] = extract_contact_info(homepage)

    # Step 3: find the pricing page.
    time.sleep(CRAWL_DELAY)
    pricing_url = find_pricing_page(website, homepage)

    if not pricing_url:
        # No dedicated page found; check the homepage itself for pricing.
        if check_has_pricing(homepage):
            result["has_pricing"] = True
            result["pricing_page_url"] = website
            result["pricing_page_text"] = prepare_for_ai_extraction(homepage)
        return result

    result["pricing_page_url"] = pricing_url
    try:
        pricing_html = fetch_url(pricing_url, timeout=15)
        result["has_pricing"] = check_has_pricing(pricing_html)
        result["pricing_page_text"] = prepare_for_ai_extraction(pricing_html)

        # Collect PDF links (price lists are often published as PDFs).
        for href in re.findall(
            r'href="([^"]*\.pdf[^"]*)"', pricing_html, re.IGNORECASE
        ):
            # Resolve against the page the link was found on, so relative
            # and protocol-relative hrefs end up as correct absolute URLs.
            pdf_url = urllib.parse.urljoin(pricing_url, href.strip())
            if pdf_url not in result["pdf_links"]:
                result["pdf_links"].append(pdf_url)
    except Exception as exc:
        # Best effort: keep what was gathered so far, but record the failure
        # instead of hiding it.
        result["pricing_error"] = str(exc)[:200]

    return result


def run(limit: int | None = None, state_filter: str | None = None):
    """Enrich all providers that have a website but no packages.

    Selects unverified ``funeral_brand`` rows that have a website and no
    ``package`` rows, crawls each site with ``enrich_provider``, fills in
    missing description/email/phone, sets ``enrichment_status`` and stores
    the cleaned pricing text in ``source_record`` for later AI extraction.

    Args:
        limit: Maximum number of providers to process. ``None`` or ``0``
            means no limit.
        state_filter: If given, only providers whose ``business_state``
            equals this value are processed.
    """
    db = get_db()
    try:
        query = """
            SELECT fb.id, fb.title, fb.website, fb.business_state
            FROM funeral_brand fb
            LEFT JOIN package p ON p.brand_id = fb.id
            WHERE fb.website IS NOT NULL
              AND fb.verified = 0
              AND p.id IS NULL
        """
        params: list = []

        if state_filter:
            query += " AND fb.business_state = ?"
            params.append(state_filter)

        query += " ORDER BY fb.id"

        if limit:
            # Bind the limit as a parameter rather than formatting it into
            # the SQL text.
            query += " LIMIT ?"
            params.append(int(limit))

        providers = db.execute(query, params).fetchall()
        print(f"Providers to enrich: {len(providers)}")

        enriched = 0
        pricing_found = 0
        failed = 0

        for i, prov in enumerate(providers):
            # Progress output for the first provider and every fifth one.
            if (i + 1) % 5 == 0 or i == 0:
                print(f" [{i+1}/{len(providers)}] {prov['title']}")

            result = enrich_provider(prov["id"], prov["website"], db)

            if result["homepage_fetched"]:
                enriched += 1

                # Only fill in fields that are currently empty on the brand.
                brand = db.execute(
                    "SELECT description, email, phone "
                    "FROM funeral_brand WHERE id = ?",
                    (prov["id"],),
                ).fetchone()
                contact = result["contact_info"]

                updates = {}
                if result["description"] and not brand["description"]:
                    updates["description"] = result["description"]
                if contact.get("email") and not brand["email"]:
                    updates["email"] = contact["email"]
                if contact.get("phone") and not brand["phone"]:
                    updates["phone"] = contact["phone"]

                if result["has_pricing"]:
                    pricing_found += 1
                # Both outcomes are recorded as 'partial': with pricing the
                # text still needs AI extraction, without pricing only the
                # homepage was enriched.
                updates["enrichment_status"] = "partial"

                # Column names come from the fixed keys above, never from
                # crawled data, so building the SET clause this way is safe.
                set_parts = [f"{column} = ?" for column in updates]
                values = list(updates.values()) + [prov["id"]]
                db.execute(
                    f"UPDATE funeral_brand SET {', '.join(set_parts)}, "
                    f"updated_at = datetime('now') WHERE id = ?",
                    values,
                )

                # Store pricing page text for later AI extraction.
                if result["pricing_page_text"]:
                    db.execute(
                        """INSERT OR REPLACE INTO source_record
                        (source_name, source_id, source_url, raw_data,
                         matched_brand_id, match_type)
                        VALUES ('website_crawl', ?, ?, ?, ?, 'enrichment')""",
                        (
                            f"brand_{prov['id']}",
                            result["pricing_page_url"],
                            json.dumps({
                                "pricing_text": result["pricing_page_text"],
                                "pdf_links": result["pdf_links"],
                                "has_pricing": result["has_pricing"],
                            }),
                            prov["id"],
                        ),
                    )
            else:
                failed += 1
                db.execute(
                    """UPDATE funeral_brand
                    SET enrichment_status = 'failed', updated_at = datetime('now')
                    WHERE id = ?""",
                    (prov["id"],),
                )

            # Commit every 10 providers, whether or not this one succeeded,
            # so a run of failures cannot postpone the checkpoint.
            if (i + 1) % 10 == 0:
                db.commit()

            # Pause between providers only after a site was actually crawled.
            if result["homepage_fetched"]:
                time.sleep(CRAWL_DELAY)

        db.commit()
        print(f"\nDone: {enriched} enriched, {pricing_found} with pricing, {failed} failed")
    finally:
        # Close the connection even if a crawl or query raised.
        db.close()


if __name__ == "__main__":
    # Command-line usage:
    #   --state=VALUE   only process providers in that state
    #   --limit=N       process at most N providers
    #   N               bare integer, same as --limit=N
    # Any other argument is ignored.
    import sys

    limit = None
    state = None

    for arg in sys.argv[1:]:
        if arg.startswith("--state="):
            # partition keeps everything after the first "=", so values
            # that themselves contain "=" are not truncated.
            state = arg.partition("=")[2]
        elif arg.startswith("--limit="):
            raw_limit = arg.partition("=")[2]
            try:
                limit = int(raw_limit)
            except ValueError:
                sys.exit(f"Invalid --limit value: {raw_limit!r} (expected an integer)")
        else:
            try:
                limit = int(arg)
            except ValueError:
                pass

    run(limit=limit, state_filter=state)