Initial commit: funeral provider discovery pipeline

Python crawlers for VIC Register, Funerals Australia, NFDA
n8n workflows for scheduled discovery and enrichment
SQLite schema and seeded dev database (1,463 providers)
End-to-end process documentation in n8n/PROCESS.md
This commit is contained in:
Richie
2026-04-24 10:27:08 +10:00
commit cc91427789
30 changed files with 4706 additions and 0 deletions

164
crawlers/base.py Normal file
View File

@@ -0,0 +1,164 @@
"""Base crawler module with shared utilities."""
import gzip
import io
import json
import time
import sqlite3
import urllib.request
import urllib.parse
import urllib.error
from datetime import datetime, timezone
from pathlib import Path
# SQLite database file: "database/providers.db" under the directory that is
# two levels above this module file.
DB_PATH = Path(__file__).parent.parent.joinpath("database", "providers.db")

# Courtesy pause, in seconds, to leave between consecutive requests.
CRAWL_DELAY = 1.0

# Browser-like User-Agent string; fetch_url sends it unless overridden.
USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
)
def fetch_url(url: str, method: str = "GET", data: dict | None = None,
              headers: dict | None = None, timeout: int = 30) -> str:
    """Fetch a URL and return the response body as text.

    Args:
        url: URL to request.
        method: HTTP method, matched case-insensitively.
        data: Optional parameters. They are form-encoded into the request
            body for POST, appended to the query string for GET, and
            ignored for any other method.
        headers: Extra request headers; they override the default
            User-Agent when they set the same key.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The response body, gunzipped when necessary and decoded to ``str``.

    Raises:
        Whatever ``urllib.request.urlopen`` raises (for example
        ``urllib.error.URLError``); nothing is caught here.
    """
    # Normalise so that "post"/"get" are treated like "POST"/"GET" instead of
    # silently dropping ``data``.
    method = method.upper()

    hdrs = {"User-Agent": USER_AGENT}
    if headers:
        hdrs.update(headers)

    body = None
    if data and method == "POST":
        body = urllib.parse.urlencode(data, doseq=True).encode("utf-8")
        hdrs.setdefault("Content-Type", "application/x-www-form-urlencoded")
    elif data and method == "GET":
        query = urllib.parse.urlencode(data, doseq=True)
        # Extend an existing query string rather than adding a second "?".
        if "?" not in url:
            joiner = "?"
        elif url.endswith(("?", "&")):
            joiner = ""
        else:
            joiner = "&"
        url = f"{url}{joiner}{query}"

    req = urllib.request.Request(url, data=body, headers=hdrs, method=method)
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        raw = resp.read()
        encoding = (resp.headers.get("Content-Encoding") or "").strip().lower()
        charset = resp.headers.get_content_charset() or "utf-8"

    # Handle gzip-compressed responses: trust the header, but also sniff the
    # gzip magic bytes for servers that compress without declaring it.
    if encoding == "gzip" or raw[:2] == b"\x1f\x8b":
        raw = gzip.decompress(raw)

    try:
        return raw.decode(charset)
    except (LookupError, UnicodeDecodeError):
        # The declared charset is unknown or wrong; degrade to UTF-8 with
        # replacement characters rather than aborting the whole crawl.
        return raw.decode("utf-8", errors="replace")
def fetch_json(url: str, method: str = "GET", data: dict | None = None,
               headers: dict | None = None, timeout: int = 30) -> dict:
    """Fetch a URL and parse the response body as JSON.

    Args:
        url: URL to request.
        method: HTTP method forwarded to ``fetch_url``.
        data: Optional request parameters forwarded to ``fetch_url``.
        headers: Optional extra request headers forwarded to ``fetch_url``.
        timeout: Timeout in seconds forwarded to ``fetch_url``; the default
            matches ``fetch_url``'s own default.

    Returns:
        The value produced by ``json.loads`` on the response text.

    Raises:
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    text = fetch_url(url, method=method, data=data, headers=headers,
                     timeout=timeout)
    return json.loads(text)
def get_db() -> sqlite3.Connection:
    """Open the SQLite database at DB_PATH and return the connection.

    The connection yields ``sqlite3.Row`` rows and has WAL journaling and
    foreign-key enforcement switched on via PRAGMA statements.
    """
    connection = sqlite3.connect(str(DB_PATH))
    connection.row_factory = sqlite3.Row
    for pragma in ("PRAGMA journal_mode=WAL", "PRAGMA foreign_keys=ON"):
        connection.execute(pragma)
    return connection
def start_crawl_log(db: sqlite3.Connection, source_name: str) -> int:
    """Insert a source_log row for ``source_name`` and return its row ID.

    The insert is committed before the ID is returned.
    """
    insert_sql = "INSERT INTO source_log (source_name) VALUES (?)"
    cursor = db.execute(insert_sql, (source_name,))
    db.commit()
    return cursor.lastrowid
def finish_crawl_log(db: sqlite3.Connection, log_id: int,
found: int, new: int, updated: int, skipped: int,
status: str = "completed", error: str | None = None):
"""Update a source_log entry with results."""
db.execute(
"""UPDATE source_log
SET run_finished_at = datetime('now'),
records_found = ?, records_new = ?,
records_updated = ?, records_skipped = ?,
status = ?, error_message = ?
WHERE id = ?""",
(found, new, updated, skipped, status, error, log_id)
)
db.commit()
def store_source_record(db: sqlite3.Connection, source_name: str,
source_id: str, source_url: str | None,
raw_data: dict, log_id: int) -> int | None:
"""Store a raw source record. Returns the row ID, or None if duplicate."""
try:
cur = db.execute(
"""INSERT INTO source_record
(source_name, source_id, source_url, raw_data, log_id)
VALUES (?, ?, ?, ?, ?)""",
(source_name, source_id, source_url, json.dumps(raw_data), log_id)
)
db.commit()
return cur.lastrowid
except sqlite3.IntegrityError:
# Duplicate source_name + source_id — already have this record
return None
def normalize_phone(phone: str | None) -> str | None:
"""Basic phone normalization."""
if not phone:
return None
# Remove common noise
phone = phone.strip().replace("\xa0", " ")
# If multiple numbers, take the first
for sep in [";", "/", "|", ","]:
if sep in phone:
phone = phone.split(sep)[0].strip()
return phone or None
def normalize_state(state: str | None) -> str | None:
"""Normalize Australian state names to abbreviations."""
if not state:
return None
state = state.strip().upper()
mapping = {
"NEW SOUTH WALES": "NSW",
"VICTORIA": "VIC",
"QUEENSLAND": "QLD",
"SOUTH AUSTRALIA": "SA",
"WESTERN AUSTRALIA": "WA",
"TASMANIA": "TAS",
"NORTHERN TERRITORY": "NT",
"AUSTRALIAN CAPITAL TERRITORY": "ACT",
"AUSTRALIA CAPITAL TERRITORY": "ACT",
}
result = mapping.get(state, state)
# Only return valid Australian states
valid = {"NSW", "VIC", "QLD", "SA", "WA", "TAS", "NT", "ACT"}
return result if result in valid else None
def generate_slug(name: str) -> str:
    """Generate a URL-safe slug from a business name.

    Accented letters are folded to their unaccented form, apostrophes
    (straight, typographic and backtick) are removed without leaving a gap,
    and every other run of non-alphanumeric characters becomes one hyphen.

    Returns:
        A lowercase slug made of ``a-z``, ``0-9`` and single hyphens, with no
        leading or trailing hyphen. May be empty if ``name`` has no
        alphanumeric characters.
    """
    import re
    import unicodedata

    # Decompose accented characters and drop the combining marks so that
    # "é" becomes "e" instead of being replaced by a hyphen below.
    decomposed = unicodedata.normalize("NFKD", name)
    folded = "".join(ch for ch in decomposed if not unicodedata.combining(ch))

    slug = folded.lower().strip()
    # Remove apostrophes, including the typographic ones (U+2018/U+2019)
    # that commonly appear in scraped names, so "O’Brien" -> "obrien".
    slug = re.sub(r"['`\u2018\u2019]", "", slug)
    slug = re.sub(r"[^a-z0-9]+", "-", slug)  # non-alphanum -> hyphen
    return slug.strip("-")
def to_intermediate(source: str, source_id: str, source_url: str | None,
business: dict, locations: list[dict],
packages: list[dict] | None = None) -> dict:
"""Build the normalized intermediate format record."""
return {
"source": source,
"sourceId": source_id,
"sourceUrl": source_url,
"scrapedAt": datetime.now(timezone.utc).isoformat(),
"business": business,
"locations": locations,
"packages": packages or [],
}