Initial commit: funeral provider discovery pipeline
Python crawlers for VIC Register, Funerals Australia, and NFDA; n8n workflows for scheduled discovery and enrichment; SQLite schema and seeded dev database (1,463 providers); end-to-end process documentation in n8n/PROCESS.md.
This commit is contained in:
164
crawlers/base.py
Normal file
164
crawlers/base.py
Normal file
@@ -0,0 +1,164 @@
|
||||
"""Base crawler module with shared utilities."""
|
||||
|
||||
import gzip
|
||||
import io
|
||||
import json
|
||||
import time
|
||||
import sqlite3
|
||||
import urllib.request
|
||||
import urllib.parse
|
||||
import urllib.error
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
# SQLite file shared by the crawlers: <directory above this module's
# directory>/database/providers.db
DB_PATH = Path(__file__).parent.parent.joinpath("database", "providers.db")

# Courtesy pause between consecutive requests, in seconds.
CRAWL_DELAY = 1.0
|
||||
|
||||
USER_AGENT = (
|
||||
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
||||
"(KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
|
||||
)
|
||||
|
||||
|
||||
def fetch_url(url: str, method: str = "GET", data: dict | None = None,
|
||||
headers: dict | None = None, timeout: int = 30) -> str:
|
||||
"""Fetch a URL and return the response body as text."""
|
||||
hdrs = {"User-Agent": USER_AGENT}
|
||||
if headers:
|
||||
hdrs.update(headers)
|
||||
|
||||
body = None
|
||||
if data and method == "POST":
|
||||
body = urllib.parse.urlencode(data, doseq=True).encode("utf-8")
|
||||
hdrs.setdefault("Content-Type", "application/x-www-form-urlencoded")
|
||||
elif data and method == "GET":
|
||||
url = url + "?" + urllib.parse.urlencode(data, doseq=True)
|
||||
|
||||
req = urllib.request.Request(url, data=body, headers=hdrs, method=method)
|
||||
with urllib.request.urlopen(req, timeout=timeout) as resp:
|
||||
raw = resp.read()
|
||||
# Handle gzip-compressed responses
|
||||
if resp.headers.get("Content-Encoding") == "gzip" or raw[:2] == b"\x1f\x8b":
|
||||
raw = gzip.decompress(raw)
|
||||
charset = resp.headers.get_content_charset() or "utf-8"
|
||||
return raw.decode(charset)
|
||||
|
||||
|
||||
def fetch_json(url: str, method: str = "GET", data: dict | None = None,
               headers: dict | None = None, timeout: int = 30) -> dict:
    """Fetch a URL and parse the response body as JSON.

    Args:
        url: Address to request.
        method: HTTP verb forwarded to ``fetch_url``.
        data: Optional request parameters forwarded to ``fetch_url``.
        headers: Optional extra request headers forwarded to ``fetch_url``.
        timeout: Timeout in seconds forwarded to ``fetch_url``; the default
            matches ``fetch_url``'s own default.

    Returns:
        The value produced by ``json.loads`` on the response text. The
        annotation says ``dict``, but the actual type is whatever the
        document's top-level JSON value is.

    Raises:
        json.JSONDecodeError: If the body is not valid JSON.
    """
    text = fetch_url(url, method=method, data=data, headers=headers,
                     timeout=timeout)
    return json.loads(text)
|
||||
|
||||
|
||||
def get_db() -> sqlite3.Connection:
    """Open the SQLite database at ``DB_PATH`` and configure the connection.

    The connection is switched to WAL journaling, has foreign-key
    enforcement turned on, and returns rows as ``sqlite3.Row`` objects.
    """
    connection = sqlite3.connect(str(DB_PATH))
    for pragma in ("PRAGMA journal_mode=WAL", "PRAGMA foreign_keys=ON"):
        connection.execute(pragma)
    connection.row_factory = sqlite3.Row
    return connection
|
||||
|
||||
|
||||
def start_crawl_log(db: sqlite3.Connection, source_name: str) -> int:
    """Insert a new ``source_log`` row for ``source_name`` and commit it.

    Returns:
        The row ID of the freshly inserted log entry.
    """
    insert_sql = "INSERT INTO source_log (source_name) VALUES (?)"
    new_log_id = db.execute(insert_sql, (source_name,)).lastrowid
    db.commit()
    return new_log_id
|
||||
|
||||
|
||||
def finish_crawl_log(db: sqlite3.Connection, log_id: int,
|
||||
found: int, new: int, updated: int, skipped: int,
|
||||
status: str = "completed", error: str | None = None):
|
||||
"""Update a source_log entry with results."""
|
||||
db.execute(
|
||||
"""UPDATE source_log
|
||||
SET run_finished_at = datetime('now'),
|
||||
records_found = ?, records_new = ?,
|
||||
records_updated = ?, records_skipped = ?,
|
||||
status = ?, error_message = ?
|
||||
WHERE id = ?""",
|
||||
(found, new, updated, skipped, status, error, log_id)
|
||||
)
|
||||
db.commit()
|
||||
|
||||
|
||||
def store_source_record(db: sqlite3.Connection, source_name: str,
|
||||
source_id: str, source_url: str | None,
|
||||
raw_data: dict, log_id: int) -> int | None:
|
||||
"""Store a raw source record. Returns the row ID, or None if duplicate."""
|
||||
try:
|
||||
cur = db.execute(
|
||||
"""INSERT INTO source_record
|
||||
(source_name, source_id, source_url, raw_data, log_id)
|
||||
VALUES (?, ?, ?, ?, ?)""",
|
||||
(source_name, source_id, source_url, json.dumps(raw_data), log_id)
|
||||
)
|
||||
db.commit()
|
||||
return cur.lastrowid
|
||||
except sqlite3.IntegrityError:
|
||||
# Duplicate source_name + source_id — already have this record
|
||||
return None
|
||||
|
||||
|
||||
def normalize_phone(phone: str | None) -> str | None:
|
||||
"""Basic phone normalization."""
|
||||
if not phone:
|
||||
return None
|
||||
# Remove common noise
|
||||
phone = phone.strip().replace("\xa0", " ")
|
||||
# If multiple numbers, take the first
|
||||
for sep in [";", "/", "|", ","]:
|
||||
if sep in phone:
|
||||
phone = phone.split(sep)[0].strip()
|
||||
return phone or None
|
||||
|
||||
|
||||
def normalize_state(state: str | None) -> str | None:
|
||||
"""Normalize Australian state names to abbreviations."""
|
||||
if not state:
|
||||
return None
|
||||
state = state.strip().upper()
|
||||
mapping = {
|
||||
"NEW SOUTH WALES": "NSW",
|
||||
"VICTORIA": "VIC",
|
||||
"QUEENSLAND": "QLD",
|
||||
"SOUTH AUSTRALIA": "SA",
|
||||
"WESTERN AUSTRALIA": "WA",
|
||||
"TASMANIA": "TAS",
|
||||
"NORTHERN TERRITORY": "NT",
|
||||
"AUSTRALIAN CAPITAL TERRITORY": "ACT",
|
||||
"AUSTRALIA CAPITAL TERRITORY": "ACT",
|
||||
}
|
||||
result = mapping.get(state, state)
|
||||
# Only return valid Australian states
|
||||
valid = {"NSW", "VIC", "QLD", "SA", "WA", "TAS", "NT", "ACT"}
|
||||
return result if result in valid else None
|
||||
|
||||
|
||||
def generate_slug(name: str) -> str:
    """Generate a URL-safe slug from a business name.

    The name is lower-cased and trimmed, apostrophes are dropped so
    possessives stay in one word ("Smith's" -> "smiths"), every other run of
    characters outside ``a-z0-9`` becomes a single hyphen, and leading or
    trailing hyphens are removed.

    Typographic apostrophes (U+2018, U+2019) are treated like the straight
    apostrophe and the backtick, so "O’Brien" and "O'Brien" produce the same
    slug.

    Returns:
        The slug; an empty string when the name has no ASCII letters or
        digits.
    """
    import re
    slug = name.lower().strip()
    slug = re.sub(r"['\u2018\u2019`]", "", slug)  # remove apostrophes
    slug = re.sub(r"[^a-z0-9]+", "-", slug)  # non-alphanum -> hyphen
    return slug.strip("-")
|
||||
|
||||
|
||||
def to_intermediate(source: str, source_id: str, source_url: str | None,
|
||||
business: dict, locations: list[dict],
|
||||
packages: list[dict] | None = None) -> dict:
|
||||
"""Build the normalized intermediate format record."""
|
||||
return {
|
||||
"source": source,
|
||||
"sourceId": source_id,
|
||||
"sourceUrl": source_url,
|
||||
"scrapedAt": datetime.now(timezone.utc).isoformat(),
|
||||
"business": business,
|
||||
"locations": locations,
|
||||
"packages": packages or [],
|
||||
}
|
||||
Reference in New Issue
Block a user