Python crawlers for VIC Register, Funerals Australia, and NFDA; n8n workflows for scheduled discovery and enrichment; SQLite schema and seeded dev database (1,463 providers); end-to-end process documentation in n8n/PROCESS.md.
165 lines
5.5 KiB
Python
165 lines
5.5 KiB
Python
"""Base crawler module with shared utilities."""
|
|
|
|
import gzip
|
|
import io
|
|
import json
|
|
import time
|
|
import sqlite3
|
|
import urllib.request
|
|
import urllib.parse
|
|
import urllib.error
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
# SQLite database file, resolved relative to this module: the "database"
# directory that sits next to this module's own directory.
DB_PATH = Path(__file__).parent.parent.joinpath("database", "providers.db")

# Courtesy pause, in seconds, between consecutive requests.
CRAWL_DELAY = 1.0

# Browser-like identity string used as the default User-Agent header.
USER_AGENT = " ".join((
    "Mozilla/5.0 (X11; Linux x86_64)",
    "AppleWebKit/537.36",
    "(KHTML, like Gecko)",
    "Chrome/131.0.0.0",
    "Safari/537.36",
))
|
|
|
|
|
|
def fetch_url(url: str, method: str = "GET", data: dict | None = None,
              headers: dict | None = None, timeout: int = 30) -> str:
    """Fetch a URL and return the response body as text.

    Args:
        url: URL to request.
        method: HTTP method; matched case-insensitively.
        data: Optional parameters. For POST they are form-encoded into the
            request body; for GET they are merged into the URL's query
            string; for any other method they are ignored.
        headers: Extra request headers. They are applied on top of the
            default ``User-Agent`` header and may override it.
        timeout: Timeout in seconds passed to ``urllib.request.urlopen``.

    Returns:
        The response body, gunzipped if necessary and decoded using the
        charset declared in the response headers (UTF-8 when none is
        declared or the declared one is unknown to Python).

    Raises:
        urllib.error.URLError: Propagated from ``urllib.request.urlopen``.
        UnicodeDecodeError: If the body cannot be decoded.
    """
    # Normalise so that "post"/"get" behave like "POST"/"GET".
    method = method.upper()

    hdrs = {"User-Agent": USER_AGENT}
    if headers:
        hdrs.update(headers)

    body = None
    if data and method == "POST":
        body = urllib.parse.urlencode(data, doseq=True).encode("utf-8")
        hdrs.setdefault("Content-Type", "application/x-www-form-urlencoded")
    elif data and method == "GET":
        # Merge into any query string already present instead of blindly
        # appending "?...", and keep a trailing #fragment after the query.
        parts = urllib.parse.urlsplit(url)
        extra = urllib.parse.urlencode(data, doseq=True)
        query = f"{parts.query}&{extra}" if parts.query else extra
        url = urllib.parse.urlunsplit(parts._replace(query=query))

    req = urllib.request.Request(url, data=body, headers=hdrs, method=method)
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        raw = resp.read()
        # Handle gzip-compressed responses: trust the header (compared
        # case-insensitively) or the gzip magic bytes at the start of the body.
        encoding = (resp.headers.get("Content-Encoding") or "").strip().lower()
        if encoding == "gzip" or raw[:2] == b"\x1f\x8b":
            raw = gzip.decompress(raw)
        charset = resp.headers.get_content_charset() or "utf-8"

    try:
        return raw.decode(charset)
    except LookupError:
        # The server declared a charset name Python does not know.
        return raw.decode("utf-8")
|
|
|
|
|
|
def fetch_json(url: str, method: str = "GET", data: dict | None = None,
               headers: dict | None = None, timeout: int = 30) -> dict:
    """Fetch a URL and parse the response body as JSON.

    Args:
        url: URL to request.
        method: HTTP method, forwarded to ``fetch_url``.
        data: Optional request parameters, forwarded to ``fetch_url``.
        headers: Optional extra request headers, forwarded to ``fetch_url``.
        timeout: Request timeout in seconds, forwarded to ``fetch_url``.
            Defaults to 30, the same default ``fetch_url`` uses.

    Returns:
        The value produced by ``json.loads`` for the response text. The
        annotation says ``dict``; a response whose top-level JSON value is
        not an object is returned as whatever ``json.loads`` yields.

    Raises:
        json.JSONDecodeError: If the body is not valid JSON.
    """
    text = fetch_url(url, method=method, data=data, headers=headers,
                     timeout=timeout)
    return json.loads(text)
|
|
|
|
|
|
def get_db() -> sqlite3.Connection:
    """Open the database at ``DB_PATH`` and return a configured connection.

    The connection has its journal mode set to WAL, foreign-key enforcement
    switched on, and ``sqlite3.Row`` installed as the row factory.
    """
    connection = sqlite3.connect(str(DB_PATH))
    for pragma in ("PRAGMA journal_mode=WAL", "PRAGMA foreign_keys=ON"):
        connection.execute(pragma)
    connection.row_factory = sqlite3.Row
    return connection
|
|
|
|
|
|
def start_crawl_log(db: sqlite3.Connection, source_name: str) -> int:
    """Insert a new ``source_log`` row for a crawl and return its ID.

    Only ``source_name`` is supplied to the insert. The insert is committed
    before returning.
    """
    insert_sql = "INSERT INTO source_log (source_name) VALUES (?)"
    params = (source_name,)
    log_row_id = db.execute(insert_sql, params).lastrowid
    db.commit()
    return log_row_id
|
|
|
|
|
|
def finish_crawl_log(db: sqlite3.Connection, log_id: int,
|
|
found: int, new: int, updated: int, skipped: int,
|
|
status: str = "completed", error: str | None = None):
|
|
"""Update a source_log entry with results."""
|
|
db.execute(
|
|
"""UPDATE source_log
|
|
SET run_finished_at = datetime('now'),
|
|
records_found = ?, records_new = ?,
|
|
records_updated = ?, records_skipped = ?,
|
|
status = ?, error_message = ?
|
|
WHERE id = ?""",
|
|
(found, new, updated, skipped, status, error, log_id)
|
|
)
|
|
db.commit()
|
|
|
|
|
|
def store_source_record(db: sqlite3.Connection, source_name: str,
|
|
source_id: str, source_url: str | None,
|
|
raw_data: dict, log_id: int) -> int | None:
|
|
"""Store a raw source record. Returns the row ID, or None if duplicate."""
|
|
try:
|
|
cur = db.execute(
|
|
"""INSERT INTO source_record
|
|
(source_name, source_id, source_url, raw_data, log_id)
|
|
VALUES (?, ?, ?, ?, ?)""",
|
|
(source_name, source_id, source_url, json.dumps(raw_data), log_id)
|
|
)
|
|
db.commit()
|
|
return cur.lastrowid
|
|
except sqlite3.IntegrityError:
|
|
# Duplicate source_name + source_id — already have this record
|
|
return None
|
|
|
|
|
|
def normalize_phone(phone: str | None) -> str | None:
|
|
"""Basic phone normalization."""
|
|
if not phone:
|
|
return None
|
|
# Remove common noise
|
|
phone = phone.strip().replace("\xa0", " ")
|
|
# If multiple numbers, take the first
|
|
for sep in [";", "/", "|", ","]:
|
|
if sep in phone:
|
|
phone = phone.split(sep)[0].strip()
|
|
return phone or None
|
|
|
|
|
|
def normalize_state(state: str | None) -> str | None:
|
|
"""Normalize Australian state names to abbreviations."""
|
|
if not state:
|
|
return None
|
|
state = state.strip().upper()
|
|
mapping = {
|
|
"NEW SOUTH WALES": "NSW",
|
|
"VICTORIA": "VIC",
|
|
"QUEENSLAND": "QLD",
|
|
"SOUTH AUSTRALIA": "SA",
|
|
"WESTERN AUSTRALIA": "WA",
|
|
"TASMANIA": "TAS",
|
|
"NORTHERN TERRITORY": "NT",
|
|
"AUSTRALIAN CAPITAL TERRITORY": "ACT",
|
|
"AUSTRALIA CAPITAL TERRITORY": "ACT",
|
|
}
|
|
result = mapping.get(state, state)
|
|
# Only return valid Australian states
|
|
valid = {"NSW", "VIC", "QLD", "SA", "WA", "TAS", "NT", "ACT"}
|
|
return result if result in valid else None
|
|
|
|
|
|
def generate_slug(name: str) -> str:
    """Generate a URL-safe slug from a business name.

    The name is lower-cased and trimmed, apostrophes are dropped so that
    possessives stay joined ("Smith's" -> "smiths"), every remaining run of
    characters outside ``a-z0-9`` becomes a single hyphen, and leading or
    trailing hyphens are removed.

    Args:
        name: Business name.

    Returns:
        The slug; an empty string when the name has no ASCII letters or
        digits.
    """
    import re  # function-scope import, as in the original block

    slug = name.lower().strip()
    # Remove apostrophes: straight ('), backtick (`), and the typographic
    # quotes U+2018/U+2019, which would otherwise turn into hyphens.
    slug = re.sub(r"['`\u2018\u2019]", "", slug)
    slug = re.sub(r"[^a-z0-9]+", "-", slug)  # non-alphanum -> hyphen
    return slug.strip("-")
|
|
|
|
|
|
def to_intermediate(source: str, source_id: str, source_url: str | None,
|
|
business: dict, locations: list[dict],
|
|
packages: list[dict] | None = None) -> dict:
|
|
"""Build the normalized intermediate format record."""
|
|
return {
|
|
"source": source,
|
|
"sourceId": source_id,
|
|
"sourceUrl": source_url,
|
|
"scrapedAt": datetime.now(timezone.utc).isoformat(),
|
|
"business": business,
|
|
"locations": locations,
|
|
"packages": packages or [],
|
|
}
|