Initial commit: funeral provider discovery pipeline

Python crawlers for VIC Register, Funerals Australia, NFDA
n8n workflows for scheduled discovery and enrichment
SQLite schema and seeded dev database (1,463 providers)
End-to-end process documentation in n8n/PROCESS.md
This commit is contained in:
Richie
2026-04-24 10:27:08 +10:00
commit cc91427789
30 changed files with 4706 additions and 0 deletions

285
database/schema.sql Normal file
View File

@@ -0,0 +1,285 @@
-- Provider Discovery Pipeline - Database Schema
-- Designed for Postgres. Compatible with SilverStripe CMS adaptation.
--
-- This schema covers the provider-facing tables needed for both
-- verified (signed-up) and unverified (auto-discovered) providers.
-- Product catalog tables (coffins, venues, etc.) are NOT included here —
-- those only apply to verified providers and live in the main CMS.
-- Open a transaction so the whole schema is created atomically; the
-- matching COMMIT follows the trigger definitions.
BEGIN;

-- ============================================================
-- ENUMS
-- ============================================================

-- Values accepted by funeral_brand.enrichment_status.
CREATE TYPE enrichment_status AS ENUM (
    'pending',
    'partial',
    'complete',
    'failed'
);

-- Values accepted by funeral_brand.listing_tier: how a provider appears on
-- the platform. Computed from data quality (verified status, packages,
-- inclusions).
--   verified  - Tier 1: signed up, full branding, arrangements enabled
--   priced    - Tier 2: unverified, 2+ packages with itemized inclusion prices
--   estimated - Tier 3: unverified, at least one total package price
--   listed    - Tier 4: unverified, contact info only, no pricing
CREATE TYPE listing_tier AS ENUM (
    'verified',
    'priced',
    'estimated',
    'listed'
);

-- Values accepted by package.funeral_type.
CREATE TYPE funeral_type_enum AS ENUM (
    'Service & Cremation',
    'Service & Burial',
    'Cremation Only',
    'Graveside Burial',
    'Water Cremation'
);
-- ============================================================
-- 1. FUNERAL HOME (parent organisation)
-- ============================================================
-- Referenced by funeral_brand.funeral_home_id.
CREATE TABLE funeral_home (
    id          SERIAL       PRIMARY KEY,
    title       TEXT         NOT NULL,
    website     TEXT,
    created_at  TIMESTAMPTZ  NOT NULL DEFAULT NOW(),
    -- Overwritten on every UPDATE by trg_funeral_home_updated.
    updated_at  TIMESTAMPTZ  NOT NULL DEFAULT NOW()
);
-- ============================================================
-- 2. FUNERAL BRAND (customer-facing provider)
-- ============================================================
-- One row per provider, covering both verified (signed-up) and
-- unverified (auto-discovered) providers.
CREATE TABLE funeral_brand (
    id                       SERIAL PRIMARY KEY,
    title                    TEXT NOT NULL,
    description              TEXT,
    modal_description        TEXT,
    email                    TEXT,
    phone                    TEXT,
    website                  TEXT,
    abn                      TEXT,
    code                     TEXT UNIQUE,                    -- URL slug (e.g. "hparsons")
    sort                     INTEGER DEFAULT 0,
    hidden                   BOOLEAN NOT NULL DEFAULT TRUE,  -- unverified providers start hidden

    -- Address
    business_address         TEXT,
    business_suburb          TEXT,
    business_state           TEXT,
    business_postcode        TEXT,

    -- Branding (nullable: unverified providers have no images)
    background_colour        TEXT,
    foreground_colour        TEXT,

    -- Organisation; the brand survives deletion of its parent funeral_home
    funeral_home_id          INTEGER REFERENCES funeral_home(id) ON DELETE SET NULL,

    -- Verified vs auto-discovered
    verified                 BOOLEAN NOT NULL DEFAULT FALSE,

    -- Provenance tracking
    source_key               TEXT UNIQUE,                    -- "{source}:{externalId}" for dedup
    source_url               TEXT,                           -- where this record was found
    last_enriched_at         TIMESTAMPTZ,
    enrichment_status        enrichment_status NOT NULL DEFAULT 'pending',

    -- Listing tier (computed from data quality)
    listing_tier             listing_tier NOT NULL DEFAULT 'listed',

    -- Funeral types offered (comma-separated IDs, same as existing CMS)
    available_funeral_types  TEXT,

    created_at               TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at               TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Deduplication lookups (ABN, then name + postcode).
-- source_key needs no index of its own: the UNIQUE constraint on the column
-- already creates one, so a second index would only add write overhead.
CREATE INDEX idx_brand_abn ON funeral_brand(abn) WHERE abn IS NOT NULL;
CREATE INDEX idx_brand_name_postcode ON funeral_brand(title, business_postcode);

-- Listing / filtering lookups.
CREATE INDEX idx_brand_listing_tier ON funeral_brand(listing_tier);
CREATE INDEX idx_brand_verified ON funeral_brand(verified);
CREATE INDEX idx_brand_hidden ON funeral_brand(hidden);

-- Enrichment queue: only unverified brands are indexed.
CREATE INDEX idx_brand_enrichment ON funeral_brand(enrichment_status) WHERE verified = FALSE;

-- Index the foreign key so deleting a funeral_home (ON DELETE SET NULL) and
-- "brands of this home" lookups do not have to scan the whole table.
CREATE INDEX idx_brand_funeral_home ON funeral_brand(funeral_home_id) WHERE funeral_home_id IS NOT NULL;
-- ============================================================
-- 3. LOCATION (physical office/chapel)
-- ============================================================
-- Each location belongs to exactly one brand and is removed with it.
CREATE TABLE location (
    id                SERIAL PRIMARY KEY,
    -- Display name (e.g. "Kingaroy, QLD")
    title             TEXT NOT NULL,
    address           TEXT,
    suburb            TEXT,
    state             TEXT,
    postcode          TEXT,
    country           TEXT DEFAULT 'Australia',
    lat               DOUBLE PRECISION,
    lng               DOUBLE PRECISION,
    -- Google rating 0-5
    rating            REAL,
    -- Number of Google reviews
    rating_num        INTEGER,
    -- Google Places ID
    google_place_key  TEXT,
    brand_id          INTEGER NOT NULL
                      REFERENCES funeral_brand(id) ON DELETE CASCADE,
    created_at        TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at        TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

CREATE INDEX idx_location_brand
    ON location(brand_id);
CREATE INDEX idx_location_state
    ON location(state);
CREATE INDEX idx_location_postcode
    ON location(postcode);
CREATE INDEX idx_location_coords
    ON location(lat, lng);
-- Partial: rows without a Google Places ID are not indexed.
CREATE INDEX idx_location_google
    ON location(google_place_key)
    WHERE google_place_key IS NOT NULL;
-- ============================================================
-- 4. FUNERAL AREA (service region)
-- ============================================================
CREATE TABLE funeral_area (
    id           SERIAL PRIMARY KEY,
    title        TEXT NOT NULL,
    code         TEXT,
    description  TEXT,
    -- Comma-separated postcode list
    postcodes    TEXT,
    sort         INTEGER DEFAULT 0,
    hidden       BOOLEAN DEFAULT FALSE,
    created_at   TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    -- Overwritten on every UPDATE by trg_funeral_area_updated.
    updated_at   TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Junction: brand <-> funeral_area (many-to-many).
-- A link row is removed automatically when either side is deleted.
CREATE TABLE brand_funeral_area (
    brand_id         INTEGER NOT NULL REFERENCES funeral_brand(id) ON DELETE CASCADE,
    funeral_area_id  INTEGER NOT NULL REFERENCES funeral_area(id) ON DELETE CASCADE,
    PRIMARY KEY (brand_id, funeral_area_id)
);

-- The primary-key index leads with brand_id, so it does not serve lookups
-- by funeral_area_id alone. Index the second foreign key for "brands in this
-- area" queries and for the cascade when a funeral_area is deleted.
CREATE INDEX idx_brand_funeral_area_area ON brand_funeral_area(funeral_area_id);
-- ============================================================
-- 5. PACKAGE (funeral plan bundle)
-- ============================================================
-- Each package belongs to exactly one brand and is removed with it.
CREATE TABLE package (
    id                     SERIAL PRIMARY KEY,
    title                  TEXT NOT NULL,
    description            TEXT,
    sort                   INTEGER DEFAULT 0,
    hidden                 BOOLEAN DEFAULT FALSE,
    -- 'myself' / 'someone' / null (both)
    for_whom               TEXT,
    -- Comma-separated supported religions
    religion               TEXT,
    funeral_type           funeral_type_enum,
    brand_id               INTEGER NOT NULL
                           REFERENCES funeral_brand(id) ON DELETE CASCADE,

    -- Provenance (for AI-extracted packages)
    -- Page this was extracted from
    source_url             TEXT,
    -- 0-1 confidence score from AI
    extraction_confidence  REAL,

    created_at             TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at             TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

CREATE INDEX idx_package_brand
    ON package(brand_id);
CREATE INDEX idx_package_type
    ON package(funeral_type);
-- Junction: package <-> funeral_area (many-to-many).
-- A link row is removed automatically when either side is deleted.
CREATE TABLE package_funeral_area (
    package_id       INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
    funeral_area_id  INTEGER NOT NULL REFERENCES funeral_area(id) ON DELETE CASCADE,
    PRIMARY KEY (package_id, funeral_area_id)
);

-- The primary-key index leads with package_id, so it does not serve lookups
-- by funeral_area_id alone. Index the second foreign key for "packages in
-- this area" queries and for the cascade when a funeral_area is deleted.
CREATE INDEX idx_package_funeral_area_area ON package_funeral_area(funeral_area_id);
-- ============================================================
-- 6. PACKAGE INCLUSION (fee line item within a package)
-- ============================================================
-- Each inclusion belongs to exactly one package and is removed with it.
CREATE TABLE package_inclusion (
    id                    SERIAL PRIMARY KEY,
    price                 NUMERIC(10,2) NOT NULL,
    optional              BOOLEAN NOT NULL DEFAULT FALSE,
    complimentary         BOOLEAN NOT NULL DEFAULT FALSE,
    display               BOOLEAN NOT NULL DEFAULT TRUE,
    description           TEXT,
    sort                  INTEGER DEFAULT 0,
    -- Category label (e.g. "Professional Service Fee")
    inclusion_type_title  TEXT NOT NULL,
    package_id            INTEGER NOT NULL
                          REFERENCES package(id) ON DELETE CASCADE,
    created_at            TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at            TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

CREATE INDEX idx_inclusion_package
    ON package_inclusion(package_id);
-- ============================================================
-- 7. KNOWN FOR (feature badges on provider cards)
-- ============================================================
-- Each badge belongs to exactly one brand and is removed with it.
CREATE TABLE known_for (
    id        SERIAL PRIMARY KEY,
    title     TEXT NOT NULL,
    brand_id  INTEGER NOT NULL
              REFERENCES funeral_brand(id) ON DELETE CASCADE
);

CREATE INDEX idx_known_for_brand
    ON known_for(brand_id);
-- ============================================================
-- 8. SOURCE LOG (audit trail of scrape runs)
-- ============================================================
-- Referenced by source_record.log_id.
CREATE TABLE source_log (
    id               SERIAL PRIMARY KEY,
    -- 'vic_register', 'gathered_here', 'nfda', 'funerals_australia'
    source_name      TEXT NOT NULL,
    run_started_at   TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    run_finished_at  TIMESTAMPTZ,

    -- Per-run counters, all starting at zero
    records_found    INTEGER DEFAULT 0,
    records_new      INTEGER DEFAULT 0,
    records_updated  INTEGER DEFAULT 0,
    records_skipped  INTEGER DEFAULT 0,

    -- 'running', 'completed', 'failed'
    status           TEXT DEFAULT 'running',
    error_message    TEXT,
    -- Any extra run info
    metadata         JSONB
);
-- ============================================================
-- 9. SOURCE RECORD (raw scraped data, kept for audit)
-- ============================================================
-- One row per (source_name, source_id). Rows outlive the brand they were
-- matched to and the run that produced them: both foreign keys are set to
-- NULL when the referenced row is deleted.
CREATE TABLE source_record (
    id                SERIAL PRIMARY KEY,
    source_name       TEXT NOT NULL,
    source_id         TEXT NOT NULL,           -- external ID from the source
    source_url        TEXT,
    raw_data          JSONB NOT NULL,          -- original scraped data
    normalized_data   JSONB,                   -- mapped to intermediate format
    matched_brand_id  INTEGER REFERENCES funeral_brand(id) ON DELETE SET NULL,
    match_type        TEXT,                    -- 'source_key', 'abn', 'name_postcode', 'fuzzy', 'new'
    processed_at      TIMESTAMPTZ,
    log_id            INTEGER REFERENCES source_log(id) ON DELETE SET NULL,
    created_at        TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    UNIQUE(source_name, source_id)
);

-- (source_name, source_id) needs no index of its own: the UNIQUE constraint
-- above already creates one on exactly those columns.

-- Partial: unmatched rows are not indexed.
CREATE INDEX idx_source_record_brand ON source_record(matched_brand_id) WHERE matched_brand_id IS NOT NULL;

-- Index the foreign key so "records of this run" lookups and deleting a
-- source_log row (ON DELETE SET NULL) do not have to scan the whole table.
CREATE INDEX idx_source_record_log ON source_record(log_id);
-- ============================================================
-- UPDATED_AT TRIGGER
-- ============================================================
-- Row-level trigger function: stamps the row being written with the current
-- time in updated_at, replacing whatever value the statement supplied.
-- NOW() is the transaction start time, so every row touched in the same
-- transaction receives the same timestamp.
CREATE OR REPLACE FUNCTION update_updated_at()
RETURNS TRIGGER
LANGUAGE plpgsql
AS $$
BEGIN
    NEW.updated_at := NOW();
    RETURN NEW;
END;
$$;
-- Attach update_updated_at() to each table in this schema that has an
-- updated_at column, so the column is refreshed on every row UPDATE.
CREATE TRIGGER trg_funeral_home_updated
    BEFORE UPDATE ON funeral_home
    FOR EACH ROW
    EXECUTE FUNCTION update_updated_at();

CREATE TRIGGER trg_funeral_brand_updated
    BEFORE UPDATE ON funeral_brand
    FOR EACH ROW
    EXECUTE FUNCTION update_updated_at();

CREATE TRIGGER trg_location_updated
    BEFORE UPDATE ON location
    FOR EACH ROW
    EXECUTE FUNCTION update_updated_at();

CREATE TRIGGER trg_funeral_area_updated
    BEFORE UPDATE ON funeral_area
    FOR EACH ROW
    EXECUTE FUNCTION update_updated_at();

CREATE TRIGGER trg_package_updated
    BEFORE UPDATE ON package
    FOR EACH ROW
    EXECUTE FUNCTION update_updated_at();

CREATE TRIGGER trg_package_inclusion_updated
    BEFORE UPDATE ON package_inclusion
    FOR EACH ROW
    EXECUTE FUNCTION update_updated_at();

-- Close the transaction opened by BEGIN at the top of the schema.
COMMIT;