Python crawlers for VIC Register, Funerals Australia, and NFDA; n8n workflows for scheduled discovery and enrichment; SQLite schema and seeded dev database (1,463 providers); end-to-end process documentation in n8n/PROCESS.md.
286 lines
12 KiB
PL/PgSQL
286 lines
12 KiB
PL/PgSQL
-- Provider Discovery Pipeline - Database Schema
-- Designed for Postgres. Compatible with SilverStripe CMS adaptation.
--
-- This schema covers the provider-facing tables needed for both
-- verified (signed-up) and unverified (auto-discovered) providers.
-- Product catalog tables (coffins, venues, etc.) are NOT included here —
-- those only apply to verified providers and live in the main CMS.

-- The whole schema is created inside one transaction; the matching COMMIT
-- is the last statement of this file.
BEGIN;

-- ============================================================
-- ENUMS
-- ============================================================

-- Allowed values for funeral_brand.enrichment_status.
CREATE TYPE enrichment_status AS ENUM (
    'pending',
    'partial',
    'complete',
    'failed'
);

-- How a provider is presented on the platform. The tier is computed from
-- data quality: verified status + packages + inclusions.
CREATE TYPE listing_tier AS ENUM (
    -- Tier 1: signed up, full branding, arrangements enabled
    'verified',
    -- Tier 2: unverified, 2+ packages with itemized inclusion prices
    'priced',
    -- Tier 3: unverified, at least one total package price
    'estimated',
    -- Tier 4: unverified, contact info only, no pricing
    'listed'
);

-- Funeral types a package can be offered as (see package.funeral_type).
CREATE TYPE funeral_type_enum AS ENUM (
    'Service & Cremation',
    'Service & Burial',
    'Cremation Only',
    'Graveside Burial',
    'Water Cremation'
);

-- ============================================================
-- 1. FUNERAL HOME (parent organisation)
-- ============================================================

-- Parent organisation; funeral_brand rows point at it via funeral_home_id.
CREATE TABLE funeral_home (
    id         SERIAL,
    title      TEXT        NOT NULL,
    website    TEXT,
    created_at TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP,
    -- refreshed on every UPDATE by trg_funeral_home_updated
    updated_at TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP,

    CONSTRAINT funeral_home_pkey PRIMARY KEY (id)
);

-- ============================================================
-- 2. FUNERAL BRAND (customer-facing provider)
-- ============================================================

-- One row per customer-facing provider, whether verified (signed up) or
-- auto-discovered. Constraints are declared at the bottom under the same
-- names Postgres would generate for the inline forms.
CREATE TABLE funeral_brand (
    id                      SERIAL,
    title                   TEXT    NOT NULL,
    description             TEXT,
    modal_description       TEXT,
    email                   TEXT,
    phone                   TEXT,
    website                 TEXT,
    abn                     TEXT,
    -- URL slug (e.g. "hparsons")
    code                    TEXT,
    sort                    INTEGER DEFAULT 0,
    -- unverified providers start hidden
    hidden                  BOOLEAN NOT NULL DEFAULT TRUE,

    -- Address
    business_address        TEXT,
    business_suburb         TEXT,
    business_state          TEXT,
    business_postcode       TEXT,

    -- Branding (nullable — unverified providers have no images)
    background_colour       TEXT,
    foreground_colour       TEXT,

    -- Organisation
    funeral_home_id         INTEGER,

    -- Verified vs auto-discovered
    verified                BOOLEAN NOT NULL DEFAULT FALSE,

    -- Provenance tracking
    -- "{source}:{externalId}", used for deduplication
    source_key              TEXT,
    -- where this record was found
    source_url              TEXT,
    last_enriched_at        TIMESTAMPTZ,
    enrichment_status       enrichment_status NOT NULL DEFAULT 'pending',

    -- Listing tier (computed from data quality)
    listing_tier            listing_tier NOT NULL DEFAULT 'listed',

    -- Funeral types offered (comma-separated IDs, same as existing CMS)
    available_funeral_types TEXT,

    created_at              TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP,
    -- refreshed on every UPDATE by trg_funeral_brand_updated
    updated_at              TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP,

    CONSTRAINT funeral_brand_pkey PRIMARY KEY (id),
    CONSTRAINT funeral_brand_code_key UNIQUE (code),
    CONSTRAINT funeral_brand_source_key_key UNIQUE (source_key),
    CONSTRAINT funeral_brand_funeral_home_id_fkey
        FOREIGN KEY (funeral_home_id)
        REFERENCES funeral_home (id)
        ON DELETE SET NULL
);

-- Deduplication and filtering indexes for funeral_brand.
--
-- source_key gets no index here on purpose: the UNIQUE constraint on
-- funeral_brand.source_key already creates a unique index on that column,
-- so a second (partial) index on it would only add write and storage cost.

-- Dedup lookups by ABN; rows without an ABN are left out of the index.
CREATE INDEX idx_brand_abn
    ON funeral_brand (abn)
    WHERE abn IS NOT NULL;

-- Dedup lookups by name + postcode.
CREATE INDEX idx_brand_name_postcode
    ON funeral_brand (title, business_postcode);

CREATE INDEX idx_brand_listing_tier
    ON funeral_brand (listing_tier);

CREATE INDEX idx_brand_verified
    ON funeral_brand (verified);

CREATE INDEX idx_brand_hidden
    ON funeral_brand (hidden);

-- Enrichment status is only indexed for unverified rows.
CREATE INDEX idx_brand_enrichment
    ON funeral_brand (enrichment_status)
    WHERE verified = FALSE;

-- ============================================================
-- 3. LOCATION (physical office/chapel)
-- ============================================================

-- A physical site belonging to one funeral_brand; rows are removed together
-- with their brand (ON DELETE CASCADE).
--
-- The CHECK constraints enforce ranges that were previously only described
-- in comments (rating 0-5) or are implied by the data type (non-negative
-- review count, valid latitude/longitude). NULL stays allowed in all of them.
CREATE TABLE location (
    id               SERIAL,
    -- display name (e.g. "Kingaroy, QLD")
    title            TEXT NOT NULL,
    address          TEXT,
    suburb           TEXT,
    state            TEXT,
    postcode         TEXT,
    country          TEXT DEFAULT 'Australia',
    lat              DOUBLE PRECISION,
    lng              DOUBLE PRECISION,
    -- Google rating 0-5
    rating           REAL,
    -- number of Google reviews
    rating_num       INTEGER,
    -- Google Places ID
    google_place_key TEXT,

    brand_id         INTEGER NOT NULL,

    created_at       TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP,
    -- refreshed on every UPDATE by trg_location_updated
    updated_at       TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP,

    CONSTRAINT location_pkey PRIMARY KEY (id),
    CONSTRAINT location_brand_id_fkey
        FOREIGN KEY (brand_id)
        REFERENCES funeral_brand (id)
        ON DELETE CASCADE,
    CONSTRAINT location_lat_check        CHECK (lat BETWEEN -90 AND 90),
    CONSTRAINT location_lng_check        CHECK (lng BETWEEN -180 AND 180),
    CONSTRAINT location_rating_check     CHECK (rating BETWEEN 0 AND 5),
    CONSTRAINT location_rating_num_check CHECK (rating_num >= 0)
);

CREATE INDEX idx_location_brand
    ON location (brand_id);

CREATE INDEX idx_location_state
    ON location (state);

CREATE INDEX idx_location_postcode
    ON location (postcode);

CREATE INDEX idx_location_coords
    ON location (lat, lng);

-- Only rows that carry a Google Places ID are indexed.
CREATE INDEX idx_location_google
    ON location (google_place_key)
    WHERE google_place_key IS NOT NULL;

-- ============================================================
-- 4. FUNERAL AREA (service region)
-- ============================================================

-- A service region; linked to brands and packages through the
-- brand_funeral_area and package_funeral_area junction tables.
CREATE TABLE funeral_area (
    id          SERIAL,
    title       TEXT NOT NULL,
    code        TEXT,
    description TEXT,
    -- comma-separated postcode list
    postcodes   TEXT,
    sort        INTEGER DEFAULT 0,
    hidden      BOOLEAN DEFAULT FALSE,
    created_at  TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP,
    -- refreshed on every UPDATE by trg_funeral_area_updated
    updated_at  TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP,

    CONSTRAINT funeral_area_pkey PRIMARY KEY (id)
);

-- Junction: brand <-> funeral_area
-- Rows disappear automatically when either side is deleted.
CREATE TABLE brand_funeral_area (
    brand_id        INTEGER NOT NULL,
    funeral_area_id INTEGER NOT NULL,

    CONSTRAINT brand_funeral_area_pkey PRIMARY KEY (brand_id, funeral_area_id),
    CONSTRAINT brand_funeral_area_brand_id_fkey
        FOREIGN KEY (brand_id)
        REFERENCES funeral_brand (id)
        ON DELETE CASCADE,
    CONSTRAINT brand_funeral_area_funeral_area_id_fkey
        FOREIGN KEY (funeral_area_id)
        REFERENCES funeral_area (id)
        ON DELETE CASCADE
);

-- The primary key index leads with brand_id, so it cannot serve lookups by
-- funeral_area_id alone. This index covers "brands in an area" queries and
-- the cascading delete issued when a funeral_area row is removed.
CREATE INDEX idx_brand_funeral_area_area
    ON brand_funeral_area (funeral_area_id);

-- ============================================================
-- 5. PACKAGE (funeral plan bundle)
-- ============================================================

-- A funeral plan bundle offered by one funeral_brand; removed together with
-- its brand (ON DELETE CASCADE).
CREATE TABLE package (
    id                    SERIAL,
    title                 TEXT NOT NULL,
    description           TEXT,
    sort                  INTEGER DEFAULT 0,
    hidden                BOOLEAN DEFAULT FALSE,
    -- 'myself' / 'someone' / null (both)
    for_whom              TEXT,
    -- comma-separated supported religions
    religion              TEXT,
    funeral_type          funeral_type_enum,

    brand_id              INTEGER NOT NULL,

    -- Provenance (for AI-extracted packages)
    -- page this was extracted from
    source_url            TEXT,
    -- 0-1 confidence score from AI; NULL when not applicable
    extraction_confidence REAL,

    created_at            TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP,
    -- refreshed on every UPDATE by trg_package_updated
    updated_at            TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP,

    CONSTRAINT package_pkey PRIMARY KEY (id),
    CONSTRAINT package_brand_id_fkey
        FOREIGN KEY (brand_id)
        REFERENCES funeral_brand (id)
        ON DELETE CASCADE,
    -- Enforce the documented 0-1 range so a mis-scaled score (e.g. 85 instead
    -- of 0.85) is rejected at write time instead of skewing later filtering.
    CONSTRAINT package_extraction_confidence_check
        CHECK (extraction_confidence BETWEEN 0 AND 1)
);

CREATE INDEX idx_package_brand
    ON package (brand_id);

CREATE INDEX idx_package_type
    ON package (funeral_type);

-- Junction: package <-> funeral_area
-- Rows disappear automatically when either side is deleted.
CREATE TABLE package_funeral_area (
    package_id      INTEGER NOT NULL,
    funeral_area_id INTEGER NOT NULL,

    CONSTRAINT package_funeral_area_pkey PRIMARY KEY (package_id, funeral_area_id),
    CONSTRAINT package_funeral_area_package_id_fkey
        FOREIGN KEY (package_id)
        REFERENCES package (id)
        ON DELETE CASCADE,
    CONSTRAINT package_funeral_area_funeral_area_id_fkey
        FOREIGN KEY (funeral_area_id)
        REFERENCES funeral_area (id)
        ON DELETE CASCADE
);

-- The primary key index leads with package_id, so it cannot serve lookups by
-- funeral_area_id alone. This index covers "packages in an area" queries and
-- the cascading delete issued when a funeral_area row is removed.
CREATE INDEX idx_package_funeral_area_area
    ON package_funeral_area (funeral_area_id);

-- ============================================================
-- 6. PACKAGE INCLUSION (fee line item within a package)
-- ============================================================

-- One fee line item of a package; removed together with its package
-- (ON DELETE CASCADE).
CREATE TABLE package_inclusion (
    id                   SERIAL,
    price                NUMERIC(10, 2) NOT NULL,
    optional             BOOLEAN NOT NULL DEFAULT FALSE,
    complimentary        BOOLEAN NOT NULL DEFAULT FALSE,
    display              BOOLEAN NOT NULL DEFAULT TRUE,
    description          TEXT,
    sort                 INTEGER DEFAULT 0,
    -- category label (e.g. "Professional Service Fee")
    inclusion_type_title TEXT NOT NULL,

    package_id           INTEGER NOT NULL,

    created_at           TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP,
    -- refreshed on every UPDATE by trg_package_inclusion_updated
    updated_at           TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP,

    CONSTRAINT package_inclusion_pkey PRIMARY KEY (id),
    CONSTRAINT package_inclusion_package_id_fkey
        FOREIGN KEY (package_id)
        REFERENCES package (id)
        ON DELETE CASCADE
);

CREATE INDEX idx_inclusion_package
    ON package_inclusion (package_id);

-- ============================================================
-- 7. KNOWN FOR (feature badges on provider cards)
-- ============================================================

-- A badge label attached to one funeral_brand; removed together with its
-- brand (ON DELETE CASCADE).
CREATE TABLE known_for (
    id       SERIAL,
    title    TEXT    NOT NULL,
    brand_id INTEGER NOT NULL,

    CONSTRAINT known_for_pkey PRIMARY KEY (id),
    CONSTRAINT known_for_brand_id_fkey
        FOREIGN KEY (brand_id)
        REFERENCES funeral_brand (id)
        ON DELETE CASCADE
);

CREATE INDEX idx_known_for_brand
    ON known_for (brand_id);

-- ============================================================
-- 8. SOURCE LOG (audit trail of scrape runs)
-- ============================================================

-- One row per scrape run.
CREATE TABLE source_log (
    id              SERIAL,
    -- 'vic_register', 'gathered_here', 'nfda', 'funerals_australia'
    source_name     TEXT NOT NULL,
    run_started_at  TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP,
    run_finished_at TIMESTAMPTZ,
    records_found   INTEGER DEFAULT 0,
    records_new     INTEGER DEFAULT 0,
    records_updated INTEGER DEFAULT 0,
    records_skipped INTEGER DEFAULT 0,
    -- 'running', 'completed', 'failed'
    status          TEXT DEFAULT 'running',
    error_message   TEXT,
    -- any extra run info
    metadata        JSONB,

    CONSTRAINT source_log_pkey PRIMARY KEY (id)
);

-- ============================================================
-- 9. SOURCE RECORD (raw scraped data, kept for audit)
-- ============================================================

-- One row per (source_name, source_id) pair. Links to the matched brand and
-- to the scrape run are nulled rather than cascaded when the target row is
-- deleted, so the raw record itself is kept.
CREATE TABLE source_record (
    id               SERIAL,
    source_name      TEXT NOT NULL,
    -- external ID from the source
    source_id        TEXT NOT NULL,
    source_url       TEXT,
    -- original scraped data
    raw_data         JSONB NOT NULL,
    -- mapped to intermediate format
    normalized_data  JSONB,
    matched_brand_id INTEGER,
    -- 'source_key', 'abn', 'name_postcode', 'fuzzy', 'new'
    match_type       TEXT,
    processed_at     TIMESTAMPTZ,
    log_id           INTEGER,
    created_at       TIMESTAMPTZ NOT NULL DEFAULT CURRENT_TIMESTAMP,

    CONSTRAINT source_record_pkey PRIMARY KEY (id),
    CONSTRAINT source_record_source_name_source_id_key
        UNIQUE (source_name, source_id),
    CONSTRAINT source_record_matched_brand_id_fkey
        FOREIGN KEY (matched_brand_id)
        REFERENCES funeral_brand (id)
        ON DELETE SET NULL,
    CONSTRAINT source_record_log_id_fkey
        FOREIGN KEY (log_id)
        REFERENCES source_log (id)
        ON DELETE SET NULL
);

-- Indexes for source_record.
--
-- (source_name, source_id) gets no index here on purpose: the table's
-- UNIQUE (source_name, source_id) constraint already creates a unique index
-- on exactly those columns, so another one would only slow down writes.

-- Records matched to a brand; unmatched rows are left out of the index.
CREATE INDEX idx_source_record_brand
    ON source_record (matched_brand_id)
    WHERE matched_brand_id IS NOT NULL;

-- Records belonging to a scrape run. Also keeps the ON DELETE SET NULL
-- action from source_log from scanning the whole table.
CREATE INDEX idx_source_record_log
    ON source_record (log_id)
    WHERE log_id IS NOT NULL;

-- ============================================================
-- UPDATED_AT TRIGGER
-- ============================================================

-- Row-level trigger function: overwrites updated_at on the row being written
-- with the current transaction timestamp and lets the write proceed.
-- Intended for BEFORE UPDATE ... FOR EACH ROW triggers on tables that have
-- an updated_at column.
CREATE OR REPLACE FUNCTION update_updated_at()
RETURNS TRIGGER
LANGUAGE plpgsql
AS $body$
BEGIN
    NEW.updated_at := CURRENT_TIMESTAMP;
    RETURN NEW;
END;
$body$;

-- Attach update_updated_at() to every table that has an updated_at column,
-- so the column is refreshed on each row UPDATE.
CREATE TRIGGER trg_funeral_home_updated
    BEFORE UPDATE ON funeral_home
    FOR EACH ROW
    EXECUTE FUNCTION update_updated_at();

CREATE TRIGGER trg_funeral_brand_updated
    BEFORE UPDATE ON funeral_brand
    FOR EACH ROW
    EXECUTE FUNCTION update_updated_at();

CREATE TRIGGER trg_location_updated
    BEFORE UPDATE ON location
    FOR EACH ROW
    EXECUTE FUNCTION update_updated_at();

CREATE TRIGGER trg_funeral_area_updated
    BEFORE UPDATE ON funeral_area
    FOR EACH ROW
    EXECUTE FUNCTION update_updated_at();

CREATE TRIGGER trg_package_updated
    BEFORE UPDATE ON package
    FOR EACH ROW
    EXECUTE FUNCTION update_updated_at();

CREATE TRIGGER trg_package_inclusion_updated
    BEFORE UPDATE ON package_inclusion
    FOR EACH ROW
    EXECUTE FUNCTION update_updated_at();

-- Closes the transaction opened by BEGIN at the top of the schema.
COMMIT;