Initial commit: funeral provider discovery pipeline
Python crawlers for VIC Register, Funerals Australia, and NFDA; n8n workflows for scheduled discovery and enrichment; SQLite schema and seeded dev database (1,463 providers); end-to-end process documentation in n8n/PROCESS.md.
This commit is contained in:
220
crawlers/crawl_vic_register.py
Normal file
220
crawlers/crawl_vic_register.py
Normal file
@@ -0,0 +1,220 @@
|
||||
"""Crawler for the VIC Consumer Affairs Public Register of Funeral Providers.
|
||||
|
||||
Source: https://registers.consumer.vic.gov.au/fpsearch
|
||||
Method: HTTP GET per letter A-Z, parse HTML tables
|
||||
Fields: name, place of business, postcode, postal address, phone
|
||||
"""
|
||||
|
||||
import re
|
||||
import time
|
||||
import json
|
||||
import html.parser
|
||||
from pathlib import Path
|
||||
|
||||
from base import (
|
||||
fetch_url, get_db, start_crawl_log, finish_crawl_log,
|
||||
store_source_record, normalize_phone, generate_slug,
|
||||
to_intermediate, CRAWL_DELAY,
|
||||
)
|
||||
|
||||
SOURCE_NAME = "vic_register"
|
||||
BASE_URL = "https://registers.consumer.vic.gov.au/FpSearch/PerformSearch"
|
||||
|
||||
|
||||
class VICTableParser(html.parser.HTMLParser):
    """Parse the VIC register HTML table into records.

    Only ``<td>`` cells of ``<tr>`` rows found inside a ``<tbody>`` of a
    ``<table>`` are collected. Every finished row that has at least four
    cells is appended to ``self.records`` as a list of stripped strings.

    Robustness:
      * HTML allows ``</td>`` and ``</tr>`` to be omitted. A cell that is
        still open when the next ``<td>`` starts or its row ends is flushed
        rather than lost, and a row that is still open when the next
        ``<tr>`` starts (or the body/table closes) is finished first.
      * ``<br>`` inside a cell contributes one space so the text on either
        side of the line break is not fused into a single word.
    """

    def __init__(self):
        super().__init__()
        self.records = []
        self._in_table = False
        self._in_tbody = False
        self._in_row = False
        self._in_cell = False
        self._current_row = []
        self._current_cell = ""

    def _flush_cell(self):
        """Close the open cell, if any, and add its text to the current row."""
        if self._in_cell:
            self._in_cell = False
            self._current_row.append(self._current_cell.strip())

    def _finish_row(self):
        """Close the open row, if any, keeping it when it has >= 4 cells."""
        if not self._in_row:
            return
        self._flush_cell()
        self._in_row = False
        if len(self._current_row) >= 4:
            self.records.append(self._current_row)

    def handle_starttag(self, tag, attrs):
        if tag == "table":
            self._in_table = True
        elif tag == "tbody" and self._in_table:
            self._in_tbody = True
        elif tag == "tr" and self._in_tbody:
            # A previous row without an explicit </tr> ends here.
            self._finish_row()
            self._in_row = True
            self._current_row = []
        elif tag == "td" and self._in_row:
            # A previous cell without an explicit </td> ends here.
            self._flush_cell()
            self._in_cell = True
            self._current_cell = ""
        elif tag == "br" and self._in_cell:
            self._current_cell += " "

    def handle_endtag(self, tag):
        if tag == "td":
            self._flush_cell()
        elif tag == "tr":
            self._finish_row()
        elif tag == "tbody":
            self._finish_row()
            self._in_tbody = False
        elif tag == "table":
            self._finish_row()
            self._in_table = False

    def handle_data(self, data):
        if self._in_cell:
            self._current_cell += data
|
||||
|
||||
|
||||
def parse_address(place_of_business: str) -> dict:
    """Parse a VIC register address into components.

    Args:
        place_of_business: Free-text address as shown in the register.
            ``None`` is treated as an empty string.

    Returns:
        A dict with keys ``address`` (the stripped input), ``suburb``
        (best-effort guess or ``None``), ``state`` (always ``"VIC"``) and
        ``postcode`` (trailing four-digit number or ``None``).

    The suburb is only guessed when a trailing postcode is present. A
    trailing ``VIC`` state token is removed first so it does not end up in
    the suburb.
    """
    parts = (place_of_business or "").strip()

    # Try to extract postcode from the end
    postcode_match = re.search(r'\b(\d{4})\s*$', parts)
    postcode = postcode_match.group(1) if postcode_match else None

    # Try to extract suburb (usually the last word(s) before postcode)
    suburb = None
    if postcode_match:
        before_postcode = parts[:postcode_match.start()].strip().rstrip(",").strip()
        # The state is reported separately, so drop a trailing "VIC" token
        # ("..., Kew VIC 3101") before looking for the suburb.
        before_postcode = re.sub(
            r'(?:^|[\s,]+)VIC\.?$', '', before_postcode, flags=re.IGNORECASE
        ).rstrip(",").strip()

        if "," in before_postcode:
            # Last segment after comma is usually suburb
            suburb = before_postcode.split(",")[-1].strip() or None
        else:
            # Take last 1-2 words as suburb
            words = before_postcode.split()
            if len(words) >= 2:
                suburb = " ".join(words[-2:]) if words[-1][0].isupper() else words[-1]
            elif words:
                # Address consisted of just "<suburb> <postcode>".
                suburb = words[0]

    return {
        "address": parts,
        "suburb": suburb,
        "state": "VIC",
        "postcode": postcode,
    }
|
||||
|
||||
|
||||
def crawl_letter(letter: str) -> list[dict]:
    """Fetch the register page for one letter and return its rows as dicts.

    The page at ``BASE_URL?Letter=<letter>`` is downloaded with
    ``fetch_url`` and fed to ``VICTableParser``. Each parsed row is mapped
    positionally onto the register columns (Name, Place of Business,
    Postcode, Postal Address, Phone); columns a row does not have become
    empty strings. Rows whose name cell is empty are dropped.
    """
    page = fetch_url(f"{BASE_URL}?Letter={letter}")

    table = VICTableParser()
    table.feed(page)

    field_names = (
        "name",
        "place_of_business",
        "postcode",
        "postal_address",
        "phone",
    )
    width = len(field_names)

    results = []
    for cells in table.records:
        # Pad short rows with "" and ignore any cells beyond the known columns.
        padded = list(cells[:width]) + [""] * (width - len(cells))
        if not padded[0]:
            continue
        results.append(
            {key: value.strip() for key, value in zip(field_names, padded)}
        )
    return results
|
||||
|
||||
|
||||
def make_source_id(record: dict) -> str:
    """Create a stable source ID from the provider name and postcode.

    The ID is ``<slug of lower-cased, stripped name>_<postcode>``. The place
    of business is deliberately not part of the ID; the format is kept as is
    so IDs of records that were already stored continue to match.

    Args:
        record: Raw register record with at least ``name`` and ``postcode``.

    Returns:
        The source ID string.
    """
    name = record["name"].lower().strip()
    return f"{generate_slug(name)}_{record['postcode']}"
|
||||
|
||||
|
||||
def to_normalized(record: dict) -> dict:
    """Convert a VIC register record to intermediate format.

    Args:
        record: Raw register record with the keys ``name``,
            ``place_of_business``, ``postcode`` and ``phone``.

    Returns:
        Whatever ``to_intermediate`` builds from one business entry and a
        single location. Fields the register does not provide (ABN, email,
        website, description, coordinates) are set to ``None``.
    """
    addr = parse_address(record["place_of_business"])
    # The register lists one number per provider; it is used for both the
    # business and its single location, so normalise it only once.
    phone = normalize_phone(record["phone"])

    business = {
        "name": record["name"],
        "abn": None,
        "phone": phone,
        "email": None,
        "website": None,
        "description": None,
    }

    locations = [{
        "address": record["place_of_business"],
        "suburb": addr["suburb"],
        "state": "VIC",
        # Prefer the register's own postcode column; fall back to the one
        # parsed from the address text.
        "postcode": record["postcode"] or addr["postcode"],
        "lat": None,
        "lng": None,
        "phone": phone,
    }]

    source_id = make_source_id(record)
    # Slice instead of indexing so an empty name cannot raise IndexError.
    first_letter = record["name"][:1].upper()

    return to_intermediate(
        source=SOURCE_NAME,
        source_id=source_id,
        source_url=f"{BASE_URL}?Letter={first_letter}",
        business=business,
        locations=locations,
    )
|
||||
|
||||
|
||||
def run():
    """Crawl the whole VIC register (letters A-Z) and persist the results.

    Steps:
      1. Open the database and start a crawl-log entry for ``SOURCE_NAME``.
      2. Fetch every letter page, pausing ``CRAWL_DELAY`` seconds between
         consecutive requests (no pause after the last letter).
      3. Store each raw record; when ``store_source_record`` returns a
         truthy row id, write the normalized JSON for that row and count it
         as new, otherwise count it as skipped.
         NOTE(review): a falsy row id presumably means the record already
         exists - confirm against ``base.store_source_record``.
      4. Commit and close the crawl-log entry.

    Any exception marks the crawl-log entry as "failed" with the error text
    and is re-raised. The database connection is closed in every case.

    Returns:
        The list of raw record dicts collected from all letters.
    """
    alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    final_index = len(alphabet) - 1

    db = get_db()
    log_id = start_crawl_log(db, SOURCE_NAME)
    print(f"[{SOURCE_NAME}] Starting crawl (log_id={log_id})")

    all_records = []
    found = new = skipped = 0

    try:
        for index, letter in enumerate(alphabet):
            print(f" Crawling letter {letter}...", end=" ", flush=True)
            batch = crawl_letter(letter)
            print(f"{len(batch)} records")

            all_records += batch
            found += len(batch)

            # Be polite to the server between requests.
            if index < final_index:
                time.sleep(CRAWL_DELAY)

        # Store and normalize
        for raw in all_records:
            record_key = make_source_id(raw)
            page_url = f"{BASE_URL}?Letter={raw['name'][0].upper()}"
            row_id = store_source_record(
                db, SOURCE_NAME, record_key, page_url, raw, log_id
            )

            if not row_id:
                skipped += 1
                continue

            payload = json.dumps(to_normalized(raw))
            db.execute(
                "UPDATE source_record SET normalized_data = ? WHERE id = ?",
                (payload, row_id)
            )
            new += 1

        db.commit()
        finish_crawl_log(db, log_id, found, new, 0, skipped)
        print(f"[{SOURCE_NAME}] Done: {found} found, {new} new, {skipped} skipped")

    except Exception as e:
        finish_crawl_log(db, log_id, found, new, 0, skipped, "failed", str(e))
        raise
    finally:
        db.close()

    return all_records


# Allow the crawler to be started directly as a script.
if __name__ == "__main__":
    run()
|
||||
Reference in New Issue
Block a user