"""Crawler for the VIC Consumer Affairs Public Register of Funeral Providers. Source: https://registers.consumer.vic.gov.au/fpsearch Method: HTTP GET per letter A-Z, parse HTML tables Fields: name, place of business, postcode, postal address, phone """ import re import time import json import html.parser from pathlib import Path from base import ( fetch_url, get_db, start_crawl_log, finish_crawl_log, store_source_record, normalize_phone, generate_slug, to_intermediate, CRAWL_DELAY, ) SOURCE_NAME = "vic_register" BASE_URL = "https://registers.consumer.vic.gov.au/FpSearch/PerformSearch" class VICTableParser(html.parser.HTMLParser): """Parse the VIC register HTML table into records.""" def __init__(self): super().__init__() self.records = [] self._in_table = False self._in_tbody = False self._in_row = False self._in_cell = False self._current_row = [] self._current_cell = "" def handle_starttag(self, tag, attrs): if tag == "table": self._in_table = True elif tag == "tbody" and self._in_table: self._in_tbody = True elif tag == "tr" and self._in_tbody: self._in_row = True self._current_row = [] elif tag == "td" and self._in_row: self._in_cell = True self._current_cell = "" def handle_endtag(self, tag): if tag == "td" and self._in_cell: self._in_cell = False self._current_row.append(self._current_cell.strip()) elif tag == "tr" and self._in_row: self._in_row = False if len(self._current_row) >= 4: self.records.append(self._current_row) elif tag == "tbody": self._in_tbody = False elif tag == "table": self._in_table = False def handle_data(self, data): if self._in_cell: self._current_cell += data def parse_address(place_of_business: str) -> dict: """Parse a VIC register address into components.""" parts = place_of_business.strip() # Try to extract postcode from the end postcode_match = re.search(r'\b(\d{4})\s*$', parts) postcode = postcode_match.group(1) if postcode_match else None # Try to extract suburb (usually the last word(s) before postcode) suburb = None if postcode: before_postcode = parts[:postcode_match.start()].strip().rstrip(",").strip() # Last segment after comma is usually suburb if "," in before_postcode: suburb = before_postcode.split(",")[-1].strip() else: # Take last 1-2 words as suburb words = before_postcode.split() if len(words) >= 2: suburb = " ".join(words[-2:]) if words[-1][0].isupper() else words[-1] return { "address": parts, "suburb": suburb, "state": "VIC", "postcode": postcode, } def crawl_letter(letter: str) -> list[dict]: """Crawl all records for a single letter.""" url = f"{BASE_URL}?Letter={letter}" html_text = fetch_url(url) parser = VICTableParser() parser.feed(html_text) records = [] for row in parser.records: # Columns: Name, Place of Business, Postcode, Postal Address, Phone name = row[0] if len(row) > 0 else "" place = row[1] if len(row) > 1 else "" postcode = row[2] if len(row) > 2 else "" postal = row[3] if len(row) > 3 else "" phone = row[4] if len(row) > 4 else "" if not name: continue records.append({ "name": name.strip(), "place_of_business": place.strip(), "postcode": postcode.strip(), "postal_address": postal.strip(), "phone": phone.strip(), }) return records def make_source_id(record: dict) -> str: """Create a stable source ID from name + address.""" name = record["name"].lower().strip() addr = record["place_of_business"].lower().strip() return f"{generate_slug(name)}_{record['postcode']}" def to_normalized(record: dict) -> dict: """Convert a VIC register record to intermediate format.""" addr = parse_address(record["place_of_business"]) business = { "name": record["name"], "abn": None, "phone": normalize_phone(record["phone"]), "email": None, "website": None, "description": None, } locations = [{ "address": record["place_of_business"], "suburb": addr["suburb"], "state": "VIC", "postcode": record["postcode"] or addr["postcode"], "lat": None, "lng": None, "phone": normalize_phone(record["phone"]), }] source_id = make_source_id(record) return to_intermediate( source=SOURCE_NAME, source_id=source_id, source_url=f"{BASE_URL}?Letter={record['name'][0].upper()}", business=business, locations=locations, ) def run(): """Run the full VIC register crawl.""" db = get_db() log_id = start_crawl_log(db, SOURCE_NAME) print(f"[{SOURCE_NAME}] Starting crawl (log_id={log_id})") all_records = [] found = 0 new = 0 skipped = 0 try: for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ": print(f" Crawling letter {letter}...", end=" ", flush=True) records = crawl_letter(letter) print(f"{len(records)} records") all_records.extend(records) found += len(records) if letter != "Z": time.sleep(CRAWL_DELAY) # Store and normalize for record in all_records: source_id = make_source_id(record) row_id = store_source_record( db, SOURCE_NAME, source_id, f"{BASE_URL}?Letter={record['name'][0].upper()}", record, log_id ) if row_id: normalized = to_normalized(record) db.execute( "UPDATE source_record SET normalized_data = ? WHERE id = ?", (json.dumps(normalized), row_id) ) new += 1 else: skipped += 1 db.commit() finish_crawl_log(db, log_id, found, new, 0, skipped) print(f"[{SOURCE_NAME}] Done: {found} found, {new} new, {skipped} skipped") except Exception as e: finish_crawl_log(db, log_id, found, new, 0, skipped, "failed", str(e)) raise finally: db.close() return all_records if __name__ == "__main__": run()