"""ABN Lookup module via the Australian Business Register (ABR) API. Enriches providers with their ABN (strongest dedup key) and validates that they are active registered businesses. The ABR API is FREE. Requires a GUID (authentication token) from: https://abr.business.gov.au/Tools/WebServices Configuration: Set ABR_GUID env var or in config.json. """ import json import os import re import time import urllib.parse import xml.etree.ElementTree as ET from base import fetch_url, get_db, CRAWL_DELAY # Load ABR GUID from env or config ABR_GUID = os.environ.get("ABR_GUID") if not ABR_GUID: config_path = os.path.join(os.path.dirname(__file__), "config.json") if os.path.exists(config_path): with open(config_path) as f: config = json.load(f) ABR_GUID = config.get("abr_guid") ABR_BASE = "https://abr.business.gov.au/abrxmlsearch/AbrXmlSearch.asmx" def search_by_name(name: str, state: str | None = None, postcode: str | None = None) -> list[dict]: """Search ABR by business name. Returns matching records.""" if not ABR_GUID: print(" WARNING: ABR_GUID not configured. Skipping ABN lookup.") return [] params = { "name": name, "postcode": postcode or "", "legalName": "Y", "tradingName": "Y", "NSW": "Y", "SA": "Y", "ACT": "Y", "VIC": "Y", "WA": "Y", "NT": "Y", "QLD": "Y", "TAS": "Y", "authenticationGuid": ABR_GUID, } # If state specified, only search that state if state: for s in ["NSW", "SA", "ACT", "VIC", "WA", "NT", "QLD", "TAS"]: params[s] = "Y" if s == state else "N" url = f"{ABR_BASE}/ABRSearchByNameSimpleProtocol" try: text = fetch_url(url, method="GET", data=params, timeout=15) except Exception as e: return [] # Parse XML response results = [] try: root = ET.fromstring(text) # The ABR response uses a default namespace ns = {"abr": "http://abr.business.gov.au/ABRXMLSearch/"} for record in root.findall(".//abr:searchResultsRecord", ns): abn_elem = record.find(".//abr:ABN/abr:identifierValue", ns) status_elem = record.find(".//abr:ABN/abr:identifierStatus", ns) name_elem = ( record.find(".//abr:mainName/abr:organisationName", ns) or record.find(".//abr:mainTradingName/abr:organisationName", ns) or record.find(".//abr:businessName/abr:organisationName", ns) ) state_elem = record.find(".//abr:mainBusinessPhysicalAddress/abr:stateCode", ns) postcode_elem = record.find(".//abr:mainBusinessPhysicalAddress/abr:postcode", ns) score_elem = record.find(".//abr:nameScore", ns) if abn_elem is not None: results.append({ "abn": abn_elem.text, "status": status_elem.text if status_elem is not None else None, "name": name_elem.text if name_elem is not None else None, "state": state_elem.text if state_elem is not None else None, "postcode": postcode_elem.text if postcode_elem is not None else None, "score": int(score_elem.text) if score_elem is not None else 0, }) except ET.ParseError: return [] return results def find_best_match(name: str, state: str | None = None, postcode: str | None = None) -> dict | None: """Find the best ABR match for a business name. Returns the highest-scoring active match, or None. """ results = search_by_name(name, state, postcode) # Filter to active businesses active = [r for r in results if r.get("status") == "Active"] if not active: return None # Sort by score descending active.sort(key=lambda r: r.get("score", 0), reverse=True) # Return best match if score is reasonable best = active[0] if best.get("score", 0) >= 80: return best return None def run(limit: int | None = None, state_filter: str | None = None): """Look up ABNs for all providers that don't have one.""" db = get_db() query = """ SELECT id, title, business_state, business_postcode FROM funeral_brand WHERE abn IS NULL AND verified = 0 """ params = [] if state_filter: query += " AND business_state = ?" params.append(state_filter) query += " ORDER BY id" if limit: query += f" LIMIT {limit}" providers = db.execute(query, params).fetchall() print(f"Providers without ABN: {len(providers)}") if not ABR_GUID: print("ERROR: ABR_GUID not configured.") print(" Register at: https://abr.business.gov.au/Tools/WebServices") print(" Then set ABR_GUID env var or add 'abr_guid' to config.json") return found = 0 not_found = 0 for i, prov in enumerate(providers): if (i + 1) % 20 == 0 or i == 0: print(f" [{i+1}/{len(providers)}] {prov['title']}") match = find_best_match( prov["title"], prov["business_state"], prov["business_postcode"] ) if match: db.execute( "UPDATE funeral_brand SET abn = ?, updated_at = datetime('now') WHERE id = ?", (match["abn"], prov["id"]) ) found += 1 else: not_found += 1 if (i + 1) % 50 == 0: db.commit() time.sleep(0.5) # Be gentle with the government API db.commit() print(f"\nDone: {found} ABNs found, {not_found} not found") print(f" Success rate: {found/(found+not_found)*100:.1f}%" if found + not_found > 0 else "") db.close() if __name__ == "__main__": import sys limit = None state = None for arg in sys.argv[1:]: if arg.startswith("--state="): state = arg.split("=")[1] elif arg.startswith("--limit="): limit = int(arg.split("=")[1]) else: try: limit = int(arg) except ValueError: pass run(limit=limit, state_filter=state)