"""GBIF Backbone Taxonomy resolver, with on-disk cache.

Uses the public species/match endpoint, which fuzzy matches a scientific name
against the GBIF backbone and returns a stable usageKey. We turn that into
a canonical, resolvable URL: https://www.gbif.org/species/{usageKey}.

No external dependency, stdlib only.
"""
from __future__ import annotations
import json
import urllib.parse
import urllib.request
from pathlib import Path
from typing import Optional


# Name-matching endpoint queried by GbifResolver.resolve; the scientific name is
# sent as the `name` query parameter.
GBIF_MATCH_URL = "https://api.gbif.org/v1/species/match"
# Template for the species page URL; `{key}` is filled with the usageKey
# returned by the match endpoint.
GBIF_SPECIES_PAGE = "https://www.gbif.org/species/{key}"


class GbifResolver:
    """Resolve scientific names to GBIF species URLs, cached to disk.

    The cache is a single JSON object mapping the stripped input name to either
    a result entry or ``{"miss": true}``. It is loaded once in the constructor
    and rewritten in full after every new lookup result.
    """

    def __init__(self, cache_path: Path, timeout: float = 10.0):
        """Create a resolver backed by the JSON cache file at *cache_path*.

        Args:
            cache_path: Location of the JSON cache file. It does not need to
                exist yet; it (and its parent directories) are created on the
                first flush.
            timeout: Timeout in seconds passed to each HTTP request.
        """
        self.cache_path = Path(cache_path)
        self.timeout = timeout
        self._cache: dict[str, dict] = {}
        if self.cache_path.exists():
            self._cache = self._load_cache()

    def _load_cache(self) -> dict[str, dict]:
        """Read the cache file, falling back to an empty cache if it is unusable."""
        try:
            loaded = json.loads(self.cache_path.read_text(encoding="utf-8"))
        except (OSError, ValueError) as e:
            # ValueError covers both invalid JSON and invalid UTF-8. A damaged
            # cache must not prevent the resolver from being constructed.
            print(f"  ! GBIF cache {self.cache_path} is unreadable, starting empty: {e}")
            return {}
        if not isinstance(loaded, dict):
            print(f"  ! GBIF cache {self.cache_path} is not a JSON object, starting empty")
            return {}
        # Drop malformed values so cache hits can rely on entries being dicts.
        return {name: entry for name, entry in loaded.items() if isinstance(entry, dict)}

    def resolve(self, scientific_name: str) -> Optional[dict]:
        """Return the GBIF match for *scientific_name*, or None on failure.

        On success the result is a dict with the keys ``url``, ``usageKey``,
        ``canonicalName``, ``matchType`` and ``rank``. The returned dict is a
        copy, so mutating it does not affect the cache.

        Results are cached, including misses (stored as {"miss": true}) so a
        re-run does not pound the API for unmatched names. Request or parsing
        failures are reported on stdout and are NOT cached, so they are retried
        on the next call.
        """
        key = scientific_name.strip()
        if not key:
            return None
        cached = self._cache.get(key)
        if cached is not None:
            return None if cached.get("miss") else dict(cached)

        params = urllib.parse.urlencode({"name": key, "verbose": "false"})
        url = f"{GBIF_MATCH_URL}?{params}"
        try:
            with urllib.request.urlopen(url, timeout=self.timeout) as r:
                payload = json.loads(r.read().decode("utf-8"))
        except Exception as e:
            # Deliberately broad: any network, HTTP or decoding problem is a
            # best-effort failure for this one name, not a reason to abort.
            print(f"  ! GBIF lookup failed for {key!r}: {e}")
            return None
        if not isinstance(payload, dict):
            # Valid JSON but not an object; treat it like a failed request
            # instead of crashing on payload.get below.
            print(
                f"  ! GBIF lookup failed for {key!r}: "
                f"unexpected response type {type(payload).__name__}"
            )
            return None

        usage_key = payload.get("usageKey")
        if not usage_key or payload.get("matchType") == "NONE":
            self._cache[key] = {"miss": True}
            self._flush()
            return None

        entry = {
            "url": GBIF_SPECIES_PAGE.format(key=usage_key),
            "usageKey": usage_key,
            "canonicalName": payload.get("canonicalName") or payload.get("scientificName"),
            "matchType": payload.get("matchType"),
            "rank": payload.get("rank"),
        }
        self._cache[key] = entry
        self._flush()
        return dict(entry)

    def _flush(self) -> None:
        """Write the whole in-memory cache to ``cache_path`` as JSON.

        The data is written to a sibling ``.tmp`` file which then replaces the
        real file, so an interrupted write cannot leave a truncated cache.
        """
        # Serialize first so a serialization error never touches the disk.
        serialized = json.dumps(self._cache, ensure_ascii=False, indent=2, sort_keys=True)
        self.cache_path.parent.mkdir(parents=True, exist_ok=True)
        tmp_path = self.cache_path.with_name(self.cache_path.name + ".tmp")
        tmp_path.write_text(serialized, encoding="utf-8")
        tmp_path.replace(self.cache_path)