# -*- coding: utf-8 -*-
# SDR: sdr-0017-site-html-rastreabilidade
"""
Varre HTML em site/, valida links internos no filesystem e gera:
  - site/diagnostico/relatorio-links-falhas.md e .json
  - site/notebook-lm-urls.txt — uma URL absoluta por linha, vírgula no final (copiar/colar no Notebook LM)

Uso (na raiz do repositório), após sync + gerar_html_estatico_sdrs:
  python arquitetura-contratual/scripts/verificar_links_e_inventario_site.py
  python arquitetura-contratual/scripts/verificar_links_e_inventario_site.py \\
      --base-url https://cs.sinergia.dev.br --include-repo-md

Saída: exit code 1 se houver ao menos um link interno inválido.
"""
from __future__ import annotations

import argparse
import json
import sys
from datetime import datetime, timezone
from html.parser import HTMLParser
from pathlib import Path
from urllib.parse import parse_qs, unquote, urljoin

# Repository root: two directory levels above the folder that contains this script.
REPO = Path(__file__).resolve().parents[2]
# Default site root, used when --site-root is not given on the command line.
SITE = REPO / "arquitetura-contratual" / "site"
# Placeholder origin fed to urljoin() so relative hrefs can be resolved like URLs;
# a joined result that no longer starts with this prefix points to another origin.
FAKE_BASE = "https://__local-site__/"
# Folder (directly under the site root) that receives the reports; its own HTML
# files are excluded from the scan and from the URL inventory.
DIAG = "diagnostico"
# Default host used to build the absolute URLs written to notebook-lm-urls.txt.
DEFAULT_NOTEBOOK_LM_BASE = "https://centralservicos.web.app"


def should_skip_href(raw: str) -> bool:
    """Tell whether a link value should be left out of the filesystem check.

    Skipped values are: empty/blank strings (``None`` included), a bare ``#``,
    links starting with ``mailto:``, ``tel:``, ``javascript:``, ``data:``,
    ``http://`` or ``https://`` (case-insensitive), and same-page fragments
    (``#...``) that contain no ``?``.

    Args:
        raw: Attribute value as found in the HTML.

    Returns:
        True when the value must not be resolved against the site tree.
    """
    href = (raw or "").strip()
    if href in ("", "#"):
        return True

    non_local_prefixes = ("mailto:", "tel:", "javascript:", "data:", "http://", "https://")
    if href.lower().startswith(non_local_prefixes):
        return True

    # A fragment that carries a "?" is not skipped here; it is still handed
    # to the resolver.
    return href.startswith("#") and "?" not in href


def resolve_internal_target(site_root: Path, from_html: Path, raw_href: str) -> tuple[Path | None, str | None]:
    """
    Resolve a link found in an HTML page to the filesystem path that must exist.

    Args:
        site_root: Root directory of the static site (resolved internally).
        from_html: HTML file in which the link was found; must be under site_root.
        raw_href: Attribute value exactly as written in the page.

    Returns:
        A ``(path_to_check, reason)`` pair:
        - ``(path, None)``: internal link; the caller checks that ``path`` exists.
        - ``(None, "skip")``: nothing to verify (filtered by should_skip_href,
          joined URL on another origin, or the link resolves to the site root itself).
        - ``(None, "outside-site")``: the resolved path escapes site_root.

        For a link to ``ver-md.html`` with an ``f`` query parameter, the returned
        path is the file named by ``f`` (relative to site_root), not ver-md.html.

    Raises:
        ValueError: from_html is not located under site_root.
    """
    raw_href = raw_href.strip()
    if should_skip_href(raw_href):
        return None, "skip"

    # Split "path?query#fragment": the fragment is dropped, the query is kept
    # only for the ver-md.html special case below.
    main, _, _ = raw_href.partition("#")
    path_part, _, query = main.partition("?")

    site_root = site_root.resolve()
    try:
        rel_from = from_html.relative_to(site_root).as_posix()
    except ValueError:
        # site_root was just resolved but from_html was not: a relative path or
        # one reached through a symlinked prefix would fail the lexical
        # comparison above. Resolve it the same way before giving up.
        rel_from = from_html.resolve().relative_to(site_root).as_posix()

    # Let urljoin apply URL semantics ("../", root-relative "/x", "//host/x").
    joined = urljoin(FAKE_BASE + rel_from, path_part)
    if not joined.startswith(FAKE_BASE):
        return None, "skip"
    rel_resolved = unquote(joined[len(FAKE_BASE) :].lstrip("/"))
    if not rel_resolved:
        return None, "skip"

    # Percent-decoded segments or symlinks can still lead outside the site,
    # so the containment check runs on the resolved filesystem path.
    target = (site_root / rel_resolved).resolve()
    try:
        target.relative_to(site_root)
    except ValueError:
        return None, "outside-site"

    if target.name == "ver-md.html" and query:
        vals = parse_qs(query, strict_parsing=False).get("f") or []
        if vals:
            # parse_qs already percent-decodes the value; the extra unquote()
            # is kept so a double-encoded "f" is accepted as before.
            fpath = unquote(vals[0]).replace("\\", "/").strip().lstrip("/")
            md_target = (site_root / fpath).resolve()
            try:
                md_target.relative_to(site_root)
            except ValueError:
                return None, "outside-site"
            return md_target, None

    return target, None


def target_exists(p: Path) -> bool:
    """Return True when *p* is an existing regular file or directory.

    ``Path.is_file()`` / ``Path.is_dir()`` only swallow "missing"-style errors;
    other OS errors (for example a path component that is too long, or a
    permission error) are propagated. A link target that cannot even be
    inspected is reported as not existing, so a single odd href does not
    abort the whole scan.

    Args:
        p: Filesystem path to test.

    Returns:
        True for an existing file or directory, False otherwise.
    """
    try:
        return p.is_file() or p.is_dir()
    except (OSError, ValueError):
        return False


class LinkCollector(HTMLParser):
    """HTML parser that records the URL-bearing attribute of selected tags.

    After ``feed()``, ``found`` holds one ``(tag, attribute, value, line)``
    tuple per ``<a href>``, ``<link href>``, ``<script src>``, ``<img src>``
    and ``<iframe src>`` start tag, in document order.
    """

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        # (tag name, attribute name, attribute value, line reported by getpos())
        self.found: list[tuple[str, str, str, int]] = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Record the link attribute of *tag*, if it is one of the tracked tags."""
        url_attr_by_tag = {
            "a": "href",
            "link": "href",
            "script": "src",
            "img": "src",
            "iframe": "src",
        }
        wanted = url_attr_by_tag.get(tag)
        if wanted is None:
            return

        # Attributes without a value (e.g. <a href>) are ignored; when the
        # attribute is repeated, the last occurrence with a value wins.
        value = None
        for name, candidate in attrs:
            if name == wanted and candidate is not None:
                value = candidate
        if value is None:
            return

        lineno = self.getpos()[0]
        self.found.append((tag, wanted, value, lineno))


def collect_html_files(site_root: Path) -> list[Path]:
    """List the ``*.html`` paths below *site_root*, sorted.

    Anything whose first path component (relative to *site_root*) is the
    diagnostics folder ``DIAG`` is left out.

    Args:
        site_root: Directory to walk recursively.

    Returns:
        Sorted list of matching paths.
    """
    return [
        page
        for page in sorted(site_root.rglob("*.html"))
        if page.relative_to(site_root).parts[:1] != (DIAG,)
    ]


def repo_sdr_md_to_sdr_html_url(site_root: Path, md_path: Path, base: str) -> str | None:
    """
    Map a mirrored SDR markdown file to the URL of its generated static HTML page.

    ``repositorio/SDRs/sdr-*.md`` (relative to *site_root*) corresponds to
    ``sdr/<stem>.html``. The URL is returned only when that HTML file exists.

    Args:
        site_root: Root directory of the site.
        md_path: Path of the markdown file; it does not need to exist.
        base: Host prefix, joined to the page path with a single ``/``.

    Returns:
        ``"{base}/sdr/<stem>.html"``, or None when *md_path* is outside
        *site_root*, is not an SDR markdown file, or has no generated HTML page.
    """
    try:
        mirror_rel = md_path.relative_to(site_root).as_posix()
    except ValueError:
        return None
    mirror_rel = mirror_rel.replace("\\", "/")

    is_sdr_markdown = mirror_rel.startswith("repositorio/SDRs/sdr-") and mirror_rel.endswith(".md")
    if not is_sdr_markdown:
        return None

    # The prefix test also matches files nested under an "sdr-*" directory,
    # so the file name itself is checked as well.
    name = Path(mirror_rel).stem
    if not name.startswith("sdr-"):
        return None

    page_rel = "sdr/" + name + ".html"
    page = site_root / page_rel
    return f"{base}/{page_rel}" if page.is_file() else None


def write_notebook_lm_txt(
    out_path: Path,
    base_url: str,
    site_root: Path,
    html_pages: list[Path],
    repo_md: list[Path] | None,
) -> None:
    """Write the Notebook LM inventory: one absolute URL per line, each ending in a comma.

    The file contains nothing but the URLs (meant for copy/paste). URLs are
    de-duplicated and sorted case-insensitively; URLs that differ only by
    letter case are ordered by their exact text, so the output is identical
    from run to run.

    Args:
        out_path: File to write; parent directories are created when missing.
        base_url: Host prefix; trailing slashes are removed.
        site_root: Site root that every entry of *html_pages* is relative to.
        html_pages: HTML files to list.
        repo_md: Optional mirrored SDR markdown files; each one contributes the
            URL of its generated HTML page (never the .md itself), and entries
            without a generated page are ignored.

    Raises:
        ValueError: an entry of *html_pages* is not under *site_root*.
    """
    base = base_url.rstrip("/")
    urls: set[str] = set()
    for p in html_pages:
        rel = p.relative_to(site_root).as_posix().replace("\\", "/")
        urls.add(f"{base}/{rel}")
    for p in repo_md or ():
        u = repo_sdr_md_to_sdr_html_url(site_root, p, base)
        if u:
            urls.add(u)

    # Sorting a set by the lowered text alone would leave case-only variants in
    # hash order, which changes between interpreter runs; the exact text is the
    # tie-breaker.
    ordered = sorted(urls, key=lambda u: (u.lower(), u))
    body = "".join(f"{u},\n" for u in ordered)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(body, encoding="utf-8")


def _failure_record(
    site_root: Path,
    html_path: Path,
    error: str,
    *,
    tag: str = "?",
    attr: str = "?",
    href: str = "",
    resolved: str = "",
    line: int = 0,
) -> dict[str, str | int]:
    """Build one failure entry in the shape written to the JSON and Markdown reports."""
    return {
        "source": html_path.relative_to(site_root).as_posix(),
        "tag": tag,
        "attr": attr,
        "href": href,
        "resolved": resolved,
        "line": line,
        "error": error,
    }


def _scan_html_links(site_root: Path, html_files: list[Path]) -> list[dict[str, str | int]]:
    """Check every collected link of every page and return the list of failures."""
    failures: list[dict[str, str | int]] = []

    for html_path in html_files:
        try:
            text = html_path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError) as e:
            # UnicodeDecodeError is not an OSError: a page that is not valid
            # UTF-8 is reported as a read failure instead of aborting the run.
            failures.append(_failure_record(site_root, html_path, f"leitura: {e}"))
            continue

        parser = LinkCollector()
        try:
            parser.feed(text)
        except Exception as e:  # boundary: one unparsable page must not stop the scan
            failures.append(_failure_record(site_root, html_path, f"parse HTML: {e}"))
            continue

        for tag, attr, href, line in parser.found:
            target, err = resolve_internal_target(site_root, html_path, href)
            if err == "skip":
                continue
            if err == "outside-site":
                failures.append(
                    _failure_record(
                        site_root,
                        html_path,
                        "path resolve fora da raiz do site",
                        tag=tag,
                        attr=attr,
                        href=href,
                        resolved="(fora do site)",
                        line=line,
                    )
                )
                continue
            if target is None or target_exists(target):
                continue

            try:
                rel_t = target.relative_to(site_root).as_posix()
            except ValueError:
                rel_t = str(target)
            failures.append(
                _failure_record(
                    site_root,
                    html_path,
                    "arquivo ou pasta inexistente",
                    tag=tag,
                    attr=attr,
                    href=href,
                    resolved=rel_t,
                    line=line,
                )
            )

    return failures


def _write_failure_reports(
    site_root: Path,
    pages_scanned: int,
    failures: list[dict[str, str | int]],
) -> Path:
    """Write relatorio-links-falhas.json/.md under the diagnostics folder and return that folder."""
    diag_dir = site_root / DIAG
    diag_dir.mkdir(parents=True, exist_ok=True)
    ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    payload = {
        "generatedAt": ts,
        "siteRoot": str(site_root),
        "pagesScanned": pages_scanned,
        "failureCount": len(failures),
        "failures": failures,
    }
    (diag_dir / "relatorio-links-falhas.json").write_text(
        json.dumps(payload, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )

    md_lines = [
        "# Relatório de links internos quebrados",
        "",
        f"- Gerado em: `{ts}` (UTC)",
        f"- Raiz do site: `{site_root}`",
        f"- Páginas HTML analisadas: {pages_scanned}",
        f"- Falhas: **{len(failures)}**",
        "",
    ]
    if not failures:
        md_lines.append("Nenhuma falha encontrada (links internos resolvíveis no disco).")
    else:
        md_lines.extend(
            [
                "| Origem | Linha | Tag | href | Resolvido | Erro |",
                "|--------|-------|-----|------|-----------|------|",
            ]
        )
        for item in failures:
            # A literal "|" would otherwise split the table cell.
            h = str(item["href"]).replace("|", "\\|")
            r = str(item["resolved"]).replace("|", "\\|")
            md_lines.append(
                f"| `{item['source']}` | {item['line']} | {item['tag']} | `{h}` | `{r}` | {item['error']} |"
            )
    (diag_dir / "relatorio-links-falhas.md").write_text("\n".join(md_lines), encoding="utf-8")
    return diag_dir


def main(argv: list[str] | None = None) -> int:
    """Check internal links of the static site and write the reports and URL inventory.

    Args:
        argv: Command-line arguments without the program name; None (the
            default) makes argparse read ``sys.argv``.

    Returns:
        0 when no failure was recorded, 1 when at least one failure was
        recorded (broken link, unreadable or unparsable page), 2 when the
        site root is not a directory.
    """
    ap = argparse.ArgumentParser(description="Verifica links internos no site estático e gera inventário Notebook LM.")
    ap.add_argument(
        "--site-root",
        type=Path,
        default=SITE,
        help="Raiz do site (default: arquitetura-contratual/site)",
    )
    ap.add_argument(
        "--base-url",
        default=DEFAULT_NOTEBOOK_LM_BASE,
        help=f"URL do host sem barra final para notebook-lm-urls.txt (default: {DEFAULT_NOTEBOOK_LM_BASE})",
    )
    ap.add_argument(
        "--include-repo-md",
        action="store_true",
        help="Incluir URLs dos SDRs a partir do espelho .md (emite /sdr/sdr-*.html equivalente, não .md)",
    )
    ap.add_argument(
        "--repo-md-glob",
        default="repositorio/SDRs/sdr-*.md",
        help="Padrão relativo ao site-root para .md do espelho (default: repositorio/SDRs/sdr-*.md)",
    )
    args = ap.parse_args(argv)

    site_root: Path = args.site_root.resolve()
    if not site_root.is_dir():
        print(f"Erro: pasta não encontrada: {site_root}", file=sys.stderr)
        return 2

    # A blank --base-url falls back to the default host.
    base_url = (args.base_url or "").strip() or DEFAULT_NOTEBOOK_LM_BASE

    all_html = collect_html_files(site_root)
    failures = _scan_html_links(site_root, all_html)
    diag_dir = _write_failure_reports(site_root, len(all_html), failures)

    repo_md_list: list[Path] | None = None
    if args.include_repo_md:
        # Path.glob() only accepts relative patterns.
        pat = args.repo_md_glob.replace("\\", "/").lstrip("/")
        repo_md_list = sorted(site_root.glob(pat))

    # The inventory lists the same pages that were scanned (diagnostics folder
    # excluded), so the list collected above is reused instead of walking the
    # tree a second time.
    write_notebook_lm_txt(
        site_root / "notebook-lm-urls.txt",
        base_url,
        site_root,
        all_html,
        repo_md_list,
    )

    print(f"OK: {len(all_html)} HTML analisados; falhas: {len(failures)}")
    print(f"     JSON: {diag_dir / 'relatorio-links-falhas.json'}")
    print(f"     MD:   {diag_dir / 'relatorio-links-falhas.md'}")
    print(f"     Inventário Notebook LM: {site_root / 'notebook-lm-urls.txt'} (base {base_url})")

    return 1 if failures else 0


# Script entry point: the process exit status is the value returned by main().
if __name__ == "__main__":
    sys.exit(main())