"""cloakbrowser_tutorial.py — learn CloakBrowser by running it.

Setup, usage, and the "why a script and not a notebook" note all live on the
lesson page: https://krueng.ai/cloakbrowser.html
"""
import json
import re
import statistics
import sys
from pathlib import Path
from urllib.parse import quote, quote_plus

from cloakbrowser import launch

# Windows consoles default to cp1252, which can't encode ฿ or —. Force UTF-8 so
# printing prices doesn't crash. (Same family as the Thai-font codec gotchas.)
# stdout can be None or a replacement object that has no reconfigure() method,
# so only call it when it is actually there instead of crashing on import.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")


# Step 1: the minimal recipe
def minimal():
    """Fetch one listing page and report what came back.

    The whole recipe, in order: start the browser, open a tab, navigate, pause
    so JavaScript-rendered content can settle, then read the rendered HTML.
    Prints the HTML size and whether a baht sign appears in it.
    """
    listing_url = "https://www.fazwaz.com/condo-for-rent/thailand/chiang-mai"
    browser = launch()
    try:
        tab = browser.new_page()
        tab.goto(listing_url, timeout=60000, wait_until="domcontentloaded")
        tab.wait_for_timeout(4000)         # give JS-rendered content time to settle
        rendered = tab.content()
        tab.close()
    finally:
        browser.close()                    # it's a real process — never leave it running
    has_baht = "฿" in rendered
    print(len(rendered), "bytes of real, rendered HTML")
    print("contains a baht price:", has_baht)


# Step 2: one browser, many pages, save raw
def multipage():
    """Save the rendered HTML of several URLs using a single browser.

    The browser is launched once. Every URL gets its own tab, and the raw page
    is written to ``raw_<last-path-segment>.html`` in the current directory
    before any parsing happens. A failure on one URL is printed and skipped.
    """
    targets = (
        "https://www.fazwaz.com/condo-for-rent/thailand/chiang-mai/mueang-chiang-mai/tha-sala",
        "https://www.fazwaz.com/condo-for-rent/thailand/chiang-mai/mueang-chiang-mai/wat-ket",
    )
    # Milliseconds to pause between hits to the SAME site. This is etiquette and
    # load-reduction, NOT a rate-limit cure (see fetch()).
    pause_ms = 2000
    browser = launch()
    try:
        for index, target in enumerate(targets):
            tab = browser.new_page()       # fresh tab per URL (avoids nav races)
            try:
                if index > 0:
                    tab.wait_for_timeout(pause_ms)     # don't hammer the same host
                tab.goto(target, timeout=60000, wait_until="domcontentloaded")
                tab.wait_for_timeout(5000)
                slug = target.rstrip("/").rpartition("/")[2]
                out_file = Path(f"raw_{slug}.html")
                out_file.write_text(tab.content(), encoding="utf-8")
                print(f"ok  {slug}")
            except Exception as err:
                print(f"FAIL {target}: {err}")  # one bad URL shouldn't kill the run
            finally:
                tab.close()                # drop the tab, keep the browser
    finally:
        browser.close()                    # the browser itself is closed exactly once


# Step 3: pull data out — JSON-LD first
def extract_jsonld(html: str) -> list:
    """Return every parseable ``application/ld+json`` script body as Python data.

    Many listing/shopping sites embed a machine-readable copy of their data for
    search engines; it tends to be more stable than the visible markup.

    Args:
        html: Page source to scan.

    Returns:
        One decoded object per JSON-LD ``<script>`` block, in page order.
        Blocks whose body is not valid JSON are dropped silently.
    """
    script_re = re.compile(
        r'<script[^>]+type=["\']application/ld\+json["\'][^>]*>(.*?)</script>',
        re.DOTALL | re.IGNORECASE,
    )
    parsed = []
    for match in script_re.finditer(html):
        payload = match.group(1).strip()
        try:
            data = json.loads(payload)
        except json.JSONDecodeError:
            continue                        # malformed block: skip it, keep going
        parsed.append(data)
    return parsed


def jsonld_products(html: str) -> list:
    """Every schema.org Product in the page's JSON-LD, as {name, price, url}.

    A clean, fairly site-agnostic way to get product names + links: many shops
    embed Product objects for Google. Price is often absent here (see detail_items).

    Args:
        html: Page source; its JSON-LD blocks are read via extract_jsonld().

    Returns:
        A list of ``{"name": str, "price": float | None, "url": str}`` dicts in
        document order, de-duplicated by URL. Entries without a usable name or
        URL are skipped. ``price`` is None when it is missing or unparseable.
    """
    out, seen = [], set()
    # Depth-first walk. Children are pushed in REVERSE so the LIFO stack pops
    # them in document order; otherwise the first product on the page would
    # come out last and a caller's [:limit] would keep the bottom of the page.
    stack = list(reversed(extract_jsonld(html)))
    while stack:
        node = stack.pop()
        if isinstance(node, list):
            stack.extend(reversed(node))
            continue
        if not isinstance(node, dict):
            continue                        # scalars carry no nested products

        kind = node.get("@type")
        # JSON-LD allows "@type" to be a single string or a list of types.
        if kind == "Product" or (isinstance(kind, list) and "Product" in kind):
            # Take the first candidate that is a non-empty string; anything
            # else (dict, list, number) cannot be used as a link.
            url = next((v for v in (node.get("url"), node.get("@id"))
                        if isinstance(v, str) and v), "")
            if url.startswith("//"):
                url = "https:" + url        # protocol-relative -> absolute

            offers = node.get("offers") or {}
            if isinstance(offers, list):
                offers = offers[0] if offers else {}
            raw_price = offers.get("price") if isinstance(offers, dict) else None
            price = None
            if raw_price:
                if isinstance(raw_price, str):
                    # Commas are treated as thousands separators, matching
                    # how the rest of this file parses prices.
                    raw_price = raw_price.replace(",", "")
                try:
                    price = float(raw_price)
                except (TypeError, ValueError):
                    price = None            # odd price text must not abort the scrape

            name = node.get("name")
            name = name.strip() if isinstance(name, str) else ""
            if name and url and url not in seen:
                seen.add(url)
                out.append({"name": name, "price": price, "url": url})

        # Products can nest inside other objects (lists, graphs), so always
        # keep descending, including below a Product itself.
        stack.extend(reversed(list(node.values())))
    return out


def jsonld():
    """Load one listing page and preview its first JSON-LD block.

    Prints how many JSON-LD blocks were found and, if there is at least one,
    the first 600 characters of the first block pretty-printed as JSON.
    """
    listing_url = "https://www.fazwaz.com/condo-for-rent/thailand/chiang-mai"
    browser = launch()
    try:
        tab = browser.new_page()
        tab.goto(listing_url, timeout=60000, wait_until="domcontentloaded")
        tab.wait_for_timeout(4000)
        rendered = tab.content()
        structured = extract_jsonld(rendered)
        tab.close()
    finally:
        browser.close()
    print(f"found {len(structured)} JSON-LD block(s)")
    if not structured:
        return
    preview = json.dumps(structured[0], indent=2, ensure_ascii=False)
    print(preview[:600])


# Step 4 (capstone): MSI EdgeXpert shopping results, with links
QUERY = "MSI EdgeXpert"

# URL-encoded forms of QUERY, so the search URLs below follow QUERY instead of
# repeating the term by hand. The storefronts' URLs use two space encodings.
_QUERY_PLUS = quote_plus(QUERY)        # spaces -> "+"    (Amazon, Lazada)
_QUERY_PCT = quote(QUERY, safe="")     # spaces -> "%20"  (Shopee)

# One row per storefront. `patterns` finds bare prices (for a page price range);
# `item_pattern` (optional) captures name + price + product link together via
# named groups. Each site buries data differently, so this lives in the config
# and the scraping loop stays generic. rate = THB per one unit of the currency
# the storefront's prices are quoted in (1.0 for sites already priced in baht).
SITES = [
    {"name": "Amazon", "rate": 36.0,
     "url": f"https://www.amazon.com/s?k={_QUERY_PLUS}",
     "patterns": [r'a-price-whole[^>]*>([\d,]{3,9})', r'\$\s?([\d,]{3,9}(?:\.\d{2})?)'],
     "item_pattern": None},
    {"name": "Shopee", "rate": 1.0,
     "url": f"https://shopee.co.th/search?keyword={_QUERY_PCT}",
     "patterns": [r'฿\s?([\d,]{3,9})'],
     "item_pattern": None},
    {"name": "Lazada", "rate": 1.0,
     "url": f"https://www.lazada.co.th/catalog/?q={_QUERY_PLUS}",
     "patterns": [r'฿\s?([\d,]{3,9})'],
     # name + price + link are adjacent in Lazada's result grid:
     "item_pattern": r'href="(?P<url>//www\.lazada\.co\.th/products/[^"]+)"\s+'
                     r'title="(?P<name>[^"]+)".{0,1500}?'
                     r'<span class="ooOxS">฿(?P<price>[\d,]+(?:\.\d{2})?)'},
]


def prices_in(html: str, patterns: list) -> list:
    """Gather the numeric value of every price the given patterns capture.

    Args:
        html: Page source to search.
        patterns: Regex strings whose match text is the price.

    Returns:
        Floats in pattern order, then match order. Commas are removed before
        conversion; captures that still aren't valid numbers are dropped.
    """
    candidates = [text for pattern in patterns for text in re.findall(pattern, html)]
    amounts = []
    for text in candidates:
        try:
            amount = float(text.replace(",", ""))
        except ValueError:
            continue                        # e.g. a run of commas with no digits
        amounts.append(amount)
    return amounts


def detail_items(html: str, site: dict, limit: int = 8) -> list:
    """Top product results as {name, price, url}.

    Uses the site's grid pattern when given (name + price + link together);
    otherwise falls back to JSON-LD products (name + link, price if present).

    Args:
        html: Page source to search.
        site: Storefront config; ``item_pattern`` (optional) is a regex with
            named groups ``url``, ``name`` and ``price``.
        limit: Stop after collecting this many grid-pattern items; the JSON-LD
            fallback is sliced to the same length.

    Returns:
        A list of ``{"name": str, "price": float | None, "url": str}`` dicts,
        de-duplicated by URL. ``price`` is None when it cannot be parsed.

    Raises:
        KeyError: If ``item_pattern`` is set but defines no ``url`` group.
    """
    # Function-scope import: the `html` parameter hides the module name here.
    from html import unescape

    pat = site.get("item_pattern")
    if pat:
        items, seen = [], set()
        for m in re.finditer(pat, html, re.DOTALL):
            g = m.groupdict()
            # Captures are raw attribute text, so entities such as "&amp;" are
            # still encoded; decode them so names read correctly and links
            # keep working query strings.
            url = unescape(g["url"] or "")
            if not url:
                continue                    # optional group did not participate
            if url.startswith("//"):
                url = "https:" + url        # protocol-relative -> absolute
            if url in seen:
                continue
            seen.add(url)
            try:
                price = float(g.get("price").replace(",", ""))
            except (ValueError, AttributeError):
                price = None                # group absent, unmatched, or not a number
            # A named group that exists but did not match yields None, not "".
            name = unescape(g.get("name") or "").strip()
            items.append({"name": name, "price": price, "url": url})
            if len(items) >= limit:
                break
        if items:
            return items
    return jsonld_products(html)[:limit]


# Text fragments a storefront's page contains when it turns a scripted hit away.
BLOCK_MARKERS = (
    "Robot Check",
    "validateCaptcha",
    "api-services-support@amazon",
    "Enter the characters you see below",
    "automated access",
)


def is_blocked(html: str) -> bool:
    """Report whether a page looks like a refusal rather than real content.

    A page counts as blocked when it is shorter than 5000 characters or
    contains any of the BLOCK_MARKERS fragments (case-sensitive).
    """
    if len(html) < 5000:
        return True
    for marker in BLOCK_MARKERS:
        if marker in html:
            return True
    return False


def fetch(browser, url: str, tries: int = 3) -> str:
    """Load a URL, retrying past a transient bot-block with growing back-off.

    This catches LIGHT, transient blocks (a fresh-ish IP that gets challenged
    once). It will NOT clear a deep rate-limit — hit Amazon enough times from one
    IP and it puts you in a cooldown that no amount of back-off fixes in seconds.
    For anything you actually depend on, use a storefront's official API instead.

    Args:
        browser: A launched browser; a fresh page is opened for every attempt.
        url: Address to load.
        tries: Maximum number of attempts.

    Returns:
        The first page HTML that passes is_blocked(). If no attempt passes,
        the HTML of the last attempt that produced any (possibly a block page),
        or "" when every attempt raised. Callers should re-check is_blocked().
    """
    html = ""
    for i in range(tries):
        page = browser.new_page()
        try:
            page.goto(url, timeout=60000, wait_until="domcontentloaded")
            page.wait_for_timeout(6000)        # search grids render late
            html = page.content()
            if not is_blocked(html):
                return html
            if i < tries - 1:
                page.wait_for_timeout(6000 * (i + 1))   # 6s, then 12s back-off
        except Exception as e:
            # Timeout/nav error: still best-effort, so just retry. Report the
            # cause on stderr, because an empty result is otherwise
            # indistinguishable from a bot-block to the caller.
            print(f"fetch attempt {i + 1}/{tries} failed for {url}: {e}",
                  file=sys.stderr)
        finally:
            page.close()
    return html                                # still blocked; caller checks is_blocked()


def scrape(browser, site: dict) -> dict:
    """Load one storefront's search page; return its product results + status.

    Every price in the result is converted to THB with the site's ``rate``
    (THB per unit; 1.0 when absent), so callers can label all storefronts with
    the same currency sign.

    Args:
        browser: A launched browser, passed through to fetch().
        site: One SITES row (``name``, ``url``, ``patterns``, optional ``rate``
            and ``item_pattern``).

    Returns:
        ``{"name", "items", "prices", "blocked", "note"}`` where ``items`` are
        {name, price, url} dicts, ``prices`` are the bare page prices, and
        ``blocked`` says whether the fetched page looks like a refusal.
    """
    html = fetch(browser, site["url"])
    rate = site.get("rate", 1.0)

    items = []
    for it in detail_items(html, site):
        if it.get("price") is not None:
            # Assumes item prices are quoted in the same currency as the
            # site's bare page prices. Copy the dict; don't mutate the original.
            it = {**it, "price": it["price"] * rate}
        items.append(it)

    return {"name": site["name"],
            "items": items,
            "prices": [p * rate for p in prices_in(html, site["patterns"])],
            "blocked": is_blocked(html),
            "note": ""}


def compare():
    """Scrape every storefront in SITES and print the results with links.

    For each storefront this prints one of: a "blocked" notice, a "no product
    details" notice, or up to six results (price, name, link) followed by the
    cheapest priced result. A fixed explanatory footer is printed at the end.
    """
    browser = launch()
    try:
        results = []
        for site in SITES:
            results.append(scrape(browser, site))
    finally:
        browser.close()

    print(f"\nMSI EdgeXpert — shopping results ('{QUERY}')\n" + "=" * 60)
    for result in results:
        store = result["name"]
        if result["blocked"]:
            print(f"\n{store} — blocked even after retries: an IP rate-limit from too many")
            print(f"{'':>9}recent runs. It clears on its own — wait a while, or use the API.")
        elif not result["items"]:
            print(f"\n{store} — no product details in the page HTML "
                  f"(results load via API after render)")
        else:
            listings = result["items"]
            page_prices = result["prices"]
            # Range of all bare prices seen on the page, when any were found.
            span = (f"  ·  page prices ฿{min(page_prices):,.0f}–฿{max(page_prices):,.0f}"
                    if page_prices else "")
            top = listings[:6]
            print(f"\n{store} — {len(listings)}+ result(s){span}; top {len(top)}:")
            for item in top:
                label = f"฿{item['price']:,.0f}" if item.get("price") else "฿—"
                print(f"  {label:>11}  {item['name'][:52]}")
                print(f"  {'':>11}  {item['url']}")
            with_price = [item for item in listings if item.get("price")]
            if with_price:
                best = min(with_price, key=lambda item: item["price"])
                print(f"  → cheapest listed: ฿{best['price']:,.0f}  {best['url']}")

    footer = (
        "\nThis is a demonstration of scraping, blocks and all. When a storefront",
        "offers an official API (e.g. Amazon's Product Advertising API), prefer it —",
        "it's stable and won't rate-limit you like this. But APIs often need approval,",
        "cost money, or don't exist for the site you care about; that's exactly where",
        "a stealth browser like CloakBrowser earns its place. API when you can,",
        "scrape when you can't.",
        "\nResults are a mix and shift run to run: the real MSI EdgeXpert (a rebadged",
        "NVIDIA DGX Spark) shows up on Lazada when in stock, next to related GPUs.",
        "Open a link to confirm the actual product and its current price.",
    )
    for line in footer:
        print(line)


# Command-line step name -> the function that runs that part of the tutorial.
STEPS = {"minimal": minimal, "multipage": multipage, "jsonld": jsonld, "compare": compare}


def main():
    """Run the tutorial step named by the first command-line argument.

    With no argument the "compare" step runs.

    Returns:
        A process exit status: 0 after running a step, 2 when the step name is
        not in STEPS (so shells and scripts can detect the typo).
    """
    step = sys.argv[1] if len(sys.argv) > 1 else "compare"
    if step not in STEPS:
        print(f"Unknown step '{step}'. Choose one of: {', '.join(STEPS)}")
        return 2
    STEPS[step]()
    return 0


if __name__ == "__main__":
    sys.exit(main())