// Package scrape fetches and parses external web pages with SSRF guards,
// size + timeout caps, and block-page detection. Returns markdown + metadata
// suited for an AI agent's context window.
package scrape

import (
	"fmt"
	"io"
	"net"
	"net/http"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
	"webby-builder/internal/security/urlguard"
)

// validateURL and safeDialer are package-level hooks so tests in the same
// package can swap in permissive versions when hitting httptest.Server (which
// listens on loopback). Production callers always get the real urlguard.
//
// validateURL is consulted by Fetch before any request is built; a non-nil
// error aborts the fetch with an "ssrf_blocked" FetchError.
//
// safeDialer is invoked once per NewClient call, and the returned dialer's
// DialContext becomes the transport's dial function. Swapping the hook
// therefore only affects Clients constructed afterwards.
var (
	validateURL = urlguard.Validate
	safeDialer  = func() *net.Dialer { return urlguard.SafeDialer() }
)

// Config tunes the HTTP scraper. Zero values are replaced with defaults by
// NewClient.
//
// TimeoutSeconds bounds a fetch, in seconds; NewClient applies it both as the
// transport's response-header timeout and as the overall http.Client timeout.
// Values <= 0 become 15.
//
// MaxResponseSizeMB is the largest response body Fetch accepts, in units of
// 1024*1024 bytes. Values <= 0 become 5.
//
// UserAgent is sent as the User-Agent request header. An empty string is
// replaced with a WebbyAgent identifier.
type Config struct {
	TimeoutSeconds    int
	MaxResponseSizeMB int
	UserAgent         string
}

// Client fetches and parses external web pages. Construct it with NewClient,
// which fills in Config defaults and wires up the HTTP client; the zero value
// has a nil http client and is not usable for Fetch.
//
// cfg holds the effective configuration after defaults were applied, and http
// is the underlying client used for every request made by Fetch.
type Client struct {
	cfg  Config
	http *http.Client
}

// NewClient builds a scraper Client whose transport dials through the
// package's SSRF-safe dialer hook.
//
// Zero or negative Config fields fall back to defaults: a 15 second timeout,
// a 5 MB response cap, and a WebbyAgent User-Agent string. The timeout is
// applied twice: as the transport's ResponseHeaderTimeout and as the overall
// http.Client Timeout, which also bounds reading the response body.
//
// Idle keep-alive connections are capped and expire after 90 seconds. Without
// these limits a custom http.Transport keeps idle connections to every host it
// has ever contacted for as long as the remote side leaves them open, which
// adds up for a scraper that visits many distinct sites.
func NewClient(cfg Config) *Client {
	if cfg.TimeoutSeconds <= 0 {
		cfg.TimeoutSeconds = 15
	}
	if cfg.MaxResponseSizeMB <= 0 {
		cfg.MaxResponseSizeMB = 5
	}
	if cfg.UserAgent == "" {
		cfg.UserAgent = "Mozilla/5.0 (compatible; WebbyAgent/1.0; +https://titansys.dev)"
	}
	timeout := time.Duration(cfg.TimeoutSeconds) * time.Second
	// MaxIdleConns and IdleConnTimeout mirror http.DefaultTransport; their zero
	// values would mean "no limit".
	tr := &http.Transport{
		DialContext:           safeDialer().DialContext,
		TLSHandshakeTimeout:   10 * time.Second,
		ResponseHeaderTimeout: timeout,
		MaxIdleConns:          100,
		IdleConnTimeout:       90 * time.Second,
	}
	return &Client{
		cfg: cfg,
		http: &http.Client{
			Transport: tr,
			Timeout:   timeout,
		},
	}
}

// FetchResult is the success payload returned by Fetch.
//
// URL is the URL exactly as passed to Fetch (not a post-redirect location) and
// Status is the HTTP status code of the response; both are left at their zero
// values by ExtractFromHTML. Title is the whitespace-trimmed text of the first
// <title> element, Markdown is the page converted by htmlToMarkdown, and
// Metadata is whatever extractMetadata derives from the parsed document.
type FetchResult struct {
	URL      string   `json:"url"`
	Status   int      `json:"status"`
	Title    string   `json:"title"`
	Markdown string   `json:"markdown"`
	Metadata Metadata `json:"metadata"`
}

// FetchError is the structured failure type produced by the scraper. Code is a
// stable machine-readable identifier (for example "timeout" or
// "blocked_by_site"), Message carries the human-readable detail, Retryable
// signals that trying again may succeed, and FallbackHint optionally suggests
// an alternative strategy to the caller.
type FetchError struct {
	Code         string `json:"error_code"`
	Message      string `json:"error"`
	Retryable    bool   `json:"retryable"`
	FallbackHint string `json:"fallback_hint,omitempty"`
}

// Error implements the error interface, rendering the failure as
// "<code>: <message>".
func (e *FetchError) Error() string {
	code, detail := e.Code, e.Message
	return code + ": " + detail
}

// Fetch retrieves rawURL with a GET request and converts the response into
// markdown plus metadata.
//
// The URL is first checked by the SSRF guard. The body is then read up to the
// configured size cap, screened for bot-block pages, and parsed as HTML.
// Responses with other non-2xx statuses (for example 404) are not errors: they
// are returned as a FetchResult with Status set.
//
// Every failure is returned as a *FetchError with one of these codes:
//   - "ssrf_blocked": the URL guard rejected the URL, or the dialer refused
//     the resolved address.
//   - "invalid_url": the request could not be constructed.
//   - "timeout" (retryable): the request or body read timed out.
//   - "network_error" (retryable): any other transport failure.
//   - "response_too_large": the body exceeded MaxResponseSizeMB.
//   - "blocked_by_site": the response looks like a bot-block page; the
//     FallbackHint points at the browser tier.
//   - "parse_error": the body could not be parsed as HTML.
func (c *Client) Fetch(rawURL string) (*FetchResult, error) {
	if err := validateURL(rawURL); err != nil {
		if ge, ok := err.(*urlguard.GuardError); ok {
			return nil, &FetchError{Code: "ssrf_blocked", Message: ge.Message}
		}
		return nil, &FetchError{Code: "ssrf_blocked", Message: err.Error()}
	}
	req, err := http.NewRequest("GET", rawURL, nil)
	if err != nil {
		return nil, &FetchError{Code: "invalid_url", Message: err.Error()}
	}
	req.Header.Set("User-Agent", c.cfg.UserAgent)
	req.Header.Set("Accept", "text/html,application/xhtml+xml")
	resp, err := c.http.Do(req)
	if err != nil {
		return nil, classifyTransportError(err)
	}
	defer func() { _ = resp.Body.Close() }()

	// Read one byte past the cap so an oversized body can be told apart from
	// one that is exactly at the limit.
	limit := int64(c.cfg.MaxResponseSizeMB) * 1024 * 1024
	body, err := io.ReadAll(io.LimitReader(resp.Body, limit+1))
	if err != nil {
		// The client timeout also covers the body read, so a failure here is
		// classified the same way as a failure to send the request.
		return nil, classifyTransportError(err)
	}
	if int64(len(body)) > limit {
		return nil, &FetchError{
			Code:    "response_too_large",
			Message: fmt.Sprintf("response exceeded %d MB cap", c.cfg.MaxResponseSizeMB),
		}
	}
	bodyStr := string(body)
	if blocked, reason := detectBlock(resp.StatusCode, bodyStr); blocked {
		return nil, &FetchError{
			Code:         "blocked_by_site",
			Message:      "site appears to block plain HTTP fetching: " + reason,
			FallbackHint: "use webBrowserOpen for JS-rendered or anti-bot-protected sites",
		}
	}
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(bodyStr))
	if err != nil {
		return nil, &FetchError{Code: "parse_error", Message: err.Error()}
	}
	title := strings.TrimSpace(doc.Find("title").First().Text())
	// Markdown conversion is best-effort: a conversion error is deliberately
	// ignored and whatever htmlToMarkdown returned is used as-is.
	md, _ := htmlToMarkdown(bodyStr)
	return &FetchResult{
		URL:      rawURL,
		Status:   resp.StatusCode,
		Title:    title,
		Markdown: md,
		Metadata: extractMetadata(doc),
	}, nil
}

// classifyTransportError maps an error from sending the request or reading the
// response body onto a FetchError. The returned Message is always the full
// text of err.
func classifyTransportError(err error) *FetchError {
	msg := err.Error()

	// http.Client.Do wraps every failure in a *url.Error whose text embeds the
	// request URL. Keyword checks therefore run against the wrapped cause only;
	// otherwise a URL such as ".../docs/timeout" that fails with "connection
	// refused" would be reported as a timeout.
	cause := msg
	if w, ok := err.(interface{ Unwrap() error }); ok {
		if inner := w.Unwrap(); inner != nil {
			cause = inner.Error()
		}
	}

	// Prefer the typed signal; the keyword scan is kept as a fallback for
	// errors that do not implement net.Error.
	if ne, ok := err.(net.Error); ok && ne.Timeout() {
		return &FetchError{Code: "timeout", Message: msg, Retryable: true}
	}
	for _, kw := range []string{"Timeout", "deadline exceeded", "timeout"} {
		if strings.Contains(cause, kw) {
			return &FetchError{Code: "timeout", Message: msg, Retryable: true}
		}
	}
	// NOTE(review): "private_ip" is presumably the marker the urlguard dialer
	// puts in its refusal error — confirm against urlguard.
	if strings.Contains(cause, "private_ip") {
		return &FetchError{Code: "ssrf_blocked", Message: msg}
	}
	return &FetchError{Code: "network_error", Message: msg, Retryable: true}
}

// detectBlock reports whether a response looks like an anti-bot block page and,
// if so, why.
//
// Only statuses 403, 429 and 503 are candidates. For those, the body is
// searched case-insensitively for known challenge-page phrases; the first
// phrase found (in list order) yields (true, "marker:<phrase>"). Without a
// phrase, 403 and 429 still count as blocks and yield (true, "status <code>"),
// while a bare 503 is assumed to be a genuine outage — retrying it through a
// browser session would only waste resources — and yields (false, "").
func detectBlock(status int, body string) (bool, string) {
	switch status {
	case 403, 429, 503:
		// candidate status; inspect the body below
	default:
		return false, ""
	}

	haystack := strings.ToLower(body)
	for _, phrase := range [...]string{
		"cloudflare",
		"just a moment",
		"captcha",
		"attention required",
		"are you human",
		"access denied",
	} {
		if strings.Contains(haystack, phrase) {
			return true, "marker:" + phrase
		}
	}

	if status != 503 {
		return true, fmt.Sprintf("status %d", status)
	}
	return false, "" // service outage, not a block
}

// ExtractFromHTML applies Fetch's markdown and metadata extraction to HTML that
// was obtained some other way (e.g., from a chromedp session). The URL and
// Status fields of the result are left at their zero values. If the HTML cannot
// be parsed, the raw input is returned unchanged in the Markdown field.
func ExtractFromHTML(html string) *FetchResult {
	parsed, parseErr := goquery.NewDocumentFromReader(strings.NewReader(html))
	if parseErr != nil {
		return &FetchResult{Markdown: html}
	}

	result := new(FetchResult)
	result.Title = strings.TrimSpace(parsed.Find("title").First().Text())
	// Conversion is best-effort; its error is intentionally discarded.
	result.Markdown, _ = htmlToMarkdown(html)
	result.Metadata = extractMetadata(parsed)
	return result
}
