from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from fastapi.responses import JSONResponse
import cohere
import os
import logging
from dotenv import load_dotenv
import json
import re
import subprocess
import sys

# Custom modules for scraping/parsing
from funder_scraper import fetch_page_html, extract_text_from_html
from funder_parser import parse_funding_text

# Load environment variables early
load_dotenv()

# Initialize logging
logger = logging.getLogger("uvicorn.error")
logging.basicConfig(level=logging.INFO)

# Initialize FastAPI app
app = FastAPI(title="QAAPT Funders Microservice", version="1.0.0")

# Validate Cohere API key
COHERE_API_KEY = os.getenv("COHERE_API_KEY")
if not COHERE_API_KEY:
    raise RuntimeError("❌ COHERE_API_KEY is not set in environment.")

# Initialize Cohere client
co = cohere.Client(COHERE_API_KEY)
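# NOTE: cohere.Client is the v1 SDK client; generate_sql() below uses the
# legacy co.generate() text-generation endpoint rather than the newer chat API.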

# ----------------- Request Models -----------------

class QueryRequest(BaseModel):
    """Natural-language-to-SQL request: the dataset name, its column names, and the user's question."""
    variables: list[str]
    datasetName: str
    query: str

class FundingRequest(BaseModel):
    """Funding-page input: provide either a URL to fetch or raw HTML (at least one is required)."""
    url: str | None = None
    html: str | None = None

# ----------------- SQL Generation Endpoint -----------------

@app.post("/api/sql/generate", summary="Generate SQL from Natural Language")
async def generate_sql(request: QueryRequest):
    """
    Convert natural language query into MySQL SELECT statement using Cohere.
    """
    prompt = f"""
You are a data analyst. Translate the following natural language query into a valid MySQL SELECT statement:
Dataset: {request.datasetName}
Columns: {', '.join(request.variables)}
User Query: {request.query}
Provide only the SQL SELECT query without any explanation or code block formatting.
""".strip()

    try:
        response = co.generate(
            model="command-xlarge-nightly",
            prompt=prompt,
            max_tokens=150,
            temperature=0.2,
            stop_sequences=[";"]
        )

        sql_text = response.generations[0].text.strip()

        if not sql_text.lower().startswith("select"):
            logger.warning(f"Invalid SQL generated: {sql_text}")
            raise ValueError("Generated SQL does not start with SELECT.")

        return {"sql": sql_text}

    except Exception:
        logger.exception("❌ Failed to generate SQL")
        raise HTTPException(status_code=500, detail="Failed to generate SQL from query.")
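
# Example call (illustrative payload; assumes the service is running locally on port 8000):
#   curl -X POST http://localhost:8000/api/sql/generate \
#        -H "Content-Type: application/json" \
#        -d '{"variables": ["country", "gdp"], "datasetName": "economies", "query": "top 5 countries by gdp"}'
# On success the response has the shape {"sql": "SELECT ..."}.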

# ----------------- Funding Parsing Endpoint -----------------

@app.post("/api/funding/parse", summary="Extract structured funding info")
async def parse_funding(request: FundingRequest):
    """
    Accepts a funding webpage (URL or raw HTML), extracts plain text,
    and uses LLM to parse key funding opportunity metadata.
    """
    if not (request.url or request.html):
        raise HTTPException(status_code=400, detail="Either 'url' or 'html' must be provided.")

    try:
        html = request.html or fetch_page_html(request.url)
        if not html:
            raise ValueError("No HTML content found from provided source.")

        plain_text = extract_text_from_html(html)
        if not plain_text or len(plain_text.strip()) < 50:
            raise ValueError("Extracted text appears too short or empty.")

        parsed_result = parse_funding_text(plain_text, co)
        logger.info("Raw LLM response received.")

        # Sanitize LLM output (remove markdown, code fences, etc.)
        cleaned_text = re.sub(r"^```(json)?|```$", "", parsed_result.strip(), flags=re.MULTILINE)

        try:
            parsed_json = json.loads(cleaned_text)
        except Exception as json_err:
            logger.error(f"⚠️ JSON parse error: {json_err}")
            raise HTTPException(status_code=500, detail="Failed to parse funding info JSON.")

        return {"parsed": parsed_json}

    except HTTPException:
        # Re-raise explicit HTTP errors (e.g. the JSON parse failure above) with their original status codes.
        raise
    except Exception as e:
        logger.exception("❌ Funding parsing failed")
        raise HTTPException(status_code=500, detail=f"Funding parsing failed: {e}")
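
# Example call (illustrative; the keys inside "parsed" depend on the prompt
# defined in funder_parser.parse_funding_text):
#   curl -X POST http://localhost:8000/api/funding/parse \
#        -H "Content-Type: application/json" \
#        -d '{"url": "https://example.org/funding-call"}'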

@app.post("/api/funding/scrape")
async def run_scrapy_spider():
    cwd = os.path.join(os.getcwd(), "funderspider", "funderspider")
    output_file = os.path.join(cwd, "funding_data.json")

    # Invoke Scrapy through the current interpreter; ":jsonlines" selects the
    # JSON Lines feed format ("-o file:format" syntax, Scrapy >= 2.1).
    cmd = [
        sys.executable,
        "-m", "scrapy",
        "crawl", "funder_spider",
        "-o", "funding_data.json:jsonlines",
    ]

    result = subprocess.run(cmd, cwd=cwd, capture_output=True, text=True)

    if result.returncode != 0:
        return JSONResponse(
            status_code=500,
            content={
                "error": "Scrapy spider failed",
                "exit_code": result.returncode,
                "stderr": result.stderr,
                "stdout": result.stdout
            }
        )

    if not os.path.exists(output_file):
        return JSONResponse(
            status_code=500,
            content={
                "error": "Scrapy output file not found",
                "message": f"{output_file} does not exist."
            }
        )

    try:
        data = []
        with open(output_file, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:  # skip any blank lines in the JSON Lines output
                    data.append(json.loads(line))
    except Exception as e:
        return JSONResponse(
            status_code=500,
            content={
                "error": "Failed to read Scrapy JSON output",
                "details": str(e)
            }
        )
    finally:
        if os.path.exists(output_file):
            os.remove(output_file)

    return {"results": data}

# ----------------- Development Entry Point -----------------

if __name__ == "__main__":
    import uvicorn
    uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
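
# Equivalent CLI invocation:
#   uvicorn main:app --host 0.0.0.0 --port 8000 --reload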
