import io
import logging
import os
import tempfile

import pytesseract
import requests
from pdf2image import convert_from_path
from PIL import Image
from PyPDF2 import PdfReader
from rest_framework.decorators import api_view
from rest_framework.response import Response


def extract_text_from_pdf(pdf_path):
    """Extract the embedded text from a normal (non-scanned) PDF.

    This is a best-effort operation: if the document cannot be opened or a
    page fails to extract, the failure is logged and whatever text was
    collected up to that point is returned instead of raising.

    Args:
        pdf_path: Location of the PDF; passed straight to ``PdfReader``.

    Returns:
        The text of all successfully read pages concatenated in page order,
        with leading/trailing whitespace stripped. Empty string when nothing
        could be extracted.
    """
    page_texts = []
    try:
        reader = PdfReader(pdf_path)
        for page in reader.pages:
            # extract_text() may yield None/"" for pages without a text layer.
            page_texts.append(page.extract_text() or "")
    except Exception:
        # Deliberately broad: callers treat an empty/short result as "no text
        # layer", so a parse failure must not propagate. Log it so the
        # failure is still visible instead of being silently discarded.
        logging.getLogger(__name__).warning(
            "Text extraction failed for %s", pdf_path, exc_info=True
        )
    return "".join(page_texts).strip()


def extract_text_with_ocr(pdf_path):
    """Run OCR over every page of a PDF and return the combined text.

    The PDF is converted to one image per page at 300 DPI, each image is
    passed to ``pytesseract`` using the English language data, and the
    per-page results are concatenated (each followed by a newline).

    Args:
        pdf_path: Location of the PDF handed to ``convert_from_path``.

    Returns:
        The recognised text with leading/trailing whitespace stripped.
    """
    page_images = convert_from_path(pdf_path, dpi=300)
    recognised_pages = [
        pytesseract.image_to_string(page_image, lang='eng') + "\n"
        for page_image in page_images
    ]
    return "".join(recognised_pages).strip()


@api_view(["POST"])
def extract_from_file(request):
    """Accept an uploaded PDF file and return its extracted text.

    The upload is read from the ``file`` field of ``request.FILES`` and
    spooled to a temporary ``.pdf`` file so the path-based extractors can
    read it. The temporary file is always removed before returning, even if
    extraction raises.

    Returns:
        ``{"text": ...}`` on success, or ``{"error": "No file provided"}``
        with status 400 when the ``file`` field is missing.

    Exceptions raised by the OCR fallback are not caught here and propagate
    to the framework.
    """
    uploaded = request.FILES.get("file")
    if not uploaded:
        return Response({"error": "No file provided"}, status=400)

    # delete=False because the extractors reopen the file by path after it
    # has been closed; cleanup is therefore done explicitly in `finally`.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    tmp_path = tmp.name
    try:
        with tmp:
            for chunk in uploaded.chunks():
                tmp.write(chunk)

        text = extract_text_from_pdf(tmp_path)
        # Fewer than 50 characters of embedded text: fall back to OCR
        # (presumably a scanned document with no usable text layer).
        if len(text) < 50:
            text = extract_text_with_ocr(tmp_path)
    finally:
        try:
            os.unlink(tmp_path)
        except OSError:
            logging.getLogger(__name__).warning(
                "Could not remove temporary file %s", tmp_path, exc_info=True
            )

    return Response({"text": text})


@api_view(["POST"])
def extract_from_url(request):
    """Accept a PDF URL, download the document and return its extracted text.

    The URL is read from the ``url`` key of ``request.data``. The response
    body is written to a temporary ``.pdf`` file so the path-based extractors
    can read it; the temporary file is always removed before returning.

    Returns:
        ``{"text": ...}`` on success;
        ``{"error": "No URL provided"}`` with status 400 when ``url`` is
        missing; ``{"error": "Unable to fetch file"}`` with status 400 when
        the download fails (network error, timeout, invalid URL, or a
        non-200 response).

    Exceptions raised by the OCR fallback are not caught here and propagate
    to the framework.
    """
    url = request.data.get("url")
    if not url:
        return Response({"error": "No URL provided"}, status=400)

    # NOTE(security): `url` is caller-supplied and fetched from the server
    # (SSRF risk). Consider restricting allowed hosts/schemes and capping the
    # download size before exposing this endpoint to untrusted clients.
    fetch_timeout_seconds = 30  # without a timeout requests can block forever
    try:
        r = requests.get(url, timeout=fetch_timeout_seconds)
    except requests.RequestException:
        logging.getLogger(__name__).warning(
            "Failed to download %s", url, exc_info=True
        )
        return Response({"error": "Unable to fetch file"}, status=400)
    if r.status_code != 200:
        return Response({"error": "Unable to fetch file"}, status=400)

    # delete=False because the extractors reopen the file by path after it
    # has been closed; cleanup is therefore done explicitly in `finally`.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    tmp_path = tmp.name
    try:
        with tmp:
            tmp.write(r.content)

        text = extract_text_from_pdf(tmp_path)
        # Fewer than 50 characters of embedded text: fall back to OCR
        # (presumably a scanned document with no usable text layer).
        if len(text) < 50:
            text = extract_text_with_ocr(tmp_path)
    finally:
        try:
            os.unlink(tmp_path)
        except OSError:
            logging.getLogger(__name__).warning(
                "Could not remove temporary file %s", tmp_path, exc_info=True
            )

    return Response({"text": text})
