# ==============================================================================
# Pathology Report Processing Pipeline
#
# Extracts text from pathology reports (PDFs/images), then uses a cascade of
# AI models to structure the data into JSON.
#
# Tries providers in this order:
# 1. OpenAI (API key)
# 2. Google AI Studio (API key)
# 3. Google Vertex AI (service-account credentials)
# ==============================================================================

import os
import json
import re
import fitz  # PyMuPDF
import vertexai
import openai
import google.generativeai as genai
from google.cloud import vision
from vertexai.generative_models import GenerativeModel

# --- Configuration ---
#
# SECURITY: API keys are read from the environment and must never be committed
# to source control. Any key that was previously stored in this file should be
# treated as leaked and rotated with the provider.

# -- Google Cloud (for Vision OCR & Vertex AI)
# Needs a service account JSON file. A GOOGLE_APPLICATION_CREDENTIALS value that
# is already exported takes precedence over the bundled default path.
os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', "APIs/modules/healiomenew-94ee24f33f41.json")
GCP_PROJECT_ID = os.environ.get("GCP_PROJECT_ID", "healiomenew")
GCP_LOCATION = os.environ.get("GCP_LOCATION", "us-central1")

# -- Google AI Studio (Fallback 1)
# Get a key from AI Studio: https://makersuite.google.com/app/apikey
# and export it as GEMINI_API_KEY.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

# -- OpenAI (tried first)
# Export the key as OPENAI_API_KEY.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# -- Models
VERTEX_AI_MODEL = "models/gemini-2.5-pro"
GEMINI_MODEL = 'models/gemini-2.5-pro'
OPENAI_MODEL = "gpt-4o"

# --- LLM Clients ---

if GEMINI_API_KEY:
    try:
        genai.configure(api_key=GEMINI_API_KEY)
    except Exception as e:
        print(f"[CONFIG WARNING] Gemini API Key (Fallback 1) not configured: {e}")
else:
    print("[CONFIG WARNING] Gemini API Key (Fallback 1) not configured: GEMINI_API_KEY is not set")

# openai_client stays None when no key is available; the pipeline functions
# check it before calling OpenAI and move on to the next provider.
openai_client = None
if OPENAI_API_KEY:
    try:
        openai_client = openai.OpenAI(api_key=OPENAI_API_KEY)
    except Exception as e:
        print(f"[CONFIG ERROR] OpenAI client (Primary) not configured: {e}")
        openai_client = None
else:
    print("[CONFIG ERROR] OpenAI client (Primary) not configured: OPENAI_API_KEY is not set")


# ==============================================================================
# STAGE 1: PDF/Image to Raw Text (OCR)
# ==============================================================================

def convert_pdf_to_images(pdf_path, images_dir):
    """Render every page of a PDF to a 300-DPI JPG inside ``images_dir``.

    Pages that fail to render are logged and skipped; the remaining pages are
    still returned.

    Args:
        pdf_path: Path of the PDF file to convert.
        images_dir: Directory that receives ``page_<n>.jpg`` files. It is
            created if it does not exist.

    Returns:
        List of image paths, in page order, for the pages that were rendered.

    Raises:
        Exception: If the file is missing or empty, has no pages, cannot be
            opened, or no page could be rendered. The original error is
            attached as ``__cause__``.
    """
    os.makedirs(images_dir, exist_ok=True)
    image_paths = []
    try:
        # Check if PDF file exists and is readable
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        file_size = os.path.getsize(pdf_path)
        if file_size == 0:
            raise ValueError("PDF file is empty")

        print(f"[INFO] Opening PDF: {pdf_path} (size: {file_size} bytes)")

        doc = fitz.open(pdf_path)
        try:
            page_count = doc.page_count

            if page_count == 0:
                raise ValueError("PDF has no pages")

            print(f"[INFO] Converting {page_count} pages to images...")

            for i, page in enumerate(doc, start=1):
                try:
                    pix = page.get_pixmap(dpi=300)
                    image_path = os.path.join(images_dir, f"page_{i}.jpg")
                    pix.save(image_path)
                    image_paths.append(image_path)
                    print(f"[INFO] Converted page {i}/{page_count}")
                except Exception as page_error:
                    # Best effort: one unreadable page must not abort the rest.
                    print(f"[ERROR] Failed to convert page {i}: {page_error}")
                    continue
        finally:
            # Always release the document handle, including on error paths.
            doc.close()

        print(f"[INFO] Saved {len(image_paths)} pages as images.")

        if not image_paths:
            raise ValueError("No pages could be converted to images")

    except ImportError as ie:
        print(f"[ERROR] PyMuPDF (fitz) not installed: {ie}")
        raise Exception("PyMuPDF library not available on server. Please install: pip install PyMuPDF") from ie
    except Exception as e:
        print(f"[ERROR] PDF to image conversion failed: {e}")
        import traceback
        traceback.print_exc()
        # Keep the generic exception type callers already handle, but chain the cause.
        raise Exception(f"PDF conversion error: {str(e)}") from e

    return image_paths


def run_ocr_on_images(image_paths):
    """Run Google Vision text detection on each image and return the combined text.

    Newlines inside a page are replaced by spaces and pages are joined with a
    single space. A page whose OCR fails is logged and skipped.
    """
    ocr_client = vision.ImageAnnotatorClient()
    page_chunks = []
    print("[INFO] Starting OCR...")
    for page_no, path in enumerate(image_paths, start=1):
        try:
            with open(path, "rb") as handle:
                raw_bytes = handle.read()
            response = ocr_client.text_detection(image=vision.Image(content=raw_bytes))

            if response.error.message:
                raise Exception(f"Vision API Error: {response.error.message}")

            extracted = response.full_text_annotation.text or ""
            page_chunks.append(extracted.replace("\n", " "))
            print(f"[INFO] OCR complete for page {page_no}/{len(image_paths)}.")
        except Exception as e:
            # Skip this page and keep going with the rest.
            print(f"[ERROR] OCR failed on {path}: {e}")
    return " ".join(page_chunks).strip()


# ==============================================================================
# STAGE 2: Raw Text to Structured JSON (3-Tier Fallback)
# ==============================================================================

def get_llm_prompt(raw_text):
    """
    Builds the prompt that asks the LLM to turn OCR text into structured JSON.

    The prompt spells out the expected JSON shape (a "patient" object and a flat
    "tests" array whose items carry "reference_range" and "standard_label"),
    the extraction rules, and finally embeds the OCR text itself.

    Args:
        raw_text: OCR text of the report; interpolated verbatim into the prompt.

    Returns:
        The complete prompt string.
    """
    # NOTE(review): the rule list below numbers two different rules "3.", and the
    # qualitative-test guidance differs between rule 3 (label="Expected value")
    # and rule 4 (label="Negative"). Left untouched here because the prompt text
    # is runtime behavior; worth reconciling deliberately.
    # Doubled braces ({{ }}) are literal braces inside the f-string.
    return f"""
You are a medical laboratory data parser. Your goal is to extract structured laboratory data from OCR text and output a clean, standardized JSON.

⚠️ **CRITICAL: You MUST follow the exact JSON structure defined below. Any deviation will cause system failure.**

---

**MANDATORY JSON STRUCTURE:**

You MUST return a JSON object with EXACTLY this structure (no extra or missing fields):

{{
  "patient": {{
    "name": "string",
    "gender": "string",
    "date_of_birth": "string or null",
    "age": "string or null",
    "registration_no": "string or null",
    "collection_date": "string or null",
    "confirm_date": "string or null"
  }},
  "tests": [
    {{
      "name": "string (required)",
      "value": number_or_string_or_null,
      "unit": "string or null",
      "method": "string or null",
      "reference_range": [
        {{
          "low": number_or_null,
          "high": number_or_null,
          "label": "string (required)"
        }}
      ],
      "standard_label": "No risk" | "Moderate risk" | "High risk" | "Critical risk" | "Undefined"
    }}
  ]
}}

---

**STRICT RULES (MUST FOLLOW):**

1. **Patient Information:**
   - MUST include ALL 7 fields: name, gender, date_of_birth, age, registration_no, collection_date, confirm_date
   - Use null for missing values, never omit fields
   - gender: use "Male", "Female", or "Unknown"

2. **Tests Array:**
   - MUST be a flat array (not nested by category)
   - Each test MUST have ALL 6 fields: name, value, unit, method, reference_range, standard_label
   - Never omit any field - use null if not available

3. **Reference Range (CRITICAL):**
   - MUST be an array of objects (even if only one range)
   - Each range object MUST have exactly 3 keys: "low", "high", "label"
   - NEVER omit any of these 3 keys
   - For numeric ranges: provide low and high as numbers
   - For qualitative tests: set low=null, high=null, label="Expected value"
   - If no range available: use [{{"low": null, "high": null, "label": "Undefined"}}]

3. **Convert ALL reference ranges to numeric low/high values when possible:**
   - "70-99 mg/dL" → {{"low": 70, "high": 99, "label": "Normal"}}
   - "Less than 200" → {{"low": 0, "high": 200, "label": "Desirable"}}
   - "More than 159" → {{"low": 159, "high": 999, "label": "High"}}
   - "<100" → {{"low": 0, "high": 100, "label": "Normal"}}
   - ">40" → {{"low": 40, "high": 999, "label": "Normal"}}
   - Multiple ranges: create multiple objects in array
   - Example: "Desirable: <200, Borderline: 200-239, High: ≥240" →
     [
       {{"low": 0, "high": 200, "label": "Desirable"}},
       {{"low": 200, "high": 239, "label": "Borderline"}},
       {{"low": 240, "high": 999, "label": "High"}}
     ]

4. **For QUALITATIVE tests** (e.g., Negative/Positive):
   - Set value to the actual result string (e.g., "Negative")
   - Set reference_range with label only: [{{"low": null, "high": null, "label": "Negative"}}]

5. **Map ALL test results to standard_label based on comparison with reference_range:**
   - If result is in "Desirable", "Normal", "Optimal", "Protective", "Negative" range → **"No risk"**
   - If result is in "Borderline", "Slightly elevated", "Medium risk", "Pre-diabetic" range → **"Moderate risk"**
   - If result is in "High", "Elevated", "Dangerous", "Abnormal", "Positive" range → **"High risk"**
   - If result is in "Very high", "Critical", "Severe", "Extremely high" range → **"Critical risk"**
   - If cannot determine or no reference range → **"Undefined"**

6. **Important notes:**
   - Remove ALL irrelevant text (lab addresses, disclaimers, headers/footers, page numbers)
   - Ignore historical/previous test results - only extract the CURRENT/LATEST results
   - If a test has multiple sub-tests (e.g., CBC with WBC, RBC, etc.), list each as separate test
   - For tests with differential counts (e.g., Neutrophil %, Lymphocyte %), list each separately

7. **Final output format MUST be valid JSON:**
   {{
     "patient": {{
       "name": "...",
       "gender": "...",
       "date_of_birth": "...",
       "age": "...",
       "registration_no": "...",
       "collection_date": "...",
       "confirm_date": "..."
     }},
     "tests": [
       {{
         "name": "...",
         "value": ...,
         "unit": "...",
         "method": "...",
         "reference_range": [...],
         "standard_label": "..."
       }}
     ]
   }}

**CRITICAL:** Do NOT use markdown formatting (```json), comments, or any wrapper. Output ONLY the raw JSON object.

---

**Input OCR text:**

{raw_text}

---

Now extract and return the information in the requested JSON schema.
"""


def clean_llm_response(text):
    """Strip Markdown fences and surrounding prose from an LLM's JSON reply.

    Handles a missing reply, code fences with any language tag (```json,
    ```JSON, bare ```), and explanatory text before or after the JSON object.

    Args:
        text: Raw reply text from the model; may be None or empty.

    Returns:
        The best-effort JSON text, or "" when there is no reply. The result is
        not validated; callers still pass it to ``json.loads``.
    """
    if not text:
        return ""

    cleaned = text.strip()
    # Drop an opening fence (with optional language tag) and a closing fence.
    cleaned = re.sub(r"^```[A-Za-z0-9_-]*\s*", "", cleaned)
    cleaned = re.sub(r"\s*```$", "", cleaned).strip()

    # A top-level array is left alone; otherwise keep only the outermost
    # {...} span so chatter around the object does not break parsing.
    if not cleaned.startswith("["):
        start = cleaned.find("{")
        end = cleaned.rfind("}")
        if start != -1 and end > start:
            cleaned = cleaned[start:end + 1]

    # Remove any fence markers left over from unusual layouts.
    return cleaned.replace("```json", "").replace("```", "").strip()


def _parse_structured_json(response_text, provider):
    """Parse a provider reply and require a JSON object; raise on anything else."""
    data = json.loads(response_text)
    if not isinstance(data, dict):
        raise ValueError(
            f"{provider} returned JSON of type {type(data).__name__}, expected an object"
        )
    return data


def process_text_with_llm(raw_text):
    """
    Tries to structure text using a 3-tier fallback:
    1. OpenAI (Primary)
    2. Google AI Studio
    3. Vertex AI

    A provider only counts as successful once its reply has been parsed into a
    JSON object; a call error, unparsable reply or non-object JSON moves on to
    the next provider.

    Args:
        raw_text: OCR text of the report.

    Returns:
        The parsed JSON object as a dict, or None if every provider failed.
    """
    prompt = get_llm_prompt(raw_text)

    # --- Attempt 1: OpenAI ---
    print("\n[INFO] Attempt 1/3: Structuring with OpenAI...")
    if openai_client:
        try:
            response = openai_client.chat.completions.create(
                model=OPENAI_MODEL,
                messages=[
                    {"role": "system", "content": "You are a data extraction assistant that only returns valid JSON objects."},
                    {"role": "user", "content": prompt}
                ],
                response_format={"type": "json_object"}
            )
            structured = _parse_structured_json(response.choices[0].message.content, "OpenAI")
            # Log success only after the reply actually parsed.
            print("[SUCCESS] Got structured data from OpenAI.")
            return structured
        except Exception as e:
            print(f"[WARNING] OpenAI failed: {e}. Trying fallback...")
    else:
        print("[WARNING] OpenAI client not configured. Trying fallback...")

    # --- Attempt 2: Google AI Studio ---
    print("\n[INFO] Attempt 2/3: Structuring with Google AI Studio...")
    try:
        model = genai.GenerativeModel(GEMINI_MODEL)
        response = model.generate_content(prompt)
        structured = _parse_structured_json(clean_llm_response(response.text), "Google AI Studio")
        print("[SUCCESS] Got structured data from Google AI Studio.")
        return structured
    except Exception as e:
        print(f"[WARNING] Google AI Studio failed: {e}. Trying fallback...")

    # --- Attempt 3: Vertex AI ---
    print("\n[INFO] Attempt 3/3: Structuring with Vertex AI...")
    try:
        vertexai.init(project=GCP_PROJECT_ID, location=GCP_LOCATION)
        model = GenerativeModel(VERTEX_AI_MODEL)
        response = model.generate_content(prompt)
        structured = _parse_structured_json(clean_llm_response(response.text), "Vertex AI")
        print("[SUCCESS] Got structured data from Vertex AI.")
        return structured
    except Exception as e:
        print(f"[ERROR] All providers failed. Vertex AI also failed: {e}")
        return None


# ==============================================================================
# File I/O
# ==============================================================================

def save_to_json(data, output_path):
    """Write ``data`` to ``output_path`` as indented UTF-8 JSON.

    Failures are reported on stdout rather than raised.
    """
    try:
        with open(output_path, mode='w', encoding='utf-8') as handle:
            json.dump(data, handle, indent=4, ensure_ascii=False)
        print(f"[INFO] Data saved to {output_path}")
    except Exception as err:
        print(f"[ERROR] Failed to save JSON file at {output_path}: {err}")


# ==============================================================================
# STAGE 4: Test Range Analysis
# ==============================================================================

# Values that mean "no result" rather than a measurement.
_NOT_APPLICABLE_VALUES = ('not applicable', 'n/a', 'na', 'undefined', '-', 'none')

# Keywords looked up (as substrings) in a numeric range's label.
_NORMAL_LABEL_TERMS = ('normal', 'desirable', 'optimal', 'sufficient', 'protective', 'low risk')
_BORDERLINE_LABEL_TERMS = ('borderline', 'intermediate', 'medium', 'acceptable', 'pre-diabetic')
_HIGH_LABEL_TERMS = ('high', 'elevated', 'dangerous', 'very high', 'critical', 'severe')
_LOW_LABEL_TERMS = ('low', 'deficient', 'insufficient')

# Keywords for qualitative (text) results and labels.
_NEGATIVE_TEXT_TERMS = (
    'negative', 'normal', 'not seen', 'non-reactive', 'non reactive',
    'nonreactive', 'not detected', 'not present', 'absent',
)
_POSITIVE_TEXT_TERMS = ('abnormal', 'positive', 'reactive', 'seen', 'trace', 'present', '+')


def _to_float_or_none(bound):
    """Return a range bound as float, or None when it is missing or not numeric."""
    if bound is None:
        return None
    try:
        return float(bound)
    except (ValueError, TypeError):
        return None


def _is_negative_text(text):
    """True when text reads as a negative/expected finding (and not 'abnormal')."""
    # 'abnormal' contains 'normal', so it has to be excluded explicitly.
    return 'abnormal' not in text and any(term in text for term in _NEGATIVE_TEXT_TERMS)


def _numeric_status_from_label(label):
    """Map the (lower-cased) label of the matching numeric range to a status."""
    # These two must be tested before the 'normal' keywords: 'insufficient'
    # contains 'sufficient' and 'abnormal' contains 'normal'.
    if 'insufficient' in label:
        return 'low'
    if 'abnormal' in label:
        return 'abnormal'
    if any(term in label for term in _NORMAL_LABEL_TERMS):
        return 'normal'
    if any(term in label for term in _BORDERLINE_LABEL_TERMS):
        return 'borderline'
    if any(term in label for term in _HIGH_LABEL_TERMS):
        return 'high'
    if any(term in label for term in _LOW_LABEL_TERMS):
        return 'low'
    # Unknown label but value is in a defined range
    return 'normal'


def _classify_numeric(result_val, ranges):
    """Classify a numeric result against the ranges that have numeric bounds."""
    bounded = []
    for rng in ranges:
        low = _to_float_or_none(rng.get('low'))
        high = _to_float_or_none(rng.get('high'))
        # Skip qualitative ranges
        if low is None and high is None:
            continue
        bounded.append((low, high, str(rng.get('label', '')).lower()))

    for low, high, label in bounded:
        # A missing bound means the range is open on that side.
        low_bound = low if low is not None else float('-inf')
        high_bound = high if high is not None else float('inf')
        if low_bound <= result_val <= high_bound:
            return _numeric_status_from_label(label)

    # Value is in no range: report whether it lies below or above all of them.
    lows = [low for low, _, _ in bounded if low is not None]
    highs = [high for _, high, _ in bounded if high is not None]
    if lows and result_val < min(lows):
        return 'low'
    if highs and result_val > max(highs):
        return 'high'
    return 'undefined'


def _classify_qualitative(result_str, ranges):
    """Classify a text result (e.g. Positive/Negative) against range labels."""
    labels = [str(rng.get('label', '')).strip().lower() for rng in ranges]

    # Exact match with a reference label: the label's wording decides.
    for label in labels:
        if label and result_str == label:
            if _is_negative_text(label):
                return 'normal'
            if any(term in label for term in _POSITIVE_TEXT_TERMS):
                return 'abnormal'

    # Negative-type results are checked before the positive keywords because
    # 'non-reactive' contains 'reactive' and 'not seen' contains 'seen'.
    if _is_negative_text(result_str):
        if any(_is_negative_text(label) for label in labels):
            return 'normal'
        return 'undefined'

    if any(term in result_str for term in _POSITIVE_TEXT_TERMS):
        return 'abnormal'

    return 'undefined'


def determine_risk_level(value, reference_range):
    """
    Determines the status by comparing test value with reference ranges.
    Returns: 'low', 'normal', 'borderline', 'high', 'abnormal', or 'undefined'

    Args:
        value: The measured result of the test (can be numeric or string).
        reference_range: List of range objects with low, high, and label.
            A single range dict is accepted too; entries that are not dicts
            are ignored, and numeric bounds given as strings are converted.

    Returns:
        For numeric tests:
            'low' - below all ranges, or in a low/deficient/insufficient range
            'normal' - within normal/optimal/desirable range, or in a range
                whose label is not recognised
            'borderline' - in borderline/intermediate range
            'high' - above all ranges, or in a high/dangerous/critical range
            'abnormal' - in a range labelled abnormal
        For qualitative tests (Positive/Negative):
            'normal' - expected result (e.g., Negative)
            'abnormal' - unexpected result (e.g., Positive)
        'undefined' - cannot determine or N/A
    """
    # Handle missing or invalid data
    if not reference_range or value is None or str(value).strip() == '':
        return 'undefined'

    # Handle "Not Applicable" or similar values
    value_str = str(value).strip().lower()
    if value_str in _NOT_APPLICABLE_VALUES:
        return 'undefined'

    # Tolerate malformed LLM output: a lone dict, or a list with junk entries.
    if isinstance(reference_range, dict):
        reference_range = [reference_range]
    if not isinstance(reference_range, (list, tuple)):
        return 'undefined'
    ranges = [rng for rng in reference_range if isinstance(rng, dict)]
    if not ranges:
        return 'undefined'

    # Check if reference_range has only "Undefined" label
    if len(ranges) == 1:
        only = ranges[0]
        label = str(only.get('label', '')).lower()
        if 'undefined' in label and only.get('low') is None and only.get('high') is None:
            return 'undefined'

    # Numeric results are compared with bounds; anything else is treated as text.
    try:
        result_val = float(value)
    except (ValueError, TypeError):
        return _classify_qualitative(value_str, ranges)
    return _classify_numeric(result_val, ranges)


def analyze_test_ranges(structured_data):
    """
    Analyzes all test results against their reference ranges using custom logic.
    Works with the new flat 'tests' array structure.

    Each test is classified independently: if classification of one test
    raises, that test is recorded with status 'undefined' and the remaining
    tests are still analyzed.

    Args:
        structured_data: The structured JSON data with 'patient' and 'tests' fields.

    Returns:
        A list of dicts with keys 'test_name', 'value', 'unit',
        'reference_range' and 'status', where status is:
        - Numeric: low/normal/borderline/high/undefined
        - Qualitative: normal/abnormal/undefined
        An empty list is returned when the input is not a dict or has no
        usable 'tests' list.
    """
    if not isinstance(structured_data, dict):
        print(f"[ERROR] Test range analysis failed: expected a dict, got {type(structured_data).__name__}")
        return []

    # 'tests' may be present but null in the LLM output.
    tests = structured_data.get('tests') or []
    if not isinstance(tests, (list, tuple)):
        print(f"[ERROR] Test range analysis failed: 'tests' is {type(tests).__name__}, expected a list")
        return []

    analysis_summary = []
    for test in tests:
        if not isinstance(test, dict):
            continue

        test_name = test.get('name', 'Unknown')
        value = test.get('value')
        reference_range = test.get('reference_range', [])

        # Determine status using our custom logic; one bad entry must not
        # discard the results of every other test.
        try:
            status = determine_risk_level(value, reference_range)
        except Exception as e:
            print(f"[WARNING] Could not classify test {test_name!r}: {e}")
            status = 'undefined'

        analysis_summary.append({
            'test_name': test_name,
            'value': value,
            'unit': test.get('unit'),
            'reference_range': reference_range,
            'status': status,
        })

    print(f"[INFO] Analyzed {len(analysis_summary)} test results.")
    return analysis_summary


# ==============================================================================
# STAGE 5: Report Generation (Markdown via LLM fallback)
# ==============================================================================

def build_lacto_report_prompt(structured_data):
    """Builds the Persian Lacto-brand report prompt from structured JSON.

    Missing (null) patient fields are rendered as empty strings instead of the
    literal text "None", and malformed test entries (non-dict tests, null or
    non-list reference ranges, non-dict ranges) are skipped rather than
    raising.

    Args:
        structured_data: Dict with a "patient" object and a "tests" list, as
            produced by the structuring stage.

    Returns:
        The prompt string, containing the fixed Persian instructions, the
        patient details and one line per test.
    """
    if not isinstance(structured_data, dict):
        structured_data = {}
    patient = structured_data.get("patient")
    if not isinstance(patient, dict):
        patient = {}
    tests = structured_data.get("tests")
    if not isinstance(tests, (list, tuple)):
        tests = []

    def text_or_blank(value) -> str:
        # JSON null arrives as None; str(None) would put "None" into the prompt.
        return "" if value is None else str(value)

    # Map patient info
    name = text_or_blank(patient.get("name"))
    gender = text_or_blank(patient.get("gender"))
    age = text_or_blank(patient.get("age"))
    birth_date = text_or_blank(patient.get("date_of_birth"))
    sample_date = text_or_blank(patient.get("collection_date"))
    approval_date = text_or_blank(patient.get("confirm_date"))

    def fmt_item(item: dict) -> str:
        if not isinstance(item, dict):
            return ""

        test_name = text_or_blank(item.get("name"))
        value = item.get("value")
        unit = item.get("unit") or ""
        method = item.get("method") or ""
        reference_range = item.get("reference_range") or []
        if isinstance(reference_range, dict):
            reference_range = [reference_range]
        elif not isinstance(reference_range, (list, tuple)):
            reference_range = []
        risk = item.get("standard_label") or ""

        # Format value
        value_str = text_or_blank(value)

        # Format reference range
        range_parts = []
        for rng in reference_range:
            if not isinstance(rng, dict):
                continue
            low = rng.get('low')
            high = rng.get('high')
            label = rng.get('label') or ''
            prefix = f"{label}: " if label else ""

            if low is not None and high is not None:
                range_parts.append(f"{prefix}{low}-{high}")
            elif low is not None:
                range_parts.append(f"{prefix}>{low}")
            elif high is not None:
                range_parts.append(f"{prefix}<{high}")
            elif label:
                range_parts.append(f"{label}")

        range_str = ", ".join(range_parts)

        # Build the line
        value_unit = " ".join(part for part in (value_str, unit) if part)

        range_part = f"(Range: {range_str})" if range_str else ""
        method_part = f"[Method: {method}]" if method else ""
        # An "Undefined" risk carries no information, so it is left out.
        risk_part = f"[Risk: {risk}]" if risk and risk != "Undefined" else ""

        return f"{test_name}: {value_unit} {range_part} {method_part} {risk_part}".strip()

    # Format all tests, dropping entries that produced no text
    test_lines = [line for line in (fmt_item(test) for test in tests) if line]

    lab_results_block = "\n".join(test_lines)

    prompt = f"""
🎯 **دستور اصلی:**
لطفاً بر اساس داده‌های زیر، یک "گزارش جامع وضعیت سلامت" به سبک برند لاکتو بنویس.  
گزارش باید شامل 6 بخش زیر باشد و لحن آن ترکیبی از علم و کوچینگ شخصی‌سازی‌شده باشد:

---

🧩 **بخش‌های گزارش (ثابت و استاندارد):**

1️⃣ **مقدمه گزارش:**  
یک متن الهام‌بخش، همدلانه و علمی بنویس که بیان کند هر بدن داستانی منحصربه‌فرد دارد و این گزارش نقشه راه سلامت است. هدف این بخش ایجاد انگیزه و حس همراهی با برند لاکتو است.

2️⃣ **هویت بدن در یک نگاه (Your Body Identity at a Glance):**  
نمایش داده‌های پایه (نام، سن، جنسیت، تاریخ تولد، تاریخ آزمایش، تاریخ تأیید).  
تحلیل مختصر از اینکه این داده‌ها چرا برای شخصی‌سازی محدوده‌های نرمال مهم هستند.

3️⃣ **وضعیت کلی سلامت در یک نگاه (Your Body's Overall Health Snapshot):**  
- تحلیل کلی سلامت: نقاط قوت اصلی بدن (در قالب بولت‌پوینت با عنوان «سرمایه‌های ارزشمند شما»)  
- موارد نیازمند بهینه‌سازی (با عنوان «پیام‌های بدن شما برای توجه»)  
- در پایان جمع‌بندی امیدبخش و برنامه کوچینگ لاکتو برای بهبود آن‌ها.

4️⃣ **اجزای سازنده بدن (CBC & Basic Chemistry):**  
- داده‌های خونی و شیمی پایه را نمایش بده.  
- تحلیل کن: چه شاخص‌هایی در محدوده نرمال‌اند؟ کدام‌ها نیازمند توجه؟  
- توصیه‌های تغذیه‌ای و سبک زندگی برای هر شاخص غیرنرمال بده.  
- اگر چیزی نرمال است، به شکل تشویقی بنویس ("این وضعیت عالی را حفظ کنید").  
- از بولت‌پوینت برای وضوح استفاده کن.

5️⃣ **نقشه متابولیک و قلبی عروقی (Metabolic & Cardiovascular Landscape):**  
- قند، HbA1c، چربی‌ها، کلسترول‌ها، LDL، HDL  
- موارد مطلوب را با پیام تشویقی بنویس  
- موارد نیازمند بهبود را با توصیه تغذیه‌ای (مثلاً امگا ۳، فیبر، حذف چربی ترانس) تحلیل کن.  
- پایان بخش شامل جمع‌بندی برندمحور از کمک لاکتو باشد.

6️⃣ **سلامت اندام‌های حیاتی (Vital Organs - LFT & Thyroid):**  
- آنزیم‌های کبدی، بیلی‌روبین، TSH و...  
- برای شاخص‌های بالا یا پایین، تحلیل علمی و توصیه سبک زندگی بنویس.  
- در پایان، برنامه پیشنهادی لاکتو برای حمایت از کبد و تیروئید را توضیح بده.

---

📋 **داده‌های ورودی:**

**مشخصات کاربر:**
- نام کامل: {name}
- جنسیت: {gender}
- سن: {age}
- تاریخ تولد: {birth_date}
- تاریخ جمع‌آوری نمونه: {sample_date}
- تاریخ تأیید: {approval_date}

**داده‌های آزمایشگاهی (Lab Results):**
{lab_results_block}

(🔹 نکته: اگر محدوده مرجع موجود است از آن استفاده کن.)

---

🧭 **راهنما برای مدل:**
- فقط از داده‌های داده‌شده استفاده کن؛ از خودت داده نساز.  
- برای هر شاخص خارج از محدوده، علت احتمالی و توصیه عملی بده.  
- از تکرار پرهیز کن.  
- برای صرفه‌جویی در توکن، بخش‌ها را خلاصه ولی غنی بنویس.  
- از ایموجی‌ها در تیترها برای جذابیت استفاده کن (🌱💡❤️🧠🔥).  
- تمام تحلیل‌ها را در لحن برند لاکتو بنویس: علمی، انسانی، الهام‌بخش، و قابل‌فهم برای کاربر نهایی.  
- در جمع‌بندی هر بخش، یک جمله برندمحور بنویس مثل:  
  «لاکتو در کنار شماست تا با رویکرد غذا به‌عنوان دارو، این شاخص را بهبود دهد.»

---

🎯 **خروجی نهایی:**  
یک گزارش کامل، ساختاریافته و آماده چاپ در قالب Markdown، با تیتر، بولت‌پوینت، توضیح و جمع‌بندی هر بخش.

قوانین قالب بسیار مهم (اجرا اجباری):
- مطلقاً هیچ پیش‌گفتار/متن آغازین/تعریف و تمجید ننویس (مثل «عالی!» یا «بر اساس داده‌ها...»).
- خروجی را مستقیماً با تیترهای بخش‌ها شروع کن و فقط محتوای گزارش را بنویس.
- از هیچ متن بیرون از 6 بخش خواسته‌شده استفاده نکن.
"""
    return prompt


def generate_report_with_llm(prompt_text):
    """Generates the Markdown report using the same 3-tier fallback.

    Providers are tried in the order OpenAI, Google AI Studio, Vertex AI. A
    provider that raises or returns an empty/blank reply is treated as failed
    and the next one is tried.

    Args:
        prompt_text: The full report prompt.

    Returns:
        The report text from the first provider that produced non-blank
        output, or None if all of them failed.
    """
    # Attempt 1: OpenAI
    print("\n[INFO] Report: Attempt 1/3 with OpenAI...")
    if openai_client:
        try:
            response = openai_client.chat.completions.create(
                model=OPENAI_MODEL,
                messages=[
                    {"role": "system", "content": "You are a skilled health copywriter. Return ONLY the Markdown report content without any prefaces or meta text."},
                    {"role": "user", "content": prompt_text}
                ],
            )
            text = response.choices[0].message.content or ""
            if text.strip():
                print("[SUCCESS] Report generated by OpenAI.")
                return text
            # Same rule as the other providers: a blank reply is a failure.
            print("[WARNING] OpenAI returned an empty report. Trying fallback...")
        except Exception as e:
            print(f"[WARNING] OpenAI report generation failed: {e}. Trying fallback...")
    else:
        print("[WARNING] OpenAI client not configured. Trying fallback...")

    # Attempt 2: Google AI Studio
    print("\n[INFO] Report: Attempt 2/3 with Google AI Studio...")
    try:
        model = genai.GenerativeModel(GEMINI_MODEL)
        response = model.generate_content(prompt_text)
        text = getattr(response, "text", None) or ""
        if text.strip():
            print("[SUCCESS] Report generated by Google AI Studio.")
            return text
        print("[WARNING] Google AI Studio returned an empty report. Trying fallback...")
    except Exception as e:
        print(f"[WARNING] Google AI Studio report generation failed: {e}. Trying fallback...")

    # Attempt 3: Vertex AI
    print("\n[INFO] Report: Attempt 3/3 with Vertex AI...")
    try:
        vertexai.init(project=GCP_PROJECT_ID, location=GCP_LOCATION)
        model = GenerativeModel(VERTEX_AI_MODEL)
        response = model.generate_content(prompt_text)
        text = response.text or ""
        if text.strip():
            print("[SUCCESS] Report generated by Vertex AI.")
            return text
    except Exception as e:
        print(f"[ERROR] All providers failed for report generation: {e}")
        return None

    # Vertex AI answered, but with nothing usable.
    print("[ERROR] All providers failed for report generation: Vertex AI returned an empty report.")
    return None


def save_to_markdown(markdown_text, output_path):
    """Write ``markdown_text`` to ``output_path`` as UTF-8.

    Failures are reported on stdout rather than raised.
    """
    try:
        with open(output_path, mode='w', encoding='utf-8') as handle:
            handle.write(markdown_text)
        print(f"[INFO] Markdown report saved to {output_path}")
    except Exception as err:
        print(f"[ERROR] Failed to save Markdown file at {output_path}: {err}")


def clean_markdown_report(text):
    """Drop any preamble before the first heading and collapse repeated lines.

    Steps:
      1. Trailing whitespace is removed from every line.
      2. Everything before the first line starting with "##" (any deeper
         level included) or "# " is discarded; if no heading exists, nothing
         is discarded.
      3. A first line starting with 'عالی!' is removed.
      4. Consecutive lines that are equal after stripping are collapsed to
         one, throughout the whole text (this also merges runs of blank
         lines).

    Args:
        text: Raw report text; may be None or empty.

    Returns:
        The cleaned text ending in exactly one newline, or ``text`` unchanged
        when it is None/empty.
    """
    if not text:
        return text

    lines = [line.rstrip() for line in text.splitlines()]

    # Trim everything before the first heading. "##" also covers "###" and deeper.
    start_idx = 0
    for i, line in enumerate(lines):
        if line.strip().startswith(("##", "# ")):
            start_idx = i
            break
    trimmed = lines[start_idx:]

    # Remove common boilerplate openings like 'عالی! ...' if still present
    if trimmed and trimmed[0].startswith("عالی!"):
        trimmed = trimmed[1:]

    # Keep a line only if it differs from the previously kept one.
    deduped = []
    for line in trimmed:
        if not deduped or line.strip() != deduped[-1].strip():
            deduped.append(line)

    return "\n".join(deduped).strip() + "\n"


# ==============================================================================
# Main Execution
# ==============================================================================

def main(input_path="pdfs/بهرام ملاحیدری آزمایش.pdf", output_dir="PipelineOutput"):
    """Run the whole pipeline for one report and write its outputs.

    Stages: render the PDF to images (or collect images from a folder), OCR,
    structure the text with an LLM, analyze test ranges, generate the
    Markdown health report.

    Args:
        input_path: A PDF file or a folder of .png/.jpg/.jpeg images.
        output_dir: Directory that receives all output files; created if
            missing.

    Returns:
        True when every stage including the report succeeded, False otherwise.
    """
    os.makedirs(output_dir, exist_ok=True)

    base_name = os.path.splitext(os.path.basename(input_path))[0]
    output_raw_text_json = os.path.join(output_dir, f"{base_name}_raw_text.json")
    output_structured_json = os.path.join(output_dir, f"{base_name}_structured.json")
    output_range_analysis_json = os.path.join(output_dir, f"{base_name}_structured_output.json")
    output_report_md = os.path.join(output_dir, f"{base_name}_lacto_report.md")

    image_paths = []

    # --- Handle input type ---
    if os.path.isfile(input_path) and input_path.lower().endswith('.pdf'):
        print(f"[INFO] Input is a PDF: {input_path}")
        temp_images_dir = os.path.join(output_dir, f"{base_name}_temp_images")
        try:
            image_paths = convert_pdf_to_images(input_path, temp_images_dir)
        except Exception as e:
            # Report the failure like every other stage instead of crashing.
            print(f"[FATAL] Could not convert PDF to images: {e}")
    elif os.path.isdir(input_path):
        print(f"[INFO] Input is an image directory: {input_path}")
        image_paths = sorted(
            os.path.join(input_path, f)
            for f in os.listdir(input_path)
            if f.lower().endswith(('.png', '.jpg', '.jpeg'))
        )
    else:
        print(f"[FATAL] Invalid input path: {input_path}")

    # --- Run pipeline ---
    if not image_paths:
        print("\n[FATAL] No images to process.")
        return False

    full_text = run_ocr_on_images(image_paths)
    if not full_text:
        print("\n[FATAL] OCR found no text.")
        return False
    save_to_json({"full_document_text": full_text}, output_raw_text_json)

    structured_data = process_text_with_llm(full_text)
    if not structured_data:
        print("\n[FATAL] All AI providers failed. Could not structure data.")
        return False
    save_to_json(structured_data, output_structured_json)

    # Stage 4: Analyze test ranges
    print("\n[INFO] Stage 4: Analyzing test ranges...")
    range_analysis = analyze_test_ranges(structured_data)
    if range_analysis:
        save_to_json(range_analysis, output_range_analysis_json)

    # Stage 5: Generate Lacto-brand health report
    print("\n[INFO] Stage 5: Generating Lacto health report...")
    report_prompt = build_lacto_report_prompt(structured_data)
    report_markdown = generate_report_with_llm(report_prompt)
    if not report_markdown:
        # Do not claim success when the final deliverable is missing.
        print("\n[WARNING] Pipeline finished, but the health report could not be generated.")
        return False

    cleaned_report = clean_markdown_report(report_markdown)
    save_to_markdown(cleaned_report, output_report_md)
    print("\n[COMPLETE] Pipeline finished successfully.")
    return True


if __name__ == "__main__":
    import sys

    # Usage: python <script> [input_path] [output_dir]
    # Without arguments the defaults declared on main() are used.
    sys.exit(0 if main(*sys.argv[1:3]) else 1)
