from flask import Flask, request, jsonify
import os
import streamlit as st
from langchain.chains import create_sql_query_chain
from langchain_google_genai import GoogleGenerativeAI
from sqlalchemy import create_engine
from sqlalchemy.exc import ProgrammingError
from langchain_community.utilities import SQLDatabase
import google.generativeai as genai
import pymysql
import pandas as pd
import numpy as np
import random
from io import StringIO
import json
import re
from pathlib import Path
import mimetypes
from flask import request
import wget

# Configure GenAI Key
from dotenv import load_dotenv

# Load variables from a local .env file (if any) into the environment
# before the API key is looked up.
load_dotenv()


# SET API KEY
# Look the key up once; the same value configures the SDK and is kept in
# the module-level ``api_key`` name.
api_key = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=api_key)

# Model Configuration: generation parameters handed to the model below.
MODEL_CONFIG = dict(
    temperature=0.2,
    top_p=1,
    top_k=32,
    max_output_tokens=8192,
)

## Safety Settings of Model
# Every harm category is configured with the same blocking threshold.
safety_settings = [
    {"category": harm_category, "threshold": "BLOCK_MEDIUM_AND_ABOVE"}
    for harm_category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

# LOAD GEMINI MODEL WITH MODEL CONFIGURATIONS
model = genai.GenerativeModel(
    model_name="gemini-2.5-flash-lite",
    generation_config=MODEL_CONFIG,
    safety_settings=safety_settings,
)


# Flask application object that the route decorators below register on.
app = Flask(__name__)

# Directory where uploaded PDF files are written by the upload routes.
UPLOAD_FOLDER = "/var/www/private/files/voucher/"


@app.route('/process_data', methods=['GET'])
def process_data():
    """Echo ``input_value`` from the JSON body together with ``input_value * 2``.

    Returns:
        A JSON response ``{"status": "success", "result": ...}`` on success,
        or ``{"status": "error", "error": ...}`` with status 400 when the
        request has no JSON object body, ``input_value`` is missing, or the
        value cannot be multiplied by two.
    """
    # silent=True yields None instead of raising when the body is absent,
    # not declared as JSON, or malformed (typical for a GET request).
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({"status": "error", "error": "Request body must be a JSON object"}), 400

    input_value = data.get('input_value')
    if input_value is None:
        return jsonify({"status": "error", "error": "Missing 'input_value'"}), 400

    try:
        doubled = input_value * 2
    except TypeError:
        # e.g. a nested JSON object, which does not support multiplication.
        return jsonify({"status": "error", "error": "'input_value' cannot be doubled"}), 400

    result = f"Python processed: {input_value} and doubled it to {doubled}"

    return jsonify({"status": "success", "result": result})

@app.route('/home', methods=['GET'])
def home():
    """Return a fixed JSON status payload."""
    payload = {"status": "success1"}
    return jsonify(payload)

@app.route("/read-pdf-test", methods=["POST"])
def upload_pdf():
    """Save an uploaded file into ``UPLOAD_FOLDER`` and report where it went.

    The file is read from the ``file`` field of the multipart request.

    Returns:
        A JSON response with ``message``, ``filename`` and ``path`` and
        status 200 on success, or a JSON ``error`` with status 400 when the
        file part is missing, no file was selected, or the filename is not
        usable.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file part"}), 400

    pdf_file = request.files["file"]

    if not pdf_file.filename:
        return jsonify({"error": "No selected file"}), 400

    # The filename comes from the client: keep only its final path component
    # (treating "\" as a separator too) so names like "../../x" or "/etc/x"
    # cannot make os.path.join() resolve outside UPLOAD_FOLDER.
    filename = os.path.basename(pdf_file.filename.replace("\\", "/"))
    if filename in ("", ".", "..") or "\x00" in filename:
        return jsonify({"error": "Invalid filename"}), 400

    # Save file
    save_path = os.path.join(UPLOAD_FOLDER, filename)
    pdf_file.save(save_path)

    return jsonify({
        "message": "PDF uploaded successfully",
        "filename": filename,
        "path": save_path
    }), 200

@app.route('/read-bl-pdf-custom-prompt', methods=['POST'])
def read_bl_pdf_custom_prompt():
    """Run Gemini on an uploaded PDF using caller-supplied prompts.

    Reads the file from the ``file`` field and the prompts from the
    ``user_prompt`` and ``system_prompt`` form fields, saves the file under
    ``UPLOAD_FOLDER`` and passes it to ``gemini_output``.

    Returns:
        A JSON ``error`` response with status 400 when the file, a usable
        filename, or either prompt is missing. Otherwise a JSON-encoded
        string (built with ``json.dumps``, not ``jsonify``): the parsed model
        output on success, or an object with an ``error`` key when the
        content was blocked, no text came back, parsing failed, or the call
        raised.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file part"}), 400

    pdf_file = request.files["file"]

    if not pdf_file.filename:
        return jsonify({"error": "No selected file"}), 400

    user_prompt = request.form.get("user_prompt")
    system_prompt = request.form.get("system_prompt")

    if not user_prompt:
        return jsonify({"error": "No 'user_prompt' field in the request"}), 400
    if not system_prompt:
        return jsonify({"error": "No 'system_prompt' field in the request"}), 400

    # The filename comes from the client: keep only its final path component
    # (treating "\" as a separator too) so names like "../../x" or "/etc/x"
    # cannot make os.path.join() resolve outside UPLOAD_FOLDER.
    filename = os.path.basename(pdf_file.filename.replace("\\", "/"))
    if filename in ("", ".", "..") or "\x00" in filename:
        return jsonify({"error": "Invalid filename"}), 400

    # Save file
    save_path = os.path.join(UPLOAD_FOLDER, filename)
    pdf_file.save(save_path)

    # EXTRACTING WHOLE DATA IN JSON
    try:
        response = gemini_output(save_path, system_prompt, user_prompt)

        if response.prompt_feedback and response.prompt_feedback.block_reason:
            # For simplicity, if blocked, we'll return an error JSON
            return json.dumps({"error": "Content blocked", "reason": response.prompt_feedback.block_reason})

        if not response.text:
            return json.dumps({"error": "No text content received from Gemini"})

        raw_gemini_output = response.text
        print(f"Raw Gemini output (before extraction):\n{raw_gemini_output}")
        # Try to extract JSON from markdown fences
        json_string_extracted = extract_json_from_markdown(raw_gemini_output)

        if json_string_extracted:
            try:
                # Pre-process the string to handle invalid backslashes.
                # This is a targeted fix for the "Invalid \escape" error:
                # a backslash before a comma or a space is not a legal JSON
                # escape, so drop the backslash.
                processed_string = json_string_extracted.replace('\\,', ',')
                processed_string = processed_string.replace('\\ ', ' ')

                # Remove commas between two digits (e.g. "6,803.13" -> "6803.13").
                # NOTE(review): this runs over the whole JSON text, so it also
                # merges compact numeric arrays such as [1,2] and digit-comma-
                # digit sequences inside string values - confirm that is
                # acceptable for the documents processed here.
                processed_string = re.sub(r'(\d),(\d)', r'\1\2', processed_string)

                # Validate JSON by parsing it
                parsed_data = json.loads(processed_string)
                return json.dumps(parsed_data)  # Re-dump to ensure it's a single line and valid JSON
            except json.JSONDecodeError as e:
                return json.dumps({"error": "JSON decoding failed after extraction", "message": str(e), "extracted_json": json_string_extracted, "raw_output": raw_gemini_output})
        else:
            # If no markdown fences found, try to parse the whole output as JSON
            # This handles cases where the model might sometimes skip fences or add other text
            try:
                parsed_data = json.loads(raw_gemini_output)
                return json.dumps(parsed_data)
            except json.JSONDecodeError as e:
                return json.dumps({"error": "No JSON markdown fences found and direct JSON parsing failed", "message": str(e), "raw_output": raw_gemini_output})

    except Exception as e:
        # Boundary handler: any failure while calling the model or reading
        # its response is reported to the client as an error payload.
        return json.dumps({"error": "API call failed", "message": str(e)})
        
@app.route('/read-bl-pdf', methods=['POST'])
def read_bl_pdf():
    """Extract bill of lading data from an uploaded PDF as JSON via Gemini.

    Reads the file from the ``file`` field, saves it under ``UPLOAD_FOLDER``
    and passes it to ``gemini_output`` together with the fixed system and
    user prompts defined below.

    Returns:
        A JSON ``error`` response with status 400 when the file or a usable
        filename is missing. Otherwise a JSON-encoded string (built with
        ``json.dumps``, not ``jsonify``): the parsed model output on success,
        or an object with an ``error`` key when the content was blocked, no
        text came back, parsing failed, or the call raised.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file part"}), 400

    pdf_file = request.files["file"]

    if not pdf_file.filename:
        return jsonify({"error": "No selected file"}), 400

    # The filename comes from the client: keep only its final path component
    # (treating "\" as a separator too) so names like "../../x" or "/etc/x"
    # cannot make os.path.join() resolve outside UPLOAD_FOLDER.
    filename = os.path.basename(pdf_file.filename.replace("\\", "/"))
    if filename in ("", ".", "..") or "\x00" in filename:
        return jsonify({"error": "Invalid filename"}), 400

    # Save file
    save_path = os.path.join(UPLOAD_FOLDER, filename)
    pdf_file.save(save_path)

    # EXTRACTING WHOLE DATA IN JSON FROM BILL OF LADING
    # NOTE(review): the prompt text below says "exactly seven nested objects"
    # but then names nine sections (with an unbalanced quote before
    # container_data) and later describes a tenth, 'other_data'. The wording
    # is left untouched here because changing it may alter model output -
    # confirm the intended schema before editing.
    system_prompt = """
                You are an expert in converting bill of lading document into a structured JSON format. 
                Your task is to extract information from the provided PDF and organize it into a single JSON object. 
                This JSON object must have a single parent tag named 'bill_of_lading_details'. 
                Inside 'bill_of_lading_details', there should be exactly seven nested objects with the following names: 'shipper_data', 'consignee_data', 'notify_party_data', 'delivery_agent_data', 'carrier_data', 'voyage_data', 'cargo_data', container_data' and 'bill_of_lading_metadata'.
                
                The 'shipper_data' section must be a JSON object with the following keys: 'name','address','gst_number','pan_number','mail_id','telephone'.

                The 'consignee_data' section must be a JSON object with the following keys: 'name','address','gst_number','pan_number','mail_id','telephone'.

                The 'notify_party_data' section must be a JSON object with the following keys: 'name','address','gst_number','pan_number','mail_id','telephone'.

                The 'delivery_agent_data' section must be a JSON object with the following keys: 'name','address','gst_number','pan_number','mail_id','telephone'.

                The 'carrier_data' section must be a JSON object with the following keys: 'name','address','gst_number','pan_number','mail_id','telephone'. 

                The 'voyage_data' section must be a JSON object with the following keys: 'voyage_number', 'vessel', 'place_of_receipt', 'port_of_loading', 'port_code_of_loading', 'port_of_discharge', 'port_code_of_discharge', 'final_place_of_delivery'. 

                The 'cargo_data' section must have the following keys: 'marks_and_numbers','description_of_packages_and_goods','hs_code','total_gross_weight_kgs','total_tare_kgs','total_measurement_cbm', 'total_number_of_packages', 'type_of_package', 'package_code_type_of_package'.
                
                The 'container_data' section must be a list of objects. Each object in this list must have the following keys: 'container_number', 'container_size_type', 'seal_number', 'number_of_packages', 'type_of_package', 'package_code_type_of_package', 'gross_weight_kgs', 'tare_kgs', and 'measurement_cbm'.

                This 'bill_of_lading_metadata' tag must contain the following specific keys: 'document_type','bill_of_lading_number','non_negotiable','number_of_originals','place_of_issue' and 'date_of_issue'. 

                The 'other_data' section must be a list of objects.  Each object in this list must have the following key: 'container_combined_string'

                """
    user_prompt = """
                Voyage Data section:

                1. Rule for Identifying Combined String for Vessel and Voyage Number: 
                Locate the single string that contains both the vessel name and the voyage number. This string is typically found next to headings like "Ocean vessel", "Vessel", "Voyage", "Vessel Voyage", "Vessel/Voyage" or "PRE-CARRIAGE BY" in the document.

                2. Rules for Vessel and Voyage Number Extraction:
                2.1) Identify the Combined String: If a single string is present that contains both the vessel name and voyage number, use this string.
                
                2.2) Separate the Components:
                2.2.1) The voyage number is the last part of the combined string. It's a short, unique identifier, often a mix of letters and numbers (e.g., 008, 033N, V.W191, 088S). It typically follows a vessel name and is separated by a space or another common delimiter.
                2.2.2) The vessel name is the rest of the string before the voyage number.

                2.3) Assign the Values:
                2.3.1) Assign the identified vessel name to the vessel key.
                2.3.2) Assign the identified voyage number to the voyage_number key.

                2.4) Handling Standalone Voyage Numbers:
                If a combined string is not available, look for a separate entry that contains a value resembling a voyage number, such as V.65065, 033N, 009W, or similar alphanumeric patterns. Assign this value to the voyage_number key and set the vessel key to the name of the vessel found elsewhere in the document.

                3. Examples for Vessel and Voyage Number Separation:
                3.1) If the combined string is XIN CHI WAN 088S, then:
                vessel: XIN CHI WAN
                voyage_number: 088S
                3.2) If the combined string is MV ZUNAIRAH ZARISHA 4 008, then:
                vessel: MV ZUNAIRAH ZARISHA 4
                voyage_number: 008
                3.3) If the combined string is LALIT BHUM 033N, then:
                vessel: LALIT BHUM
                voyage_number: 033N
                3.4) If the combined string is WAN HAI 510 V.W191, then:
                vessel: WAN HAI 510
                voyage_number: V.W191

                4. Rules for Port of Loading and Port of Discharge:
                For the 'port_of_loading' and 'port_of_discharge' keys, extract only the port name. You must exclude any additional descriptors such as 'SEA PORT' or 'IN VIETNAM'.
                Example for 'port_of_loading': 'HAI PHONG SEA PORT IN VIETNAM' should be extracted as 'HAI PHONG'.
                Example for 'port_of_discharge': 'KATTUPALLI SEA PORT' should be extracted as 'KATTUPALLI'.

                You must also include the United Nations Location Codes (UN/LOCODE) for the ports. 
                Use the keys 'port_code_of_loading' and 'port_code_of_discharge' for this purpose. 
                You can refer to a UN/LOCODE database to find the correct codes. 
                Example for 'port_code_of_loading': 'HAI PHONG' has the UN/LOCODE 'VNHPH'.
                Example for 'port_code_of_discharge': 'KATTUPALLI' has the UN/LOCODE 'INKAT'.

                End of Rules for Voyage Data.

                Cargo Data section:

                1. Rule for Identifying Total Number of Packages and Type of Package:
                Locate the data in sections such as 'No. of Packages or Shipping Units', 'Quantity of Packages', or within the 'Description of packages and goods' section.
                For the total_number_of_packages and type_of_package keys in the cargo_data section, prioritize extracting information from the 'Description of packages and goods' section.
                If total number of packages is given in two different units, use the smaller unit 
                (e.g., 45 BUNDLES = 3,654 PCS), then assign the total_number_of_packages to 45 and the type_of_package to BUNDLES) 
                The package_code_type_of_package key should be populated with the standardized package code 
                (e.g., BDL for BUNDLES) if available.

                2. Rule for Identifying Decription of Packages and Goods:
                This may require combining information from multiple pages, including the main table 
                and any 'Continuation' pages or sections labeled 'DESCRIPTION OF PACKAGES AND GOODS'. 

                3. Rule for Decription of Packages and Goods Extraction:
                You must extract the complete description. 
                Ensure the final output for this key contains all relevant details.

                End of Rules for Cargo Data.

                Container Data section:

                Container and Seal Numbers:

                1. Rule for Identifying Container and Seal Numbers:
                1.1) Container detail will be combined on a single string. e.g., [Container Number]/[Seal Number]/[Size and Type]/[Shipment Type]/[Number of Packages]/[Type of Package]/[Gross Weight]/[Measurement].
                Forward slash as a primary separator and Space can also be. (e.g., 'REGU5156798/VNHPH2515138' or 'TEMU5031078/IB813576/').
                1.2) Scan the document for container detail, typically found under headings like "Container No.", "Seal No.", "Marks and numbers" or in the main description of goods.
                1.3) Identify string that match the standard format for a container number (four letters followed by seven digits) (e.g., TEMU5792971).
                1.4) Identify Seal Number which is a combination of alpha and numeric codes, or only numeric code 
                which immediately follows the container number (e.g., TEMU5792971).
                1.6) The document may contains multiple container numbers and their associated seal numbers together. 
                Identify the total number of containers from a field like No. and Kinds of Containers or Pkgs (e.g., "8 X 20 GP").
                Create separate JSON object for every container listed in the document.
                1.7) Do not include the seperator (slash or space) in any of the extracted fields.
                Do not assign a seal number from one container to another.
                Do not use "N/M" as a value for container_number or seal_number. 

                Container Size/Type:

                1. Rule for Identifying Container Size and Type:
                The container_size_type key should have exactly four characters (e.g., '20GP', '40HC'). 
                The first two characters refer to the container size, and the last two refer to the container type.

                2. Assign the Values:
                For the container_size_type key within each container object, extract the specific container size type if available. 
                If the document provides a single, overall size type for the entire shipment, (e.g., '5 X 20GP') use those values to populate container_size_type for each container. 
                Example, the container_size_type for all the 5 containers are 20GP, which you must determine. 

                Package Details:

                1. Rule for Identifying Number of Packages and Type of Package:
                For the number_of_packages key, identify and extract the numerical value from sections 
                such as 'No. of Packages or Shipping Units', 'Quantity of Packages', 
                or within the 'Description of packages and goods' section.

                2. Rule for Type of Package Extraction:
                The type_of_package key should be populated with the descriptive name of the items 
                (e.g., 'BOX UNITS', 'PIECES', 'CARTONS', 'BAGS') from the document. 
                The package_code_type_of_package key should be populated with the standardized package code 
                (e.g., 'PCS' for PIECES) if available.

                3. Rule for Number of Packages for each Container Extraction:
                If the document provides specific package quantities for each container line item 
                (e.g., "854 CARTON" for one container and "365 CARTON" for another), 
                use those individual values to populate the number_of_packages key for the corresponding container object. 

                4. Rule for Number of Packages and Type of Package, if Container wise is not available:
                If the document provides overall number and type of packages for the entire shipment 
                then use that total value to populate the number_of_packages key for each container.
                
                Weight and Measurement Details:

                1. Rule for Identifying Gross Weight, Tare Weight, and Measurement for each Container:
                For the gross_weight_kgs, tare_kgs, and measurement_cbm keys, extract the specific values for each container, if available.

                2. Rule for Gross Weight Extraction:
                gross_weight_kgs: The gross weight of the goods in kilograms. 
                This must be a numerical value. Convert any notation (like a comma for a thousands separator) to a standard decimal format. 
                For example, "22,650.000 KGS" should be represented as 22650.0.

                3. Rule for Tare Weight Extraction:
                tare_kgs: The tare weight of the container in kilograms. 
                This must be a numerical value. Convert any notation (like a comma for a thousands separator) to a standard decimal format. 
                For example, "2.200,000" should be parsed as 2200.0.

                4. Rule for Measurement Extraction:
                measurement_cbm: The measurement of the goods in cubic meters. 
                This must be a numerical value. Convert any notation (like a comma for a thousands separator) to a standard decimal format.

                5. Rule for Gross Weight, Tare Weight, and Measurement, if each Container not available:
                If a single, overall value is provided for the entire shipment, divide that value by the number of containers to get the per-container value.
                                
                JSON Structure for Container Data and Cargo Data:
                Ensure the JSON includes a cargo_data section with overall shipment details and a container_data section for each container. 

                Rule for Container Data, if Containers with Total Cargo details:
                If the PDF provides containers with total cargo details, populate the number_of_packages, type_of_package, gross_weight_kgs, 
                and measurement_cbm along with container_no, seal_no, and container_size_type keys within the container_data section 
                using the corresponding values from the cargo_data section.

                Common Rule for Package and Unit Extraction:

                For both the overall shipment (cargo_data) and individual containers (container_data), extract the number and type of packages.
                Number: The number_of_packages key should contain the numerical count (e.g., '80' for '80 METAL BOX UNITS').
                Type: The type_of_package key should contain the descriptive name (e.g., 'BOX UNITS','PIECES', 'CARTONS', 'BAGS') from the document.
                Code: The package_code_type_of_package key should contain the standardized code (e.g., 'PCS' for PIECES) if available.
                Priority: If the document provides a total quantity for the entire shipment but does not break it down by container, 
                use the total value for each container's number_of_packages key.

                End of Rules for Container Data.

                Bill of Lading Metadata:

                Rule for Date of Issue:
                1. Search for a date associated with terms such as 'Date of Issue', 'Date Laden on Board', or 'Date Shipped on Board'.
                2. You must format this date as YYYY-MM-DD.
                3. Ensure the output is only the formatted date string, with no additional text or explanation.
                4. The final output should be suitable for direct use in a MySQL database.
                
                End of Rules for Bill of Lading Metadata.

                Other Data section:

                1. Rule for Identifying Other Data:
                Locate the container/seal number combined string.
                Calculate total number of containers and create a separate JSON object for each container.
                2. Rule for Other Data Extraction:
                Extract the string for each container from the document and populate the key container_combined_string in the other_data section. 

                End of Rules for Other Data.

                Final Output:
                Now, Extract the data from the uploaded bill of lading PDF and populate the JSON structure as described above.
                
                """

    try:
        response = gemini_output(save_path, system_prompt, user_prompt)

        if response.prompt_feedback and response.prompt_feedback.block_reason:
            # For simplicity, if blocked, we'll return an error JSON
            return json.dumps({"error": "Content blocked", "reason": response.prompt_feedback.block_reason})

        if not response.text:
            return json.dumps({"error": "No text content received from Gemini"})

        raw_gemini_output = response.text
        print(f"Raw Gemini output (before extraction):\n{raw_gemini_output}")
        # Try to extract JSON from markdown fences
        json_string_extracted = extract_json_from_markdown(raw_gemini_output)

        if json_string_extracted:
            try:
                # Pre-process the string to handle invalid backslashes.
                # This is a targeted fix for the "Invalid \escape" error:
                # a backslash before a comma or a space is not a legal JSON
                # escape, so drop the backslash.
                processed_string = json_string_extracted.replace('\\,', ',')
                processed_string = processed_string.replace('\\ ', ' ')

                # Validate JSON by parsing it
                parsed_data = json.loads(processed_string)
                return json.dumps(parsed_data)  # Re-dump to ensure it's a single line and valid JSON
            except json.JSONDecodeError as e:
                return json.dumps({"error": "JSON decoding failed after extraction", "message": str(e), "extracted_json": json_string_extracted, "raw_output": raw_gemini_output})
        else:
            # If no markdown fences found, try to parse the whole output as JSON
            # This handles cases where the model might sometimes skip fences or add other text
            try:
                parsed_data = json.loads(raw_gemini_output)
                return json.dumps(parsed_data)
            except json.JSONDecodeError as e:
                return json.dumps({"error": "No JSON markdown fences found and direct JSON parsing failed", "message": str(e), "raw_output": raw_gemini_output})

    except Exception as e:
        # Boundary handler: any failure while calling the model or reading
        # its response is reported to the client as an error payload.
        return json.dumps({"error": "API call failed", "message": str(e)})


@app.route('/read-pdf', methods=['POST'])
def read_pdf():
    """Extract shipping invoice data from an uploaded PDF as JSON via Gemini.

    Reads the file from the ``file`` field, saves it under ``UPLOAD_FOLDER``
    and passes it to ``gemini_output`` together with the fixed system and
    user prompts defined below.

    Returns:
        A JSON ``error`` response with status 400 when the file or a usable
        filename is missing. Otherwise a JSON-encoded string (built with
        ``json.dumps``, not ``jsonify``): the parsed model output on success,
        or an object with an ``error`` key when the content was blocked, no
        text came back, parsing failed, or the call raised.
    """
    if "file" not in request.files:
        return jsonify({"error": "No file part"}), 400

    pdf_file = request.files["file"]

    if not pdf_file.filename:
        return jsonify({"error": "No selected file"}), 400

    # The filename comes from the client: keep only its final path component
    # (treating "\" as a separator too) so names like "../../x" or "/etc/x"
    # cannot make os.path.join() resolve outside UPLOAD_FOLDER.
    filename = os.path.basename(pdf_file.filename.replace("\\", "/"))
    if filename in ("", ".", "..") or "\x00" in filename:
        return jsonify({"error": "Invalid filename"}), 400

    # Save file
    save_path = os.path.join(UPLOAD_FOLDER, filename)
    pdf_file.save(save_path)

    # EXTRACTING WHOLE DATA IN JSON FROM INVOICE
    system_prompt = """
                You are an expert in converting shipping invoices into a structured JSON format.
                Your task is to extract information from the provided PDF and organize it into a single JSON object.
                This JSON object must have a single parent tag named 'invoice_details'.
                Inside 'invoice_details', there should be exactly four nested objects with the following names: 'supplier_data', 'customer_data', 'charges_data' and 'invoice_metadata'.

                The 'supplier_data' section must be a JSON object with the following keys: 'name', 'address', 'gstin', 'cin', and 'pan'.

                The 'customer_data' section must be a JSON object with the following keys: 'client_no', 'invoice_to', 'address', 'state', 'gstin', and 'pan'.

                The 'charges_data' section must be a list of objects.
                Each object in this list must have the following keys: 'charge', 'hsn_code', 'qty_x_rate_curr', 'currency', 'total_curr', 'roe', 'total_inr', 'sgst_ugst_percent', 'sgst_ugst_amount', 'cgst_percent', 'cgst_amount', 'igst_percent', and 'igst_amount'.

                This 'invoice_metadata' tag must contain the following specific keys: 'invoice_no', 'invoice_date', 'place_of_supply', 'vessel', 'voyage', 'pol', 'pod', 'bl_no', 'taxable_value_inr', 'total_gst_amount_inr', and 'total_invoice_amount_inr'.
                Within 'invoice_metadata', you must also include a nested JSON object called 'currency_total' with the keys 'amount_in_eur', 'amount_in_usd', and 'amount_in_inr'.

                Extract the corresponding data from the invoice and populate these tags accordingly.
                """
    user_prompt = """
                Supplier Data section:
                If "Payable To" section exist, Extract the supplier name listed under the 'Payable to' section.
                End of Rules for Supplier Data.

                Charges Data section:
                
                1. Rule for Rate of Exchange Extraction:
                If 'Rate of Exchange' section exist, Extract the 'roe' listed under the 'Rate of Exchange' row.

                2. Rule for Measurement Extraction:
                If 'GST Tax' section exist, Extract the SAC column values from the table detailing the GST Tax.

                Common Rule for Charges Data Extraction:
                Do not list line items from the GST breakdown table  as separate charges.
                
                End of Rules for Charges Data.

                Invoice Metadata:

                Rule for Invoice Date:
                1. Search for a date associated with terms such as 'Invoice Date'.
                2. You must format this date as YYYY-MM-DD.
                3. Ensure the output is only the formatted date string, with no additional text or explanation.
                4. The final output should be suitable for direct use in a MySQL database.
                
                End of Rules for Invoice Metadata.

                Final Output:
                Now, Extract the data from the uploaded shipping line invoice PDF and populate the JSON structure as described above.
                
                """
    try:
        response = gemini_output(save_path, system_prompt, user_prompt)

        if response.prompt_feedback and response.prompt_feedback.block_reason:
            # For simplicity, if blocked, we'll return an error JSON
            return json.dumps({"error": "Content blocked", "reason": response.prompt_feedback.block_reason})

        if not response.text:
            return json.dumps({"error": "No text content received from Gemini"})

        raw_gemini_output = response.text
        print(f"Raw Gemini output (before extraction):\n{raw_gemini_output}")
        # Try to extract JSON from markdown fences
        json_string_extracted = extract_json_from_markdown(raw_gemini_output)

        if json_string_extracted:
            try:
                # Validate JSON by parsing it
                parsed_data = json.loads(json_string_extracted)
                return json.dumps(parsed_data)  # Re-dump to ensure it's a single line and valid JSON
            except json.JSONDecodeError as e:
                return json.dumps({"error": "JSON decoding failed after extraction", "message": str(e), "extracted_json": json_string_extracted, "raw_output": raw_gemini_output})
        else:
            # If no markdown fences found, try to parse the whole output as JSON
            # This handles cases where the model might sometimes skip fences or add other text
            try:
                parsed_data = json.loads(raw_gemini_output)
                return json.dumps(parsed_data)
            except json.JSONDecodeError as e:
                return json.dumps({"error": "No JSON markdown fences found and direct JSON parsing failed", "message": str(e), "raw_output": raw_gemini_output})

    except Exception as e:
        # Boundary handler: any failure while calling the model or reading
        # its response is reported to the client as an error payload.
        return json.dumps({"error": "API call failed", "message": str(e)})


#USER METHODS
#DEFINE PDF FORMAT TO INPUT IN GEMINI
def read_pdf_bytes(pdf_path: str):
    """
    Reads a file's raw bytes and wraps them in a dictionary tagged with the
    'application/pdf' MIME type.

    Args:
        pdf_path (str): The path to the input PDF file.

    Returns:
        list: A single-element list containing a dictionary with 'mime_type'
              (always "application/pdf") and 'data' (the file's bytes).

    Raises:
        FileNotFoundError: If nothing exists at ``pdf_path``.
        IOError: If the path exists but its bytes cannot be read. The
                 underlying error is chained as ``__cause__``.
    """
    pdf_file = Path(pdf_path)

    if not pdf_file.exists():
        raise FileNotFoundError(f"Could not find PDF file: {pdf_file}")

    # The extension is only a hint: a mismatch is reported but not rejected,
    # and the content itself is never inspected.
    if pdf_file.suffix.lower() != ".pdf":
        print(f"Warning: The file '{pdf_file.name}' does not have a .pdf extension. "
              f"Proceeding assuming it's a PDF, but this might indicate an issue.")

    try:
        # Read the entire content of the PDF file as bytes
        pdf_data = pdf_file.read_bytes()
    except OSError as e:
        # Chain the original error so the root cause stays in the traceback.
        raise IOError(f"Error reading bytes from PDF file '{pdf_path}': {e}") from e

    return [
        {
            # MIME type is set explicitly rather than guessed from the name.
            "mime_type": "application/pdf",
            "data": pdf_data
        }
    ]

#GEMINI MODEL OUTPUT
def gemini_output(image_path, system_prompt, user_prompt):
    """Send the file at *image_path* to the model along with both prompts.

    The request content is ordered as: system prompt, the first part returned
    by ``read_pdf_bytes``, then the user prompt. The response object from
    ``model.generate_content`` is returned as-is, so callers read ``.text``
    and ``.prompt_feedback`` themselves.
    """
    pdf_part = read_pdf_bytes(image_path)[0]
    return model.generate_content([system_prompt, pdf_part, user_prompt])

#EXTRACT JSON FROM MARKDOWN FENCES (```json ... ```).
def extract_json_from_markdown(text):
    """
    Extracts the content of a markdown code fence that holds JSON.

    A fence opened with exactly ```json is preferred. If there is none, a
    fence labelled json in any letter case, or an unlabelled fence whose
    content starts with '{' or '[', is accepted instead.

    The content is not validated as JSON; callers parse it themselves.

    Returns the extracted string, or None if the text is empty or no such
    fence is found.
    """
    if not text:
        return None

    fence_patterns = (
        # Exact lowercase tag first, so text this pattern already matched
        # keeps yielding the same fence as before.
        (r'```json\s*(.*?)\s*```', re.DOTALL),
        # Fallback: other letter cases of the tag, or no tag at all. The
        # content must look like a JSON object/array so fences for other
        # languages (```python ...) are not picked up.
        (r'```(?:json)?\s*([\[{].*?)\s*```', re.DOTALL | re.IGNORECASE),
    )
    for pattern, flags in fence_patterns:
        match = re.search(pattern, text, flags)
        if match:
            return match.group(1).strip()
    return None

#END USER METHODS

if __name__ == '__main__':
    # This server binds to every interface, and the debug-mode interactive
    # debugger lets a client execute arbitrary code, so debug mode is opt-in
    # (FLASK_DEBUG=1) instead of always on.
    debug_enabled = os.getenv("FLASK_DEBUG", "0").strip().lower() in ("1", "true", "yes")
    app.run(host='0.0.0.0', port=5000, debug=debug_enabled)  # Run on port 5000
