from flask import Flask, request, jsonify
import os
import streamlit as st
from langchain.chains import create_sql_query_chain
from langchain_google_genai import GoogleGenerativeAI
from sqlalchemy import create_engine
from sqlalchemy.exc import ProgrammingError
from langchain_community.utilities import SQLDatabase
import google.generativeai as genai
import pymysql
import pandas as pd
import numpy as np
import random
from io import StringIO
import json
import re
from pathlib import Path
import mimetypes
from flask import request
import wget

# Configure GenAI Key
from dotenv import load_dotenv
load_dotenv()


#SET API KEY
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
api_key = os.getenv("GOOGLE_API_KEY")

# Model Configuration
MODEL_CONFIG = {
  "temperature": 0.2,
  "top_p": 1,
  "top_k": 32,
  "max_output_tokens": 4096,
}

## Safety Settings of Model
safety_settings = [
  {
    "category": "HARM_CATEGORY_HARASSMENT",
    "threshold": "BLOCK_MEDIUM_AND_ABOVE"
  },
  {
    "category": "HARM_CATEGORY_HATE_SPEECH",
    "threshold": "BLOCK_MEDIUM_AND_ABOVE"
  },
  {
    "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
    "threshold": "BLOCK_MEDIUM_AND_ABOVE"
  },
  {
    "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
    "threshold": "BLOCK_MEDIUM_AND_ABOVE"
  }
]

#LOAD GEMINI MODEL WITH MODEL CONFIGURATIONS
model = genai.GenerativeModel(model_name = "gemini-2.5-flash-lite",
                              generation_config = MODEL_CONFIG,
                              safety_settings = safety_settings)


app = Flask(__name__)

UPLOAD_FOLDER = "/var/www/private/files/voucher/"


@app.route('/process_data', methods=['GET'])
def process_data():
    data = request.json
    input_value = data.get('input_value')

    # Perform your Python logic here
    result = f"Python processed: {input_value} and doubled it to {input_value * 2}"

    return jsonify({"status": "success", "result": result})

@app.route('/home', methods=['GET'])
def home():
    return jsonify({"status": "success"})

@app.route("/read-pdf-test", methods=["POST"])
def upload_pdf():
    if "file" not in request.files:
        return jsonify({"error": "No file part"}), 400

    pdf_file = request.files["file"]

    if pdf_file.filename == "":
        return jsonify({"error": "No selected file"}), 400

    # Save file
    save_path = os.path.join(UPLOAD_FOLDER, pdf_file.filename)
    pdf_file.save(save_path)

    return jsonify({
        "message": "PDF uploaded successfully",
        "filename": pdf_file.filename,
        "path": save_path
    }), 200

@app.route('/read-bl-pdf', methods=['POST'])
def read_bl_pdf():

    if "file" not in request.files:
        return jsonify({"error": "No file part"}), 400

    pdf_file = request.files["file"]

    if pdf_file.filename == "":
        return jsonify({"error": "No selected file"}), 400

    # Save file
    save_path = os.path.join(UPLOAD_FOLDER, pdf_file.filename)
    pdf_file.save(save_path)

    #EXTRACTING WHOLE DATA IN JSON FROM INVOICE
    system_prompt = """
                You are an expert in converting bill of lading document into a structured JSON format. 
                Your task is to extract information from the provided PDF and organize it into a single JSON object. 
                This JSON object must have a single parent tag named 'bill_of_lading_details'. 
                Inside 'bill_of_lading_details', there should be exactly seven nested objects with the following names: 'shipper_data', 'consignee_data', 'notify_party_data', 'delivery_agent_data', 'carrier_data', 'voyage_data', 'cargo_data', container_data' and 'bill_of_lading_metadata'.
                
                The 'shipper_data' section must be a JSON object with the following keys: 'name','address','gst_number','pan_number','mail_id','telephone'.

                The 'consignee_data' section must be a JSON object with the following keys: 'name','address','gst_number','pan_number','mail_id','telephone'.

                The 'notify_party_data' section must be a JSON object with the following keys: 'name','address','gst_number','pan_number','mail_id','telephone'.

                The 'delivery_agent_data' section must be a JSON object with the following keys: 'name','address','gst_number','pan_number','mail_id','telephone'.

                The 'carrier_data' section must be a JSON object with the following keys: 'name','address','gst_number','pan_number','mail_id','telephone'. 

                The 'voyage_data' section must be a JSON object with the following keys: 'voyage_number','vessel','place_of_receipt','port_of_loading','port_of_discharge','final_place_of_delivery'. For the 'port_of_loading' and 'port_of_discharge' keys, extract the city name only, without the country. You should also include united nations location codes for 'port_of_loading','port_of_discharge' keys as 'port_code_of_loading','port_code_of_discharge'. For the united nations location codes, you can refer this website https://unece.org/trade/cefact/unlocode-code-list-country-and-territory.
                For the 'voyage_number','vessel' keys, You should extract the following information from the PDF file:
                'vessel': This is the name of the ship, which is a combination of letters and sometimes numbers (e.g., 'TS KEELUNG').
                'voyage_number': This is the unique identifier for a specific trip. It typically includes a set of numbers followed by a letter (e.g., '25005W').
                You should follow below instructions for the keys 'voyage_number','vessel'.
                Do not combine the vessel name and voyage number.
                Separate the vessel name from the voyage number, even if they appear together in the source PDF file.
                Provide the output in a JSON format with a key for 'vessel' and a separate key for 'voyage_number'.

                The 'cargo_data' section must have the following keys: 'marks_and_numbers',description_of_packages_and_goods_as_stated_by_shipper','hs_code','total_gross_weight_kgs','total_tare_kgs','total_measurement_cbm', 'total_number_of_packages', 'type_of_package'.
                You should also include the standard packaging codes widely used in ocean export and import shipments bill of lading for 'type_of_package' key as 'package_code_type_of_package'. 
                For the key 'total_number_of_packages', You are a specialist in extracting data of number of packages and calculating total number of packages from bill of lading documents and You should identify and extract the number of packages from the provided PDF. This information can be found in sections such as 'No. of Packages or Shipping Units', 'Quantity of Packages', or within the 'Description of packages and goods' section.
                Your response should be a concise statement that includes the numerical value of the packages (e.g., '6').

                The 'container_data' section must be a list of objects. Each object in this list must have the following keys: 'container_number','container_size_type','seal_number','number_of_packages','type_of_package','gross_weight_kgs','tare_kgs','measurement_cbm'.
                The key 'container_size_type' should have exactly four characters, first two characters refers container size and next two characters refers container type.
                You should also include the standard packaging codes widely used in ocean export and import shipments bill of lading for 'type_of_package' key as 'package_code_type_of_package'.
                For the key 'number_of_packages', You are a specialist in extracting data of number of packages from bill of lading documents and You should identify and extract the number of packages from the provided PDF. This information can be found in sections such as 'No. of Packages or Shipping Units', 'Quantity of Packages', or within the 'Description of packages and goods' section against each container.
                Your response should be a concise statement that includes the numerical value of the packages (e.g., '6').
                Ensure the JSON includes a cargo_data section with the overall shipment details and a container_data section for each container. If the PDF provides a single container with all cargo details, populate the number_of_packages, type_of_package, gross_weight_kgs, and measurement_cbm keys within the container_data section using the corresponding values from the cargo_data section.
                You are a helpful AI assistant designed to extract information from shipping documents. When extracting container information, please follow these specific rules:
                1.  Container Number: Look for a unique alphanumeric code typically consisting of 4 letters followed by 7 digits. If the container number is combined with another number by a slash (e.g., 'CMAU6629295/R3239486'), the value before the slash is the container number.
                2.  Seal Number: The seal number is a separate identifier. If it is combined with the container number, the value after the slash is the seal number. Do not include the slash in either field.
                3.  Output Format: Provide the extracted data in 'container_data' section, with separate keys for 'container_number' and 'seal_number'.

                This 'bill_of_lading_metadata' tag must contain the following specific keys: 'document_type','bill_of_lading_number','non_negotiable','number_of_originals','place_of_issue','date_of_issue'. 
                When extracting date information for the keys 'date_of_issue', please follow these specific rules:
                1.  You must format this date as YYYY-MM-DD. 
                2.  Ensure the output is only the formatted date string, with no additional text or explanation. 
                3.  The final output should be suitable for direct use in a MySQL database.
                
                Extract the corresponding data from the invoice and populate these tags accordingly.
                """
    #system_prompt = "Convert Invoice data into json format with appropriate json tags as required for the data in image "
    #image_path = "one_bill.pdf"
    #image_path = request.args.get('file_name')
    image_path=save_path
    #image_path="/var/www/private/files/voucher/MSC.pdf"
    user_prompt = """Please extract the data from the attached bill of lading PDF and convert it to the specified JSON format."""

    try:
        response = gemini_output(image_path, system_prompt, user_prompt)
        #output = gemini_output(image_path, system_prompt, user_prompt)
        #output=output[7:-4]
        #return output

        if response.prompt_feedback and response.prompt_feedback.block_reason:
            # For simplicity, if blocked, we'll return an error JSON
            return json.dumps({"error": "Content blocked", "reason": response.prompt_feedback.block_reason})

        if not response.text:
            return json.dumps({"error": "No text content received from Gemini"})

        raw_gemini_output = response.text
        print(f"Raw Gemini output (before extraction):\n{raw_gemini_output}")
        # Try to extract JSON from markdown fences
        json_string_extracted = extract_json_from_markdown(raw_gemini_output)

        if json_string_extracted:
            try:
                # Validate JSON by parsing it, then return the original string
                parsed_data = json.loads(json_string_extracted)
                return json.dumps(parsed_data) # Re-dump to ensure it's a single line and valid JSON
            except json.JSONDecodeError as e:
                return json.dumps({"error": "JSON decoding failed after extraction", "message": str(e), "extracted_json": json_string_extracted, "raw_output": raw_gemini_output})
        else:
            # If no markdown fences found, try to parse the whole output as JSON
            # This handles cases where the model might sometimes skip fences or add other text
            try:
                parsed_data = json.loads(raw_gemini_output)
                return json.dumps(parsed_data)
            except json.JSONDecodeError as e:
                return json.dumps({"error": "No JSON markdown fences found and direct JSON parsing failed", "message": str(e), "raw_output": raw_gemini_output})

    except Exception as e:
        return json.dumps({"error": "API call failed", "message": str(e)})


@app.route('/read-pdf', methods=['POST'])
def read_pdf():

    if "file" not in request.files:
        return jsonify({"error": "No file part"}), 400

    pdf_file = request.files["file"]

    if pdf_file.filename == "":
        return jsonify({"error": "No selected file"}), 400

    # Save file
    save_path = os.path.join(UPLOAD_FOLDER, pdf_file.filename)
    pdf_file.save(save_path)

    #EXTRACTING WHOLE DATA IN JSON FROM INVOICE
    system_prompt = """
                You are an expert in converting shipping invoices into a structured JSON format.
                Your task is to extract information from the provided PDF and organize it into a single JSON object.
                This JSON object must have a single parent tag named 'invoice_details'.
                Inside 'invoice_details', there should be exactly four nested objects with the following names: 'supplier_data', 'customer_data', 'charges_data' and 'invoice_metadata'.

                The 'supplier_data' section must be a JSON object with the following keys: 'name', 'address', 'gstin', 'cin', and 'pan'.

                The 'customer_data' section must be a JSON object with the following keys: 'client_no', 'invoice_to', 'address', 'state', 'gstin', and 'pan'.

                The 'charges_data' section must be a list of objects.
                Each object in this list must have the following keys: 'charge', 'hsn_code', 'qty_x_rate_curr', 'currency', 'total_curr', 'roe', 'total_inr', 'sgst_ugst_percent', 'sgst_ugst_amount', 'cgst_percent', 'cgst_amount', 'igst_percent', and 'igst_amount'.

                This 'invoice_metadata' tag must contain the following specific keys: 'invoice_no', 'invoice_date', 'place_of_supply', 'vessel', 'voyage', 'pol', 'pod', 'bl_no', 'taxable_value_inr', 'total_gst_amount_inr', and 'total_invoice_amount_inr'.
                Within 'invoice_metadata', you must also include a nested JSON object called 'currency_total' with the keys 'amount_in_eur', 'amount_in_usd', and 'amount_in_inr'.

                Extract the corresponding data from the invoice and populate these tags accordingly.
                """
    #system_prompt = "Convert Invoice data into json format with appropriate json tags as required for the data in image "
    #image_path = "one_bill.pdf"
    #image_path = request.args.get('file_name')
    image_path=save_path
    #image_path="/var/www/private/files/voucher/MSC.pdf"
    user_prompt = """Please extract the data from the attached invoice PDF and convert it to the specified JSON format."""

    try:
        response = gemini_output(image_path, system_prompt, user_prompt)
        #output = gemini_output(image_path, system_prompt, user_prompt)
        #output=output[7:-4]
        #return output

        if response.prompt_feedback and response.prompt_feedback.block_reason:
            # For simplicity, if blocked, we'll return an error JSON
            return json.dumps({"error": "Content blocked", "reason": response.prompt_feedback.block_reason})

        if not response.text:
            return json.dumps({"error": "No text content received from Gemini"})

        raw_gemini_output = response.text
        print(f"Raw Gemini output (before extraction):\n{raw_gemini_output}")
        # Try to extract JSON from markdown fences
        json_string_extracted = extract_json_from_markdown(raw_gemini_output)

        if json_string_extracted:
            try:
                # Validate JSON by parsing it, then return the original string
                parsed_data = json.loads(json_string_extracted)
                return json.dumps(parsed_data) # Re-dump to ensure it's a single line and valid JSON
            except json.JSONDecodeError as e:
                return json.dumps({"error": "JSON decoding failed after extraction", "message": str(e), "extracted_json": json_string_extracted, "raw_output": raw_gemini_output})
        else:
            # If no markdown fences found, try to parse the whole output as JSON
            # This handles cases where the model might sometimes skip fences or add other text
            try:
                parsed_data = json.loads(raw_gemini_output)
                return json.dumps(parsed_data)
            except json.JSONDecodeError as e:
                return json.dumps({"error": "No JSON markdown fences found and direct JSON parsing failed", "message": str(e), "raw_output": raw_gemini_output})

    except Exception as e:
        return json.dumps({"error": "API call failed", "message": str(e)})


#USER METHODS
#DEFINE PDF FORMAT TO INPUT IN GEMINI
def read_pdf_bytes(pdf_path: str):
    """
    Reads a PDF file, gets its raw bytes, and formats it into a dictionary
    with the 'application/pdf' MIME type. This format is often used for APIs
    that accept PDF file inputs.

    Args:
        pdf_path (str): The path to the input PDF file.

    Returns:
        list: A list containing a dictionary with 'mime_type' and 'data' keys.
              Returns an empty list if the file is not found or not a PDF.
    """


    url = pdf_path
    #save_path = wget.download(url,"/var/gemini-ai/pdf-upload/")
    #file_name='pdf-upload/7689206597a01a-158412-7-MSCpdf.pdf'
    pdf_file = Path(pdf_path)

    if not pdf_file.exists():
        raise FileNotFoundError(f"Could not find PDF file: {pdf_file}")

    # Explicitly set MIME type for PDF
    mime_type = "application/pdf"

    # Optional: You could add a check here to ensure it's likely a PDF
    # based on extension, though reading bytes will work for any file.
    # For a stricter check, you might inspect the first few bytes (magic numbers)
    # or rely on a more robust library if validation is critical.
    if pdf_file.suffix.lower() != ".pdf":
        print(f"Warning: The file '{pdf_file.name}' does not have a .pdf extension. "
              f"Proceeding assuming it's a PDF, but this might indicate an issue.")
        # Alternatively, you could raise an error here if you only want to process .pdf files
        # raise ValueError(f"File '{pdf_file.name}' is not a PDF based on its extension.")


    try:
        # Read the entire content of the PDF file as bytes
        pdf_data = pdf_file.read_bytes()
    except Exception as e:
        raise IOError(f"Error reading bytes from PDF file '{pdf_path}': {e}")


    pdf_parts = [
        {
            "mime_type": mime_type,
            "data": pdf_data
        }
    ]
    return pdf_parts

#GEMINI MODEL OUTPUT
def gemini_output(image_path, system_prompt, user_prompt):

#    image_info = image_format(image_path)
    image_info = read_pdf_bytes(image_path)
    input_prompt= [system_prompt, image_info[0], user_prompt]
    response = model.generate_content(input_prompt)
    #return response.text
    return response

#EXTRACT JSON FROM MARKDOWN FENCES (```json ... ```).
def extract_json_from_markdown(text):
    """
    Extracts a JSON string enclosed in markdown code fences (```json ... ```).
    Returns the extracted JSON string or None if not found/invalid.
    """
    # Regex to find content between ```json and ```
    match = re.search(r'```json\s*(.*?)\s*```', text, re.DOTALL)
    if match:
        return match.group(1).strip()
    return None

#END USER METHODS

if __name__ == '__main__':
    app.run(host='0.0.0.0',port=5000,debug=True) # Run on port 5000