from datetime import datetime
from functools import lru_cache
from typing import Dict, Union

import gradio as gr
import librosa
import torch
from deep_translator import GoogleTranslator
from gliner import GLiNER
from transformers import pipeline

# Available speech-recognition models.
# Keys are the display names offered in the UI model dropdown; values are the
# model identifiers handed to `initialize_pipeline` as the pipeline's `model`.
MODELS = {
    "Whisper Large V3 Turbo": "openai/whisper-large-v3-turbo",
    "Whisper Agri Model": "maliahson/whisper-agri"
}

@lru_cache(maxsize=4)
def initialize_pipeline(model_name):
    """
    Build (or reuse) the speech-recognition pipeline for a model.

    Loading a checkpoint is expensive, so the result is memoized per
    ``model_name``: repeated requests for the same model reuse the already
    constructed pipeline instead of loading it again on every call.

    Args:
        model_name: Model identifier passed to ``pipeline`` as ``model``.

    Returns:
        An automatic-speech-recognition pipeline that processes audio in
        30-second chunks, placed on CUDA device 0 when available, else CPU.
    """
    device = 0 if torch.cuda.is_available() else "cpu"
    return pipeline(
        task="automatic-speech-recognition",
        model=model_name,
        chunk_length_s=30,
        device=device,
    )

# Initialize GLiNER for information extraction.
# Loaded once at import time and kept on the CPU; `extract_information`
# uses this shared instance for every request.
gliner_model = GLiNER.from_pretrained("xomad/gliner-model-merge-large-v1.0").to("cpu")

def merge_entities(entities):
    """
    Merge consecutive entities that share a label and touch each other.

    Two neighbouring entities are merged when they have the same ``entity``
    label and the second one starts either exactly at the first one's ``end``
    or one position after it. The merged entity keeps the first entity's
    fields, joins the ``word`` values with a single space, and takes the
    ``end`` of the last merged entity.

    Args:
        entities: Sequence of dicts with at least the keys ``entity``,
            ``word``, ``start`` and ``end``, in the order they should be
            compared.

    Returns:
        A new list of new dicts. The input list and its dicts are left
        untouched, so callers can keep using the unmerged entities.
    """
    if not entities:
        return []

    merged = []
    # Copy so that extending the running entity never edits the caller's dict.
    current = dict(entities[0])
    for candidate in entities[1:]:
        if (
            candidate['entity'] == current['entity']
            and candidate['start'] in (current['end'], current['end'] + 1)
        ):
            current['word'] += ' ' + candidate['word']
            current['end'] = candidate['end']
        else:
            merged.append(current)
            current = dict(candidate)
    merged.append(current)
    return merged

def transcribe_audio(audio_path, model_key):
    """
    Transcribe a local audio file using the selected Whisper pipeline.

    Args:
        audio_path: Path of the audio file to transcribe.
        model_key: Key into ``MODELS`` selecting which model to use.

    Returns:
        The transcribed text on success. On any failure (no file given,
        unknown model key, loading or inference error) a string starting with
        "Error processing audio:" is returned instead of raising.
    """
    # Fail fast before any model is loaded when no file was supplied.
    if not audio_path:
        return "Error processing audio: no audio file provided"

    try:
        # Resolve the model name from the MODELS dictionary
        model_name = MODELS[model_key]
        pipe = initialize_pipeline(model_name)

        # Load the audio as mono, resampled to 16 kHz
        audio, _ = librosa.load(audio_path, sr=16000, mono=True)

        # Decoding language is fixed to Urdu
        result = pipe(audio, batch_size=8, generate_kwargs={"language": "urdu"})
        return result["text"]

    except Exception as e:
        return f"Error processing audio: {e}"

def translate_text_to_english(text):
    """
    Translate ``text`` to English with GoogleTranslator (source auto-detected).

    Returns the translated text, or a string beginning with
    "Error during translation:" when the translator raises.
    """
    try:
        translator = GoogleTranslator(source='auto', target='en')
        return translator.translate(text)
    except Exception as exc:
        return f"Error during translation: {exc}"

def extract_information(prompt: str, text: str, threshold: float, nested_ner: bool) -> Dict[str, Union[str, int, float]]:
    """
    Run the GLiNER model over the prompt followed by the text.

    The prompt and text are joined with a newline and searched for spans
    labelled "match". Neighbouring spans are then combined by
    ``merge_entities``.

    Returns a dict with the combined ``text`` and the merged ``entities``
    (each with ``entity``, ``word``, ``start``, ``end`` and a ``score`` that
    is always 0), or ``{"error": ...}`` when any step raises.
    """
    try:
        combined = prompt + "\n" + text
        predictions = gliner_model.predict_entities(
            combined, ["match"], flat_ner=not nested_ner, threshold=threshold
        )
        spans = []
        for found in predictions:
            spans.append(
                {
                    "entity": found["label"],
                    "word": found["text"],
                    "start": found["start"],
                    "end": found["end"],
                    "score": 0,
                }
            )
        return {"text": combined, "entities": merge_entities(spans)}
    except Exception as failure:
        return {"error": f"Information extraction failed: {failure}"}

def pipeline_fn(audio, model_key, prompt, threshold, nested_ner):
    """
    Combine transcription, translation, and information extraction in a single pipeline.

    Args:
        audio: Path of the audio file to process.
        model_key: Key into ``MODELS`` selecting the transcription model.
        prompt: Text placed in front of the translation before extraction.
        threshold: Score threshold forwarded to the entity extractor.
        nested_ner: Whether nested entities are allowed.

    Returns:
        Always a 3-tuple ``(transcription, translation, extraction)`` so it
        lines up with the three output components of the click handler.
        When a stage fails, its error message is placed in that stage's slot
        and the later slots are left empty (``""`` / ``None``).
    """
    transcription = transcribe_audio(audio, model_key)
    # Match the exact failure prefix: a transcript may itself contain "Error".
    if transcription.startswith("Error processing audio:"):
        return transcription, "", None

    translated_text = translate_text_to_english(transcription)
    if translated_text.startswith("Error during translation:"):
        return transcription, translated_text, None

    info_extraction = extract_information(prompt, translated_text, threshold, nested_ner)
    if "error" in info_extraction:
        # Present the failure in the text/entities shape the highlighted-text
        # output accepts instead of handing it a dict without those keys.
        info_extraction = {"text": info_extraction["error"], "entities": []}
    return transcription, translated_text, info_extraction

# Gradio Interface
# Layout: one row of inputs (audio file, model choice, extraction prompt),
# one row of extraction settings, one row of text outputs, one row with the
# highlighted entities, and a button that runs `pipeline_fn`.
with gr.Blocks(title="Audio Processing and Information Extraction") as interface:
    gr.Markdown("## Audio Transcription, Translation, and Information Extraction")
    
    with gr.Row():
        # type="filepath": the callback receives the audio as a path on disk.
        audio_input = gr.Audio(type="filepath", label="Upload Audio File")
        # Choices are the display names (keys) of MODELS.
        model_selector = gr.Dropdown(
            choices=list(MODELS.keys()),
            value="Whisper Large V3 Turbo",
            label="Select Model"
        )
        prompt_input = gr.Textbox(label="Prompt for Information Extraction", placeholder="Enter your prompt here")
    
    with gr.Row():
        # Threshold and nested-NER flag are forwarded to the entity extractor.
        threshold_slider = gr.Slider(0, 1, value=0.3, step=0.01, label="NER Threshold")
        nested_ner_checkbox = gr.Checkbox(label="Enable Nested NER")
    
    with gr.Row():
        # Read-only result fields.
        transcription_output = gr.Textbox(label="Transcription (Urdu)", interactive=False)
        translation_output = gr.Textbox(label="Translation (English)", interactive=False)
    
    with gr.Row():
        extraction_output = gr.HighlightedText(label="Extracted Information")

    process_button = gr.Button("Process Audio")

    # The order of `inputs` matches the parameters of `pipeline_fn`; three
    # output components are listed here.
    process_button.click(
        fn=pipeline_fn,
        inputs=[audio_input, model_selector, prompt_input, threshold_slider, nested_ner_checkbox],
        outputs=[transcription_output, translation_output, extraction_output],
    )

# Start the web UI only when this file is run as a script.
if __name__ == "__main__":
    interface.launch()