"""Gradio chat UI that streams Azure OpenAI completions over retrieved context.

Flow: the user's query goes to a remote Gradio retrieval endpoint
(``gradio_client``), which returns chat ``messages`` plus source ``urls``;
the answer is then streamed token-by-token from an Azure OpenAI deployment
into the interface.
"""
import logging
import os

import gradio as gr
from dotenv import load_dotenv
from openai import AzureOpenAI

from client import gradio_client

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

load_dotenv()

AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_KEY = os.getenv("AZURE_OPENAI_KEY")
AZURE_API_VERSION = os.getenv("AZURE_API_VERSION")

client = AzureOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_KEY,
    api_version=AZURE_API_VERSION,
)


def get_streaming_response(query):
    """Stream the model's answer, yielding ``(partial_answer, urls)`` tuples.

    Args:
        query: The user's question, forwarded to the retrieval endpoint.

    Yields:
        ``(accumulated_markdown, urls)`` on each new token.  On failure,
        yields ``(error_message, None)`` so BOTH Gradio output components
        still receive a value (the original code yielded a bare string
        here, mismatching the two declared outputs).
    """
    try:
        # Remote retrieval step: returns chat messages (with context) and
        # the source URLs backing them.  NOTE(review): exact return schema
        # comes from the remote "/predict" endpoint — confirm against it.
        messages, urls = gradio_client.predict(query=query, api_name="/predict")
        logger.info("Starting streaming response...")
        response = client.chat.completions.create(
            messages=messages,
            model="urdu-llama",
            temperature=0.5,
            stream=True,
        )
        output = ""
        for chunk in response:
            # Some stream events (e.g. the trailing usage chunk) carry no
            # choices; skip them rather than index into an empty list.
            if chunk.choices:
                token = chunk.choices[0].delta.content
                if token:
                    output += token
                    yield output, urls
    except Exception as e:
        # UI boundary: log the full traceback, then surface a readable
        # message as a 2-tuple matching the interface's two outputs.
        logger.exception("Streaming response failed")
        yield f"Error: {str(e)}", None


iface = gr.Interface(
    fn=get_streaming_response,
    inputs=gr.Textbox(placeholder="Ask me anything...", show_label=False),
    outputs=[
        gr.Markdown(label="AI Response"),
        gr.JSON(label="URLs"),
    ],
    live=False,
)

if __name__ == "__main__":
    iface.launch()