import os
import gradio as gr
import requests
import io
import re
from PIL import Image
from groq import Groq

# Load API keys from environment variables (never hard-code secrets)
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
HF_API_KEY = os.getenv("HF_TOKEN")

if not GROQ_API_KEY or not HF_API_KEY:
    raise ValueError("GROQ_API_KEY and HF_TOKEN must be set as environment variables.")

# Initialize the Groq API client
client = Groq(api_key=GROQ_API_KEY)

# Public Hugging Face image model, served via the Inference API
HF_IMAGE_MODEL = "stabilityai/stable-diffusion-2-1"


# Function 1: Tamil Audio to Tamil Text (Transcription)
def transcribe_audio(audio_path):
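    """Transcribe a Tamil audio file into Tamil text with Groq's Whisper endpoint."""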
    if not audio_path:
        return "Error: Please upload an audio file."

    try:
        with open(audio_path, "rb") as file:
            transcription = client.audio.transcriptions.create(
                file=(os.path.basename(audio_path), file.read()),
                model="whisper-large-v3",
                language="ta",  # Tamil
                response_format="verbose_json",
            )
        return transcription.text.strip()

    except Exception as e:
        return f"Error in transcription: {str(e)}"


# Function 2: Tamil Text to English Translation
def translate_tamil_to_english(tamil_text):
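    """Translate Tamil text into English using a Groq-hosted chat model."""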
    if not tamil_text:
        return "Error: Please enter Tamil text for translation."

    prompt = f"Translate this Tamil text to English: {tamil_text}\nGive only the translated text as output."

    try:
        response = client.chat.completions.create(
            model="llama3-8b-8192",  # Groq-supported model
            messages=[{"role": "user", "content": prompt}],
        )
        translated_text = response.choices[0].message.content.strip()

        # Strip stray <think></think> tags that some models emit
        translated_text = re.sub(r"</?think>", "", translated_text).strip()
        return translated_text

    except Exception as e:
        return f"Error in translation: {str(e)}"


# Function 3: English Text to Image Generation (Hugging Face)
def generate_image(english_text):
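    """Generate an image from an English prompt via the Hugging Face Inference API."""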
    if not english_text:
        return "Error: Please enter a description for image generation."

    try:
        headers = {"Authorization": f"Bearer {HF_API_KEY}"}
        payload = {"inputs": english_text}

        response = requests.post(
            f"https://api-inference.huggingface.co/models/{HF_IMAGE_MODEL}",
            headers=headers,
            json=payload,
            timeout=120,  # image generation can be slow; don't hang forever
        )
        response.raise_for_status()
        image_bytes = response.content

        # The API can return a JSON body instead of image bytes on some
        # failures, so verify we actually received an image before decoding
        content_type = response.headers.get("content-type", "")
        if not image_bytes or not content_type.startswith("image"):
            return "Error: API did not return an image."

        return Image.open(io.BytesIO(image_bytes))

    except Exception as e:
        return f"Error in image generation: {str(e)}"


# Function 4: English Text to AI-Generated Text
def generate_text(english_text):
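    """Generate a free-form AI response to an English prompt via a Groq chat model."""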
    if not english_text:
        return "Error: Please enter a prompt."

    try:
        response = client.chat.completions.create(
            model="llama3-8b-8192",  # Groq-supported model
            messages=[{"role": "user", "content": english_text}],
        )

        # Extract the response content
        generated_text = response.choices[0].message.content.strip()

        # Remove unwanted XML-like tags
        cleaned_text = re.sub(r"</?think>", "", generated_text).strip()

        return cleaned_text

    except Exception as e:
        return f"Error in text generation: {str(e)}"


# Combined Function to Process All Steps
def process_audio(audio_path):
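    """Run the full pipeline: Tamil audio → Tamil text → English text → image + text.

    Each step short-circuits on error and returns the partial results so the
    UI can still display whatever succeeded.
    """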
    # Step 1: Tamil Audio → Tamil Text
    tamil_text = transcribe_audio(audio_path)
    if "Error" in tamil_text:
        return tamil_text, None, None, None

    # Step 2: Tamil Text → English Text
    english_text = translate_tamil_to_english(tamil_text)
    if "Error" in english_text:
        return tamil_text, english_text, None, None

    # Step 3: English Text → Image
    image = generate_image(english_text)
    if isinstance(image, str):  # generate_image returns a string only on error
        return tamil_text, english_text, None, None

    # Step 4: English Text → AI-Generated Text
    generated_text = generate_text(english_text)
    return tamil_text, english_text, image, generated_text
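
# Quick smoke test without the UI (hypothetical file name; replace with a
# real Tamil audio file before running):
# tamil, english, image, text = process_audio("sample_tamil.wav")
# print(tamil, english, text, sep="\n")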


# Create Gradio Interface
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath", label="Upload Tamil Audio"),
    outputs=[
        gr.Textbox(label="Transcribed Tamil Text"),
        gr.Textbox(label="Translated English Text"),
        gr.Image(label="Generated Image"),
        gr.Textbox(label="Generated Text from English Prompt"),
    ],
    title="Tamil Audio to AI Processing Pipeline",
    description="Upload a Tamil audio file and get transcription, translation, image generation, and further text generation.",
)

# Launch the Gradio app
if __name__ == "__main__":
    iface.launch()