# llama3.2_vision / app.py
import streamlit as st
import os
from PIL import Image
import io
import base64
import requests
import json
from pathlib import Path

# Ensure assets directory exists
Path("./assets").mkdir(parents=True, exist_ok=True)


# Function to call Groq API directly (avoiding the groq package)
def call_groq_api(image_base64, model, prompt):
    api_key = os.environ.get("GROQ_API_KEY", "")
    if not api_key:
        return None, "Error: GROQ_API_KEY environment variable is not set."

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": prompt
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{image_base64}"
                        }
                    }
                ]
            }
        ],
        "temperature": 0.1,
        "max_tokens": 1000
    }

    try:
        response = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers=headers,
            json=payload
        )
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"], None
    except Exception as e:
        return None, f"Error calling Groq API: {str(e)}"

# Page configuration
st.set_page_config(
    page_title="Llama-3-2-90b-vision-preview",
    page_icon="👁️",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Add clear button to top right
col1, col2 = st.columns([6, 1])
with col1:
    st.markdown("""
    <img src="data:image/png;base64,{}" width="50" style="vertical-align: -12px;"> Llama-3-2-90b-vision-preview
    """.format(base64.b64encode(open("img/llama.png", "rb").read()).decode()), unsafe_allow_html=True)
with col2:
    if st.button("Clear 🗑️"):
        if "ocr_result" in st.session_state:
            del st.session_state["ocr_result"]
        st.rerun()

st.markdown("Extract structured text from images using Vision Models!", unsafe_allow_html=True)
st.markdown("---")
# Move upload controls to sidebar
with st.sidebar:
    st.header("Upload Image")
    uploaded_file = st.file_uploader("Choose an image...", type=["png", "jpg", "jpeg"])

    # Model selection
    st.subheader("Model Settings")
    model = st.selectbox(
        "Select Vision Model",
        ["llama-3.2-11b-vision-preview", "llama-3.2-90b-vision-preview"],
        index=0
    )
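
    # The selected ID is passed verbatim as the "model" field in the Groq request payload.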
    if uploaded_file is not None:
        # Display the uploaded image
        image = Image.open(uploaded_file)
        st.image(image, caption="Uploaded Image")

        if st.button("Extract Text 🔍", type="primary"):
            with st.spinner("Processing image..."):
                try:
                    # Convert image for API
                    buffered = io.BytesIO()
                    image.save(buffered, format="PNG")
                    img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")

                    # Prepare the prompt
                    prompt = """Analyze the text in the provided image. Extract all readable content
                    and present it in a structured Markdown format that is clear, concise,
                    and well-organized. Ensure proper formatting (e.g., headings, lists, or
                    code blocks) as necessary to represent the content effectively."""

                    # Call the API
                    result, error = call_groq_api(img_str, model, prompt)
                    if error:
                        st.error(error)
                    else:
                        st.session_state["ocr_result"] = result
                except Exception as e:
                    st.error(f"Error processing image: {str(e)}")

# Main content area for results
if "ocr_result" in st.session_state:
    st.markdown(st.session_state["ocr_result"])
else:
    st.info("Upload an image and click 'Extract Text' to see the results here.")
# Footer
st.markdown("---")
st.markdown("Made using Vision Models via Groq API")