ahm14 committed on
Commit 0d3d327 · verified · 1 Parent(s): bd7a5fe

Update app.py

Files changed (1)
  1. app.py +48 -118
app.py CHANGED
@@ -1,3 +1,4 @@
+import os
 import pandas as pd
 import streamlit as st
 import re
@@ -6,39 +7,32 @@ import nltk
 from docx import Document
 import io
 from langdetect import detect
-from transformers import pipeline
+from collections import Counter
 from dotenv import load_dotenv
 from langchain_groq import ChatGroq
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate
+from transformers import pipeline
 
 # Load environment variables
 load_dotenv()
 
+# Check if Groq API key is available
+GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+if not GROQ_API_KEY:
+    logging.error("Missing Groq API key. Please set the GROQ_API_KEY environment variable.")
+    st.error("API key is missing. Please provide a valid API key.")
+
 # Initialize logging
 logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
 
 # Initialize LLM (Groq API)
-llm = ChatGroq(temperature=0.5, groq_api_key="GROQ_API_KEY", model_name="llama3-8b-8192")
+llm = ChatGroq(temperature=0.5, groq_api_key=GROQ_API_KEY, model_name="llama3-8b-8192")
 
 # Download required NLTK resources
 nltk.download("punkt")
 
-# Tone categories for fallback method
-tone_categories = {
-    "Emotional": ["urgent", "violence", "disappearances", "forced", "killing", "crisis", "concern"],
-    "Harsh": ["corrupt", "oppression", "failure", "repression", "exploit", "unjust", "authoritarian"],
-    "Somber": ["tragedy", "loss", "pain", "sorrow", "mourning", "grief", "devastation"],
-    "Motivational": ["rise", "resist", "mobilize", "inspire", "courage", "change", "determination"],
-    "Informative": ["announcement", "event", "scheduled", "update", "details", "protest", "statement"],
-    "Positive": ["progress", "unity", "hope", "victory", "together", "solidarity", "uplifting"],
-    "Angry": ["rage", "injustice", "fury", "resentment", "outrage", "betrayal"],
-    "Fearful": ["threat", "danger", "terror", "panic", "risk", "warning"],
-    "Sarcastic": ["brilliant", "great job", "amazing", "what a surprise", "well done", "as expected"],
-    "Hopeful": ["optimism", "better future", "faith", "confidence", "looking forward"]
-}
-
-# Frame categories for fallback method
+# Frame categories with keywords
 frame_categories = {
     "Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"],
     "Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"],
@@ -48,13 +42,6 @@ frame_categories = {
     "Environmental Crisis & Activism": ["climate", "deforestation", "water", "pollution", "sustainability"],
     "Anti-Extremism & Anti-Violence": ["extremism", "violence", "hate speech", "radicalism", "mob attack"],
     "Social Inequality & Economic Disparities": ["class privilege", "labor rights", "economic", "discrimination"],
-    "Activism & Advocacy": ["justice", "rights", "demand", "protest", "march", "campaign", "freedom of speech"],
-    "Systemic Oppression": ["discrimination", "oppression", "minorities", "marginalized", "exclusion"],
-    "Intersectionality": ["intersecting", "women", "minorities", "struggles", "multiple oppression"],
-    "Call to Action": ["join us", "sign petition", "take action", "mobilize", "support movement"],
-    "Empowerment & Resistance": ["empower", "resist", "challenge", "fight for", "stand up"],
-    "Climate Justice": ["environment", "climate change", "sustainability", "biodiversity", "pollution"],
-    "Human Rights Advocacy": ["human rights", "violations", "honor killing", "workplace discrimination", "law reform"]
 }
 
 # Detect language
@@ -65,7 +52,7 @@ def detect_language(text):
         logging.error(f"Error detecting language: {e}")
         return "unknown"
 
-# Extract tone using Groq API (or fallback method)
+# Extract tone using Groq API
 def extract_tone(text):
     try:
         response = llm.chat([{"role": "system", "content": "Analyze the tone of the following text and provide descriptive tone labels."},
@@ -73,66 +60,40 @@ def extract_tone(text):
         return response["choices"][0]["message"]["content"].split(", ")
     except Exception as e:
         logging.error(f"Groq API error: {e}")
-        return extract_tone_fallback(text)
-
-# Fallback method for tone extraction
-def extract_tone_fallback(text):
-    detected_tones = set()
-    text_lower = text.lower()
-    for category, keywords in tone_categories.items():
-        if any(word in text_lower for word in keywords):
-            detected_tones.add(category)
-    return list(detected_tones) if detected_tones else ["Neutral"]
+        return ["Neutral"]
 
 # Extract hashtags
 def extract_hashtags(text):
     return re.findall(r"#\w+", text)
 
-# Extract frames using Groq API (with categorization: Major Focus, Significant Focus, Minor Mention)
-def extract_frames(text):
-    try:
-        # Prompt Groq to categorize frames and their focus
-        response = llm.chat([{"role": "system", "content": "Classify the following text into relevant activism frames and categorize each frame as Major Focus, Significant Focus, or Minor Mention."},
-                             {"role": "user", "content": text}])
-        return parse_frames(response["choices"][0]["message"]["content"])
-    except Exception as e:
-        logging.error(f"Groq API error: {e}")
-        return extract_frames_fallback(text)
-
-# Function to parse Groq response and categorize frames
-def parse_frames(response_text):
-    frame_data = {}
-    lines = response_text.splitlines()
-    for line in lines:
-        if "Major Focus" in line or "Significant Focus" in line or "Minor Mention" in line:
-            category = line.split(":")[0].strip()
-            frame = line.split(":")[1].strip()
-            if category not in frame_data:
-                frame_data[category] = []
-            frame_data[category].append(frame)
-    return frame_data
+# Categorize frames into Major, Significant, and Minor based on frequency
+def categorize_frames(frame_list):
+    frame_counter = Counter(frame_list)
+    categorized_frames = {"Major Focus": [], "Significant Focus": [], "Minor Mention": []}
+
+    sorted_frames = sorted(frame_counter.items(), key=lambda x: x[1], reverse=True)
+
+    for i, (frame, count) in enumerate(sorted_frames):
+        if i == 0:  # Highest frequency frame
+            categorized_frames["Major Focus"].append(frame)
+        elif i < 3:  # Top 3 most mentioned frames
+            categorized_frames["Significant Focus"].append(frame)
+        else:
+            categorized_frames["Minor Mention"].append(frame)
+
+    return categorized_frames
 
-# Fallback method for frame extraction (with categorization of Major, Significant, Minor)
+# Extract frames using keyword matching and categorize
 def extract_frames_fallback(text):
-    detected_frames = set()
-    frame_focus = {"Major Focus": [], "Significant Focus": [], "Minor Mention": []}
+    detected_frames = []
     text_lower = text.lower()
-
+
     for category, keywords in frame_categories.items():
-        keyword_count = sum(word in text_lower for word in keywords)
-        if keyword_count > 3:
-            frame_focus["Major Focus"].append(category)
-        elif keyword_count > 1:
-            frame_focus["Significant Focus"].append(category)
-        elif keyword_count > 0:
-            frame_focus["Minor Mention"].append(category)
-
-    # Return categorized frames
-    for focus, categories in frame_focus.items():
-        for category in categories:
-            detected_frames.add(f"{focus}: {category}")
-
-    return list(detected_frames)
+        keyword_count = sum(1 for word in keywords if word in text_lower)
+        if keyword_count > 0:
+            detected_frames.append(category)
+
+    return categorize_frames(detected_frames)
 
 # Extract captions from DOCX
 def extract_captions_from_docx(docx_file):
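
To make the new fallback concrete, here is a minimal standalone sketch of the keyword-matching path from the hunk above — a trimmed frame_categories plus categorize_frames — run on a hypothetical caption. sample_caption and the trimmed dict are illustrative, not part of the commit:

from collections import Counter

# Trimmed copy of the frame_categories dict from the diff
frame_categories = {
    "Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"],
    "Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"],
    "Environmental Crisis & Activism": ["climate", "deforestation", "water", "pollution", "sustainability"],
}

def categorize_frames(frame_list):
    # Rank by frequency: first place is Major Focus, the next two Significant Focus
    frame_counter = Counter(frame_list)
    categorized = {"Major Focus": [], "Significant Focus": [], "Minor Mention": []}
    ranked = sorted(frame_counter.items(), key=lambda x: x[1], reverse=True)
    for i, (frame, _count) in enumerate(ranked):
        if i == 0:
            categorized["Major Focus"].append(frame)
        elif i < 3:
            categorized["Significant Focus"].append(frame)
        else:
            categorized["Minor Mention"].append(frame)
    return categorized

def extract_frames_fallback(text):
    # A category is detected when any of its keywords occurs in the text
    text_lower = text.lower()
    detected = [cat for cat, kws in frame_categories.items() if any(w in text_lower for w in kws)]
    return categorize_frames(detected)

sample_caption = "New climate policy ignores water pollution and our legal rights."  # hypothetical input
print(extract_frames_fallback(sample_caption))
# {'Major Focus': ['Human Rights & Justice'],
#  'Significant Focus': ['Political & State Accountability', 'Environmental Crisis & Activism'],
#  'Minor Mention': []}

One design note: because extract_frames_fallback appends each category at most once, every Counter value is 1 and the "frequency" ranking reduces to dictionary insertion order; feeding per-keyword counts into categorize_frames would make the Major/Significant split track actual keyword frequency.
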
@@ -148,7 +109,7 @@ def extract_captions_from_docx(docx_file):
             captions[current_post].append(text)
     return {post: " ".join(lines) for post, lines in captions.items() if lines}
 
-# Function to extract metadata from an Excel file
+# Extract metadata from Excel file
 def extract_metadata_from_excel(excel_file):
     try:
         df = pd.read_excel(excel_file)
@@ -156,6 +117,7 @@ def extract_metadata_from_excel(excel_file):
         if not all(col in df.columns for col in required_columns):
             st.error("Excel file is missing required columns.")
             return []
+
         extracted_data = []
         for index, row in df.iterrows():
             post_data = {
@@ -175,28 +137,25 @@ def extract_metadata_from_excel(excel_file):
         logging.error(f"Error processing Excel file: {e}")
         return []
 
-
-# Merge metadata from Excel with the generated data
+# Merge metadata with generated analysis
 def merge_metadata_with_generated_data(generated_data, excel_metadata):
-    # Loop through the Excel data and merge it with the generated data
     for post_data in excel_metadata:
         post_number = post_data["Post Number"]
         if post_number in generated_data:
-            # If the post exists in both, merge Excel and Word data
             generated_data[post_number].update(post_data)
         else:
-            # If the post exists only in Excel, create a new entry in generated data
-            generated_data[post_number] = post_data
+            generated_data[post_number] = post_data  # Preserve metadata even if no text caption
+
     return generated_data
 
-# Function to create DOCX from merged data
+# Create DOCX file from extracted data
 def create_docx_from_data(extracted_data):
     doc = Document()
     for post_number, data in extracted_data.items():
         doc.add_heading(post_number, level=1)
         for key, value in data.items():
-            doc.add_paragraph(f"{key}: {value}")
-        doc.add_paragraph("\n")  # Add a line break between posts
+            doc.add_paragraph(f"**{key}:** {value}")
+        doc.add_paragraph("\n")
     return doc
 
 # Streamlit app
@@ -204,31 +163,21 @@ st.title("AI-Powered Activism Message Analyzer")
 
 st.write("Enter text or upload a DOCX/Excel file for analysis:")
 
-# Text input
 input_text = st.text_area("Input Text", height=200)
-
-# File upload (DOCX)
 uploaded_docx = st.file_uploader("Upload a DOCX file", type=["docx"])
-
-# File upload (Excel)
 uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"])
 
-
-# Initialize output dictionary
 output_data = {}
 
-# Process Text Input
 if input_text:
     output_data["Manual Input"] = {
         "Full Caption": input_text,
         "Language": detect_language(input_text),
         "Tone": extract_tone(input_text),
        "Hashtags": extract_hashtags(input_text),
-        "Frames": extract_frames(input_text),
+        "Frames": extract_frames_fallback(input_text),
    }
-    st.success("Analysis completed for text input.")
 
-# Process DOCX file
 if uploaded_docx:
     captions = extract_captions_from_docx(uploaded_docx)
     for caption, text in captions.items():
@@ -237,35 +186,16 @@ if uploaded_docx:
             "Language": detect_language(text),
             "Tone": extract_tone(text),
             "Hashtags": extract_hashtags(text),
-            "Frames": extract_frames(text),
+            "Frames": extract_frames_fallback(text),
         }
-    st.success(f"Analysis completed for {len(captions)} posts from DOCX.")
 
-# Process Excel file
-if uploaded_excel:
-    with st.spinner("Processing Excel file..."):
-        excel_metadata = extract_metadata_from_excel(uploaded_excel)
-        if excel_metadata:
-            st.success(f"Excel metadata extracted with {len(excel_metadata)} posts.")
-        else:
-            st.warning("No valid data extracted from the Excel file.")
-
-# Merge the Word and Excel data
 if uploaded_excel:
+    excel_metadata = extract_metadata_from_excel(uploaded_excel)
     output_data = merge_metadata_with_generated_data(output_data, excel_metadata)
 
-# Display results in collapsible sections for better UI
-if output_data:
-    for post_number, data in output_data.items():
-        with st.expander(post_number):
-            for key, value in data.items():
-                st.write(f"**{key}:** {value}")
-
-# Allow downloading the merged DOCX file
 if output_data:
     docx_output = create_docx_from_data(output_data)
     docx_io = io.BytesIO()
     docx_output.save(docx_io)
     docx_io.seek(0)
-    st.download_button(label="Download Merged Analysis as DOCX", data=docx_io, file_name="merged_analysis.docx")
+    st.download_button("Download Merged Analysis as DOCX", data=docx_io, file_name="merged_analysis.docx")
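
A final caveat on the tone path: langchain_groq's ChatGroq is normally driven through LangChain's message interface (invoke) rather than an OpenAI-style llm.chat([...]) call, so extract_tone as written will most likely raise and return the new ["Neutral"] default on every input. A LangChain-idiomatic sketch of the same prompt, assuming the llm instance from the diff — this rewrite is not part of the commit, and extract_tone_via_invoke is a hypothetical name:

from langchain_core.messages import HumanMessage, SystemMessage

def extract_tone_via_invoke(text, llm):
    # Same system prompt as extract_tone above, sent through ChatGroq.invoke
    response = llm.invoke([
        SystemMessage(content="Analyze the tone of the following text and provide descriptive tone labels."),
        HumanMessage(content=text),
    ])
    return response.content.split(", ")
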
 