ahm14 committed on
Commit 2bc09f4 · verified · 1 Parent(s): f4228a1

Update app.py

Files changed (1):
  1. app.py +73 -119
app.py CHANGED
@@ -2,9 +2,7 @@ import pandas as pd
 import streamlit as st
 import re
 import logging
-import nltk
 from docx import Document
-import io
 from langdetect import detect
 from transformers import pipeline
 from dotenv import load_dotenv
@@ -24,21 +22,7 @@ llm = ChatGroq(temperature=0.5, groq_api_key="GROQ_API_KEY", model_name="llama3-
 # Download required NLTK resources
 nltk.download("punkt")
 
-# Tone categories for fallback method
-tone_categories = {
-    "Emotional": ["urgent", "violence", "disappearances", "forced", "killing", "crisis", "concern"],
-    "Harsh": ["corrupt", "oppression", "failure", "repression", "exploit", "unjust", "authoritarian"],
-    "Somber": ["tragedy", "loss", "pain", "sorrow", "mourning", "grief", "devastation"],
-    "Motivational": ["rise", "resist", "mobilize", "inspire", "courage", "change", "determination"],
-    "Informative": ["announcement", "event", "scheduled", "update", "details", "protest", "statement"],
-    "Positive": ["progress", "unity", "hope", "victory", "together", "solidarity", "uplifting"],
-    "Angry": ["rage", "injustice", "fury", "resentment", "outrage", "betrayal"],
-    "Fearful": ["threat", "danger", "terror", "panic", "risk", "warning"],
-    "Sarcastic": ["brilliant", "great job", "amazing", "what a surprise", "well done", "as expected"],
-    "Hopeful": ["optimism", "better future", "faith", "confidence", "looking forward"]
-}
-
-# Frame categories for fallback method
+# Frame categories for fallback method (with Major, Significant, Minor focus)
 frame_categories = {
     "Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"],
     "Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"],
@@ -84,10 +68,6 @@ def extract_tone_fallback(text):
             detected_tones.add(category)
     return list(detected_tones) if detected_tones else ["Neutral"]
 
-# Extract hashtags
-def extract_hashtags(text):
-    return re.findall(r"#\w+", text)
-
 # Extract frames using Groq API (or fallback)
 def extract_frames(text):
     try:
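Note: extract_hashtags is removed here, although the report builder added later in this commit still reads data.get('Hashtags', []). For reference, the deleted helper runs standalone:

    import re

    def extract_hashtags(text):
        return re.findall(r"#\w+", text)

    print(extract_hashtags("March today #JusticeNow #Solidarity"))
    # ['#JusticeNow', '#Solidarity']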
@@ -98,27 +78,14 @@ def extract_frames(text):
         logging.error(f"Groq API error: {e}")
         return extract_frames_fallback(text)
 
-# Fallback method for frame extraction (with categorization of Major, Significant, Minor)
+# Fallback method for frame extraction
 def extract_frames_fallback(text):
     detected_frames = set()
-    frame_focus = {"Major Focus": [], "Significant Focus": [], "Minor Mention": []}
     text_lower = text.lower()
-
     for category, keywords in frame_categories.items():
-        keyword_count = sum(word in text_lower for word in keywords)
-        if keyword_count > 3:
-            frame_focus["Major Focus"].append(category)
-        elif keyword_count > 1:
-            frame_focus["Significant Focus"].append(category)
-        elif keyword_count > 0:
-            frame_focus["Minor Mention"].append(category)
-
-    # Return categorized frames
-    for focus, categories in frame_focus.items():
-        for category in categories:
-            detected_frames.add(f"{focus}: {category}")
-
-    return list(detected_frames)
+        if any(word in text_lower for word in keywords):
+            detected_frames.add(f"{category}: Major Focus")
+    return list(detected_frames) if detected_frames else ["No Focus"]
 
 # Extract captions from DOCX
 def extract_captions_from_docx(docx_file):
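Note: the rewritten fallback flags a category as soon as any keyword appears and labels every hit "Major Focus", where the removed version graded focus by keyword count. A self-contained sketch of the new behaviour (the trimmed category dictionary and sample sentence are illustrative only):

    frame_categories = {
        "Human Rights & Justice": ["rights", "law", "justice", "legal", "humanitarian"],
        "Political & State Accountability": ["government", "policy", "state", "corruption", "accountability"],
    }

    def extract_frames_fallback(text):
        detected_frames = set()
        text_lower = text.lower()
        for category, keywords in frame_categories.items():
            if any(word in text_lower for word in keywords):  # a single hit suffices
                detected_frames.add(f"{category}: Major Focus")
        return list(detected_frames) if detected_frames else ["No Focus"]

    print(extract_frames_fallback("The new policy denies basic rights"))
    # ['Political & State Accountability: Major Focus', 'Human Rights & Justice: Major Focus'] (order varies)

Since the keywords are checked with substring membership, "law" also fires inside words such as "flawed"; word-boundary matching would need a regex instead.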
@@ -134,48 +101,64 @@ def extract_captions_from_docx(docx_file):
             captions[current_post].append(text)
     return {post: " ".join(lines) for post, lines in captions.items() if lines}
 
-# Function to extract metadata from an Excel file
+# Extract metadata from Excel file
 def extract_metadata_from_excel(excel_file):
-    df = pd.read_excel(excel_file)
-    extracted_data = []
-
-    for index, row in df.iterrows():
-        post_data = {
-            "Post Number": f"Post {index + 1}",
-            "Date of Post": row.get("Date", "N/A"),
-            "Media Type": row.get("Media Type", "N/A"),
-            "Number of Pictures": row.get("Number of Pictures", 0),
-            "Number of Videos": row.get("Number of Videos", 0),
-            "Number of Audios": row.get("Number of Audios", 0),
-            "Likes": row.get("Likes", 0),
-            "Comments": row.get("Comments", 0),
-            "Tagged Audience": row.get("Tagged Audience", "No"),
-        }
-        extracted_data.append(post_data)
-
-    return extracted_data
+    try:
+        df = pd.read_excel(excel_file)
+        metadata = df.set_index("Post Number").to_dict(orient="index")
+        return metadata
+    except Exception as e:
+        logging.error(f"Error reading Excel file: {e}")
+        return {}
 
 # Merge metadata from Excel with the generated data
 def merge_metadata_with_generated_data(generated_data, excel_metadata):
-    for post_data in excel_metadata:
-        post_number = post_data["Post Number"]
-        if post_number in generated_data:
-            generated_data[post_number].update(post_data)
-        else:
-            generated_data[post_number] = post_data
+    for post, metadata in excel_metadata.items():
+        if post in generated_data:
+            generated_data[post].update(metadata)
     return generated_data
 
-# Function to create DOCX from extracted data
-def create_docx_from_data(extracted_data):
+# Function to create the final DOCX with structured output (without tables)
+def create_structured_output_without_table(merged_data, output_path):
     doc = Document()
-    for post_number, data in extracted_data.items():
-        doc.add_heading(post_number, level=1)
-        for key, value in data.items():
-            doc.add_paragraph(f"{key}: {value}")
-        doc.add_paragraph("\n")  # Add a line break between posts
-    return doc
-
-# Streamlit app
+    doc.add_heading('Extracted Social Media Data', 0)
+
+    # Loop through each post and add its structured data
+    for sr_no, (post, data) in enumerate(merged_data.items(), 1):
+        doc.add_heading(f'Post {sr_no}', level=1)
+
+        # Adding the details for each post
+        doc.add_paragraph(f"Date of Post: {data.get('Date of Post', 'N/A')}")
+        doc.add_paragraph(f"Media Type: {data.get('Media Type', 'N/A')}")
+        doc.add_paragraph(f"No of Pictures: {data.get('No of Pictures', 0)}")
+        doc.add_paragraph(f"No of Videos: {data.get('No of Videos', 0)}")
+        doc.add_paragraph(f"No of Audios: {data.get('No of Audios', 0)}")
+        doc.add_paragraph(f"Likes: {data.get('Likes', 'N/A')}")
+        doc.add_paragraph(f"Comments: {data.get('Comments', 'N/A')}")
+        doc.add_paragraph(f"Tagged Audience: {data.get('Tagged Audience', 'No')}")
+        doc.add_paragraph(f"Caption: {data.get('Full Caption', 'N/A')}")
+        doc.add_paragraph(f"Language of Caption: {data.get('Language', 'N/A')}")
+        doc.add_paragraph(f"Total No of Hashtags: {len(data.get('Hashtags', []))}")
+
+        if data.get('Hashtags'):
+            doc.add_paragraph(f"Hashtags: {', '.join(data['Hashtags'])}")
+        else:
+            doc.add_paragraph("Hashtags: N/A")
+
+        # Adding Frames for each post
+        doc.add_heading("Frames", level=2)
+        if data.get("Frames"):
+            for frame in data['Frames']:
+                doc.add_paragraph(f"- {frame}")
+        else:
+            doc.add_paragraph("No Frames available")
+
+        doc.add_paragraph("\n")  # Add a space between posts
+
+    # Save the document
+    doc.save(output_path)
+
+# Streamlit app setup
 st.title("AI-Powered Activism Message Analyzer")
 
 st.write("Enter text or upload a DOCX/Excel file for analysis:")
@@ -192,53 +175,24 @@ uploaded_excel = st.file_uploader("Upload an Excel file", type=["xlsx"])
 # Initialize output dictionary
 output_data = {}
 
-# Process Text Input
-if input_text:
-    output_data["Manual Input"] = {
-        "Full Caption": input_text,
-        "Language": detect_language(input_text),
-        "Tone": extract_tone(input_text),
-        "Hashtags": extract_hashtags(input_text),
-        "Frames": extract_frames(input_text),
-    }
-    st.success("Analysis completed for text input.")
-
-# Process DOCX file
+# Extract and process data based on file uploads or input text
 if uploaded_docx:
-    captions = extract_captions_from_docx(uploaded_docx)
-    for caption, text in captions.items():
-        output_data[caption] = {
-            "Full Caption": text,
-            "Language": detect_language(text),
-            "Tone": extract_tone(text),
-            "Hashtags": extract_hashtags(text),
-            "Frames": extract_frames(text),
-        }
-    st.success(f"Analysis completed for {len(captions)} posts from DOCX.")
-
-# Process Excel file
+    output_data = extract_captions_from_docx(uploaded_docx)
 if uploaded_excel:
-    excel_metadata = extract_metadata_from_excel(uploaded_excel)
-    st.success(f"Excel metadata extracted with {len(excel_metadata)} posts.")
-
-# Merge and display final data
-if uploaded_excel:
-    output_data = merge_metadata_with_generated_data(output_data, excel_metadata)
-
-# Display results
-if output_data:
-    st.write(output_data)
+    metadata = extract_metadata_from_excel(uploaded_excel)
+    output_data = merge_metadata_with_generated_data(output_data, metadata)
 
-# Create DOCX file for download
+# Generate output
 if output_data:
-    doc = create_docx_from_data(output_data)
-    docx_io = io.BytesIO()
-    doc.save(docx_io)
-    docx_io.seek(0)
-
-    st.download_button(
-        label="Download Extracted Data as DOCX",
-        data=docx_io,
-        file_name="extracted_data.docx",
-        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document"
-    )
+    # Process each post to extract frames
+    for post, data in output_data.items():
+        # Extract frames using Groq API or fallback method
+        frames = extract_frames(data)
+        data['Frames'] = frames
+
+    # Call the function to generate the DOCX report
+    create_structured_output_without_table(output_data, "final_output.docx")
+    st.write("The DOCX file has been created and saved!")
+    st.download_button("Download DOCX", data=open("final_output.docx", "rb"), file_name="final_output.docx")
+
+# Further refinement can be added for additional features as necessary
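Note: the new download path saves final_output.docx to disk and passes st.download_button an open file object that is never closed. The removed code streamed the document from memory instead; a sketch of that pattern, assuming doc is the python-docx Document built above:

    import io

    docx_io = io.BytesIO()
    doc.save(docx_io)   # python-docx can serialize to any file-like object
    docx_io.seek(0)

    st.download_button(
        label="Download DOCX",
        data=docx_io,
        file_name="final_output.docx",
        mime="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )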