BSJ2004 committed
Commit 47af8ed · verified · 1 Parent(s): 565e727

Upload 11 files

Files changed (11)
  1. .gitattributes +1 -35
  2. .gitignore +50 -0
  3. Dockerfile +49 -0
  4. README.md +143 -13
  5. Spacefile +8 -0
  6. api.py +332 -0
  7. app.py +496 -0
  8. generate_json_output.py +55 -0
  9. healthcheck.py +171 -0
  10. requirements.txt +39 -0
  11. utils.py +1132 -0
.gitattributes CHANGED
@@ -1,35 +1 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.map filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,50 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual Environment
24
+ venv/
25
+ ENV/
26
+ env/
27
+ .env
28
+
29
+ # IDE
30
+ .idea/
31
+ .vscode/
32
+ *.swp
33
+ *.swo
34
+ .DS_Store
35
+
36
+ # Logs
37
+ logs/
38
+ *.log
39
+
40
+ # Audio files
41
+ audio_files/
42
+ *.mp3
43
+ *.wav
44
+
45
+ # Jupyter
46
+ .ipynb_checkpoints
47
+
48
+ # Model caches
49
+ .cache/
50
+ .local/
Dockerfile ADDED
@@ -0,0 +1,49 @@
1
+ FROM python:3.9-slim
2
+
3
+ WORKDIR /app
4
+
5
+ # Install dependencies
6
+ COPY requirements.txt .
7
+ RUN pip install --no-cache-dir -r requirements.txt
8
+
9
+ # Install additional dependencies needed for NLP tasks and TTS
10
+ RUN apt-get update && apt-get install -y \
11
+ build-essential \
12
+ curl \
13
+ software-properties-common \
14
+ git \
15
+ ffmpeg \
16
+ espeak \
17
+ libespeak-dev \
18
+ alsa-utils \
19
+ python3-pyaudio \
20
+ libasound2-dev \
21
+ && rm -rf /var/lib/apt/lists/*
22
+
23
+ # Copy app files
24
+ COPY . .
25
+
26
+ # Create directory for audio files
27
+ RUN mkdir -p audio_files
28
+
29
+ # Set environment variables
30
+ ENV PYTHONDONTWRITEBYTECODE=1
31
+ ENV PYTHONUNBUFFERED=1
32
+
33
+ # Download NLTK data
34
+ RUN python -c "import nltk; nltk.download('punkt'); nltk.download('stopwords')"
35
+
36
+ # Expose ports
37
+ EXPOSE 8000
38
+ EXPOSE 8501
39
+
40
+ # Create a shell script to run both services
41
+ RUN echo '#!/bin/bash\n\
42
+ uvicorn api:app --host 0.0.0.0 --port 8000 &\n\
43
+ streamlit run app.py --server.port 8501 --server.address 0.0.0.0\n'\
44
+ > /app/start.sh
45
+
46
+ RUN chmod +x /app/start.sh
47
+
48
+ # Start the application
49
+ CMD ["/app/start.sh"]
README.md CHANGED
@@ -1,13 +1,143 @@
1
- ---
2
- title: Text1123
3
- emoji: 🏢
4
- colorFrom: purple
5
- colorTo: indigo
6
- sdk: streamlit
7
- sdk_version: 1.43.2
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ # News Summarization and Text-to-Speech Application
2
+
3
+ A web-based application that extracts news articles related to companies, performs sentiment analysis, conducts comparative analysis, and generates a text-to-speech output in Hindi.
4
+
5
+ ## Features
6
+
7
+ - **News Extraction**: Scrapes at least 10 unique news articles about a given company using BeautifulSoup
8
+ - **Sentiment Analysis**: Analyzes the sentiment of each article (positive, negative, neutral)
9
+ - **Comparative Analysis**: Compares sentiment across articles to derive insights
10
+ - **Text-to-Speech**: Converts summarized content to Hindi speech
11
+ - **User Interface**: Simple web interface built with Streamlit
12
+ - **API Communication**: Backend and frontend communicate through APIs
13
+
14
+ ## Project Structure
15
+
16
+ ```
17
+ .
18
+ ├── app.py # Main Streamlit application
19
+ ├── api.py # API endpoints
20
+ ├── utils.py # Utility functions for scraping, sentiment analysis, etc.
21
+ ├── healthcheck.py # Script to verify all dependencies and services
22
+ ├── requirements.txt # Project dependencies
23
+ ├── Dockerfile # Docker configuration for deployment
24
+ ├── Spacefile # Hugging Face Spaces configuration
25
+ └── README.md # Project documentation
26
+ ```
27
+
28
+ ## Setup Instructions
29
+
30
+ 1. **Clone the repository**:
31
+ ```
32
+ git clone https://github.com/yourusername/news-summarization-tts.git
33
+ cd news-summarization-tts
34
+ ```
35
+
36
+ 2. **Create a virtual environment** (recommended):
37
+ ```
38
+ python -m venv venv
39
+ source venv/bin/activate # On Windows: venv\Scripts\activate
40
+ ```
41
+
42
+ 3. **Install dependencies**:
43
+ ```
44
+ pip install -r requirements.txt
45
+ ```
46
+
47
+ 4. **Install system dependencies** (for text-to-speech functionality):
48
+ - On Ubuntu/Debian:
49
+ ```
50
+ sudo apt-get install espeak ffmpeg
51
+ ```
52
+ - On Windows:
53
+ Download and install espeak from http://espeak.sourceforge.net/download.html
54
+
55
+ 5. **Run the healthcheck** (to verify all dependencies are working):
56
+ ```
57
+ python healthcheck.py
58
+ ```
59
+
60
+ 6. **Run the API server**:
61
+ ```
62
+ uvicorn api:app --reload
63
+ ```
64
+
65
+ 7. **Run the Streamlit application** (in a separate terminal):
66
+ ```
67
+ streamlit run app.py
68
+ ```
69
+
70
+ ## Models Used
71
+
72
+ - **News Summarization**: Extractive summarization using NLTK and NetworkX
73
+ - **Sentiment Analysis**: VADER for sentiment analysis and Hugging Face Transformers
74
+ - **Translation**: Google Translate API via deep-translator library
75
+ - **Text-to-Speech**: Google Text-to-Speech (gTTS) and pyttsx3 as fallback for Hindi conversion
76
+
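For readers curious how the extractive summarizer works, here is a minimal sketch of the approach named above (NLTK sentence tokenization, a cosine-similarity matrix, and NetworkX PageRank). The function and variable names are illustrative only and are not the ones used in `utils.py`; it assumes the NLTK `punkt` and `stopwords` data have already been downloaded.

```python
import numpy as np
import networkx as nx
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

def sentence_similarity(s1, s2, stop_words):
    """Bag-of-words cosine similarity between two sentences."""
    w1 = [w.lower() for w in word_tokenize(s1) if w.lower() not in stop_words]
    w2 = [w.lower() for w in word_tokenize(s2) if w.lower() not in stop_words]
    vocab = list(set(w1 + w2))
    v1 = np.array([w1.count(w) for w in vocab], dtype=float)
    v2 = np.array([w2.count(w) for w in vocab], dtype=float)
    if not v1.any() or not v2.any():
        return 0.0
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))

def summarize(text, num_sentences=3):
    """Return the top-ranked sentences of `text`, kept in original order."""
    stop_words = set(stopwords.words("english"))
    sentences = sent_tokenize(text)
    if len(sentences) <= num_sentences:
        return text
    # Pairwise similarity matrix over sentences.
    sim = np.zeros((len(sentences), len(sentences)))
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i != j:
                sim[i][j] = sentence_similarity(sentences[i], sentences[j], stop_words)
    # Rank sentences with PageRank over the similarity graph.
    scores = nx.pagerank(nx.from_numpy_array(sim))
    top = sorted(range(len(sentences)), key=lambda i: scores[i], reverse=True)[:num_sentences]
    return " ".join(sentences[i] for i in sorted(top))
```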
77
+ ## API Documentation
78
+
79
+ ### Endpoints
80
+
81
+ - `POST /api/get_news`: Fetches news articles about a company
82
+ - Request body: `{"company_name": "Tesla"}`
83
+ - Returns a list of articles with metadata
84
+
85
+ - `POST /api/analyze_sentiment`: Performs sentiment analysis on articles
86
+ - Request body: `{"articles": [article_list]}`
87
+ - Returns sentiment analysis for each article
88
+
89
+ - `POST /api/generate_speech`: Converts text to Hindi speech
90
+ - Request body: `{"text": "summarized_text"}`
91
+ - Returns a URL to the generated audio file
92
+
93
+ - `POST /api/complete_analysis`: Performs complete analysis including fetching news, sentiment analysis, and generating speech
94
+ - Request body: `{"company_name": "Tesla"}`
95
+ - Returns complete analysis results
96
+
97
+ ## Assumptions & Limitations
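As a quick reference, a call against the complete-analysis endpoint might look like the sketch below (assuming the API server from `api.py` is running locally on port 8000, as in the setup steps above; the company name is only an example):

```python
import requests

# Hypothetical client call; the endpoint and payload follow the documentation above.
resp = requests.post(
    "http://localhost:8000/api/complete_analysis",
    json={"company_name": "Tesla"},
    timeout=300,  # the first run can be slow while transformer models download
)
resp.raise_for_status()
report = resp.json()
print(report["Final Sentiment Analysis"])
```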
98
+
99
+ - The application scrapes publicly available news articles that don't require JavaScript rendering
100
+ - Sentiment analysis accuracy depends on the model used and may not capture context-specific nuances
101
+ - Hindi translation and TTS quality may vary based on technical terms
102
+ - The application requires an internet connection to fetch news articles and use cloud-based services
103
+
104
+ ## Troubleshooting
105
+
106
+ If you encounter any issues:
107
+
108
+ 1. Run the healthcheck script to verify all dependencies are working:
109
+ ```
110
+ python healthcheck.py
111
+ ```
112
+
113
+ 2. Check that you have all the required system dependencies installed (espeak, ffmpeg).
114
+
115
+ 3. If you encounter issues with specific components:
116
+ - Translation service requires an internet connection
117
+ - Text-to-speech uses gTTS by default, but falls back to pyttsx3 if needed
118
+ - Transformer models may take time to download on first run
119
+
120
+ ## Deployment
121
+
122
+ This application is deployed on Hugging Face Spaces: [Link to deployment]
123
+
124
+ ### Using Docker
125
+
126
+ You can also run the application using Docker:
127
+
128
+ ```
129
+ docker build -t news-summarization-tts .
130
+ docker run -p 8501:8501 -p 8000:8000 news-summarization-tts
131
+ ```
132
+
133
+ ## Future Improvements
134
+
135
+ - Add support for more languages
136
+ - Implement advanced NLP techniques for better summarization
137
+ - Improve the user interface with more interactive visualizations
138
+ - Add historical data analysis for tracking sentiment over time
139
+ - Enhance TTS quality with dedicated Hindi speech models
140
+
141
+ ## License
142
+
143
+ MIT
Spacefile ADDED
@@ -0,0 +1,8 @@
1
+ # Spacefile Docs: https://huggingface.co/docs/hub/spaces-config-reference
2
+ title: News Summarization and TTS
3
+ emoji: 📰
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: docker
7
+ app_port: 8501
8
+ pinned: false
api.py ADDED
@@ -0,0 +1,332 @@
1
+ from fastapi import FastAPI, HTTPException, Response, File, UploadFile, Form
2
+ from fastapi.responses import FileResponse, JSONResponse
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ from pydantic import BaseModel
5
+ from typing import List, Dict, Any, Optional
6
+ import os
7
+ import json
8
+ import uuid
9
+ import asyncio
10
+ import uvicorn
11
+ from utils import (search_news, analyze_article_sentiment, perform_comparative_analysis,
12
+ translate_to_hindi, text_to_speech, prepare_final_report, NewsArticle)
13
+
14
+ # Initialize FastAPI app
15
+ app = FastAPI(
16
+ title="News Summarization and TTS API",
17
+ description="API for extracting news, performing sentiment analysis, and generating Hindi TTS audio",
18
+ version="1.0.0"
19
+ )
20
+
21
+ # Add CORS middleware
22
+ app.add_middleware(
23
+ CORSMiddleware,
24
+ allow_origins=["*"], # Allow all origins
25
+ allow_credentials=True,
26
+ allow_methods=["*"], # Allow all methods
27
+ allow_headers=["*"], # Allow all headers
28
+ )
29
+
30
+ # Define request/response models
31
+ class CompanyRequest(BaseModel):
32
+ company_name: str
33
+
34
+ class TextToSpeechRequest(BaseModel):
35
+ text: str
36
+ output_filename: Optional[str] = None
37
+
38
+ class SentimentAnalysisRequest(BaseModel):
39
+ articles: List[Dict[str, Any]]
40
+
41
+ class NewsResponse(BaseModel):
42
+ articles: List[Dict[str, Any]]
43
+
44
+ class SentimentResponse(BaseModel):
45
+ sentiment_analysis: Dict[str, Any]
46
+
47
+ class TextToSpeechResponse(BaseModel):
48
+ audio_file: str
49
+ text: str
50
+
51
+ # Create a directory for audio files if it doesn't exist
52
+ os.makedirs("audio_files", exist_ok=True)
53
+
54
+ # API endpoints
55
+ @app.get("/")
56
+ async def root():
57
+ """Root endpoint to check if API is running."""
58
+ return {"message": "News Summarization and TTS API is running"}
59
+
60
+ @app.post("/api/get_news", response_model=NewsResponse)
61
+ async def get_news(request: CompanyRequest):
62
+ """Fetch news articles about a specific company."""
63
+ try:
64
+ company_name = request.company_name
65
+ articles = search_news(company_name)
66
+
67
+ if not articles:
68
+ raise HTTPException(status_code=404, detail=f"No news articles found for {company_name}")
69
+
70
+ # Convert NewsArticle objects to dictionaries
71
+ article_data = [article.to_dict() for article in articles]
72
+
73
+ return {"articles": article_data}
74
+
75
+ except Exception as e:
76
+ raise HTTPException(status_code=500, detail=str(e))
77
+
78
+ @app.post("/api/analyze_sentiment", response_model=SentimentResponse)
79
+ async def analyze_sentiment(request: SentimentAnalysisRequest):
80
+ """Analyze sentiment of provided articles."""
81
+ try:
82
+ # Convert dictionaries back to NewsArticle objects
83
+ articles = []
84
+ for article_dict in request.articles:
85
+ article = NewsArticle(
86
+ title=article_dict["title"],
87
+ url=article_dict["url"],
88
+ content=article_dict["content"],
89
+ summary=article_dict.get("summary", ""),
90
+ source=article_dict.get("source", ""),
91
+ date=article_dict.get("date", ""),
92
+ sentiment=article_dict.get("sentiment", ""),
93
+ topics=article_dict.get("topics", [])
94
+ )
95
+ articles.append(article)
96
+
97
+ # Perform detailed sentiment analysis for each article
98
+ detailed_sentiment = [analyze_article_sentiment(article) for article in articles]
99
+
100
+ # Perform comparative analysis
101
+ comparative_analysis = perform_comparative_analysis(articles)
102
+
103
+ return {
104
+ "sentiment_analysis": {
105
+ "detailed_sentiment": detailed_sentiment,
106
+ "comparative_analysis": comparative_analysis
107
+ }
108
+ }
109
+
110
+ except Exception as e:
111
+ raise HTTPException(status_code=500, detail=str(e))
112
+
113
+ @app.post("/api/generate_speech", response_model=TextToSpeechResponse)
114
+ async def generate_speech(request: TextToSpeechRequest):
115
+ """Convert text to Hindi speech."""
116
+ try:
117
+ text = request.text
118
+
119
+ # Generate a unique filename if not provided
120
+ output_filename = request.output_filename
121
+ if not output_filename:
122
+ unique_id = uuid.uuid4().hex
123
+ output_filename = f"audio_files/{unique_id}.mp3"
124
+ elif not output_filename.startswith("audio_files/"):
125
+ output_filename = f"audio_files/{output_filename}"
126
+
127
+ # Translate text to Hindi
128
+ hindi_text = translate_to_hindi(text)
129
+
130
+ # Convert text to speech
131
+ audio_file = text_to_speech(hindi_text, output_filename)
132
+
133
+ if not audio_file:
134
+ raise HTTPException(status_code=500, detail="Failed to generate audio file")
135
+
136
+ return {
137
+ "audio_file": audio_file,
138
+ "text": hindi_text
139
+ }
140
+
141
+ except Exception as e:
142
+ raise HTTPException(status_code=500, detail=str(e))
143
+
144
+ @app.post("/api/complete_analysis")
145
+ async def complete_analysis(request: CompanyRequest):
146
+ """Perform complete analysis for a company."""
147
+ try:
148
+ company_name = request.company_name
149
+
150
+ # Log the start of analysis
151
+ print(f"Starting complete analysis for company: {company_name}")
152
+
153
+ # Step 1: Get news articles
154
+ print("Step 1: Fetching news articles...")
155
+ articles = search_news(company_name, num_articles=5) # Increased from default 3 to 5
156
+ print(f"Found {len(articles)} articles for {company_name}")
157
+
158
+ if not articles:
159
+ raise HTTPException(status_code=404, detail=f"No news articles found for {company_name}")
160
+
161
+ # Step 2: Perform comparative analysis
162
+ print("Step 2: Performing comparative analysis...")
163
+ comparative_analysis = perform_comparative_analysis(articles)
164
+ print("Comparative analysis completed")
165
+
166
+ # Step 3: Prepare final report
167
+ print("Step 3: Preparing final report...")
168
+ final_report = prepare_final_report(company_name, articles, comparative_analysis)
169
+ print("Final report prepared")
170
+
171
+ # Step 4: Generate Hindi TTS
172
+ print("Step 4: Generating Hindi TTS...")
173
+ unique_id = uuid.uuid4().hex
174
+ output_filename = f"audio_files/{unique_id}.mp3"
175
+
176
+ # Use the Hindi summary for TTS
177
+ hindi_text = final_report["Hindi Summary"]
178
+ print(f"Converting Hindi text to speech (length: {len(hindi_text)} characters)")
179
+
180
+ audio_file = text_to_speech(hindi_text, output_filename)
181
+
182
+ # Format the response to match the example output exactly
183
+ formatted_response = {
184
+ "Company": company_name,
185
+ "Articles": final_report["Articles"],
186
+ "Comparative Sentiment Score": {
187
+ "Sentiment Distribution": comparative_analysis["Sentiment Distribution"],
188
+ "Coverage Differences": comparative_analysis["Coverage Differences"],
189
+ "Topic Overlap": {
190
+ "Common Topics": comparative_analysis["Topic Overlap"]["Common Topics Across All"],
191
+ }
192
+ },
193
+ "Final Sentiment Analysis": comparative_analysis["Final Sentiment Analysis"],
194
+ }
195
+
196
+ # Format the unique topics by article to match the expected output exactly
197
+ unique_topics = comparative_analysis["Topic Overlap"]["Unique Topics By Article"]
198
+ for article_idx, topics in unique_topics.items():
199
+ article_num = int(article_idx) + 1
200
+ formatted_response["Comparative Sentiment Score"]["Topic Overlap"][f"Unique Topics in Article {article_num}"] = topics
201
+
202
+ # If we don't have more than 1 article, create some example comparisons to match format
203
+ if len(articles) <= 1:
204
+ formatted_response["Comparative Sentiment Score"]["Coverage Differences"] = [
205
+ {
206
+ "Comparison": f"Only one article about {company_name} was found, limiting comparative analysis.",
207
+ "Impact": "Unable to compare coverage across multiple sources for more comprehensive insights."
208
+ }
209
+ ]
210
+
211
+ # Add audio information
212
+ if not audio_file:
213
+ print("Warning: Failed to generate audio file")
214
+ formatted_response["Audio"] = "Failed to generate audio"
215
+ else:
216
+ print(f"Audio file generated: {audio_file}")
217
+ formatted_response["Audio"] = f"[Play Hindi Speech]"
218
+ # Store the actual audio file path in a hidden field
219
+ formatted_response["_audio_file_path"] = audio_file
220
+
221
+ # Add the Hindi Summary to the response as well (needed for rendering in Streamlit)
222
+ formatted_response["Hindi Summary"] = final_report["Hindi Summary"]
223
+
224
+ print("Complete analysis finished successfully")
225
+ return formatted_response
226
+
227
+ except HTTPException as he:
228
+ # Re-raise HTTP exceptions
229
+ print(f"HTTP Exception: {he.detail}")
230
+ raise
231
+
232
+ except Exception as e:
233
+ # For any other exception, provide detailed error information
234
+ import traceback
235
+ error_trace = traceback.format_exc()
236
+ error_message = f"Error processing request: {str(e)}"
237
+ print(f"ERROR: {error_message}")
238
+ print(f"Traceback: {error_trace}")
239
+
240
+ # Return a more user-friendly error message
241
+ user_message = "An error occurred during analysis. "
242
+
243
+ if "timeout" in str(e).lower():
244
+ user_message += "There was a timeout when connecting to news sources. Please try again or try another company name."
245
+ elif "connection" in str(e).lower():
246
+ user_message += "There was a connection issue with one of the news sources. Please check your internet connection."
247
+ elif "not found" in str(e).lower():
248
+ user_message += f"No information could be found for {company_name}. Please try another company name."
249
+ else:
250
+ user_message += "Please try again with a different company name or check the server logs for more details."
251
+
252
+ raise HTTPException(status_code=500, detail=user_message)
253
+
254
+ @app.get("/api/audio/{file_name}")
255
+ async def get_audio(file_name: str):
256
+ """Serve audio files."""
257
+ file_path = f"audio_files/{file_name}"
258
+
259
+ # Make sure the audio_files directory exists
260
+ os.makedirs("audio_files", exist_ok=True)
261
+
262
+ if not os.path.exists(file_path):
263
+ print(f"Audio file not found: {file_path}")
264
+ # Check if any audio files exist in the directory
265
+ audio_files = os.listdir("audio_files") if os.path.exists("audio_files") else []
266
+ print(f"Available audio files: {audio_files}")
267
+ raise HTTPException(status_code=404, detail=f"Audio file {file_name} not found")
268
+
269
+ try:
270
+ # Verify the file can be opened and is not corrupt
271
+ with open(file_path, "rb") as f:
272
+ file_size = os.path.getsize(file_path)
273
+ print(f"Serving audio file: {file_path} (size: {file_size} bytes)")
274
+ if file_size == 0:
275
+ raise HTTPException(status_code=500, detail="Audio file is empty")
276
+ except Exception as e:
277
+ print(f"Error accessing audio file {file_path}: {str(e)}")
278
+ raise HTTPException(status_code=500, detail=f"Error accessing audio file: {str(e)}")
279
+
280
+ # Set appropriate headers for audio file
281
+ headers = {
282
+ "Cache-Control": "no-cache, no-store, must-revalidate",
283
+ "Pragma": "no-cache",
284
+ "Expires": "0",
285
+ "Content-Disposition": f"attachment; filename={file_name}"
286
+ }
287
+
288
+ # Determine the correct media type based on file extension
289
+ media_type = "audio/mpeg"
290
+ if file_name.lower().endswith(".wav"):
291
+ media_type = "audio/wav"
292
+
293
+ return FileResponse(
294
+ path=file_path,
295
+ media_type=media_type,
296
+ headers=headers,
297
+ filename=file_name
298
+ )
299
+
300
+ @app.post("/api/example_format")
301
+ async def get_example_format(request: CompanyRequest):
302
+ """
303
+ Get analysis results in the example format specified.
304
+ This endpoint provides results that exactly match the requested output format.
305
+ """
306
+ try:
307
+ # Get the base analysis
308
+ company_name = request.company_name
309
+ result = await complete_analysis(request)
310
+
311
+ # Format it to match the example output
312
+ formatted_output = {
313
+ "Company": result["Company"],
314
+ "Articles": result["Articles"],
315
+ "Comparative Sentiment Score": {
316
+ "Sentiment Distribution": result["Comparative Sentiment Score"]["Sentiment Distribution"],
317
+ "Coverage Differences": result["Comparative Sentiment Score"]["Coverage Differences"],
318
+ "Topic Overlap": result["Comparative Sentiment Score"]["Topic Overlap"]
319
+ },
320
+ "Final Sentiment Analysis": result["Final Sentiment Analysis"],
321
+ "Audio": "[Play Hindi Speech]" if result.get("Audio") else "No audio available"
322
+ }
323
+
324
+ return formatted_output
325
+
326
+ except HTTPException:
327
+ raise
328
+ except Exception as e:
329
+ raise HTTPException(status_code=500, detail=f"Error generating example format: {str(e)}")
330
+
331
+ if __name__ == "__main__":
332
+ uvicorn.run("api:app", host="0.0.0.0", port=8000, reload=True)
app.py ADDED
@@ -0,0 +1,496 @@
1
+ import streamlit as st
2
+ import requests
3
+ import pandas as pd
4
+ import json
5
+ import os
6
+ import matplotlib.pyplot as plt
7
+ import seaborn as sns
8
+ import base64
9
+ from io import BytesIO
10
+ from PIL import Image, ImageEnhance
11
+ import time
12
+ from typing import Dict, Any, List
13
+
14
+ # API Base URL - Change this to match your deployment
15
+ API_BASE_URL = "http://localhost:8000"
16
+
17
+ # New function to generate the example output format
18
+ def generate_example_output(company_name: str) -> str:
19
+ """
20
+ Generate output in the example format for the given company.
21
+ Returns the formatted JSON as a string.
22
+ """
23
+ try:
24
+ # Make API request to get the analysis data
25
+ url = f"{API_BASE_URL}/api/complete_analysis"
26
+ response = requests.post(url, json={"company_name": company_name})
27
+ response.raise_for_status()
28
+ data = response.json()
29
+
30
+ # Format the data to match the example output format exactly
31
+ formatted_output = {
32
+ "Company": data["Company"],
33
+ "Articles": data["Articles"],
34
+ "Comparative Sentiment Score": {
35
+ "Sentiment Distribution": data["Comparative Sentiment Score"]["Sentiment Distribution"],
36
+ "Coverage Differences": data["Comparative Sentiment Score"]["Coverage Differences"],
37
+ "Topic Overlap": data["Comparative Sentiment Score"]["Topic Overlap"]
38
+ },
39
+ "Final Sentiment Analysis": data["Final Sentiment Analysis"],
40
+ "Audio": "[Play Hindi Speech]" if data.get("Audio") else "No audio available"
41
+ }
42
+
43
+ # Convert to JSON string with proper formatting
44
+ return json.dumps(formatted_output, indent=2)
45
+
46
+ except Exception as e:
47
+ return json.dumps({
48
+ "error": str(e),
49
+ "message": "Failed to generate example output"
50
+ }, indent=2)
51
+
52
+ # Function to run in terminal mode
53
+ def run_terminal_mode():
54
+ """Run the app in terminal mode to output JSON"""
55
+ print("News Analysis Terminal Mode")
56
+ company_name = input("Enter company name: ")
57
+ print(f"Analyzing {company_name}...")
58
+ output = generate_example_output(company_name)
59
+ print(output)
60
+
61
+ # Check if run directly or imported
62
+ if __name__ == "__main__":
63
+ # Check if terminal mode is requested via command line args
64
+ import sys
65
+ if len(sys.argv) > 1 and sys.argv[1] == "--terminal":
66
+ run_terminal_mode()
67
+ else:
68
+ # Continue with the Streamlit app
69
+
70
+ # App title and description
71
+ st.set_page_config(
72
+ page_title="News Summarization & TTS",
73
+ page_icon="📰",
74
+ layout="wide",
75
+ initial_sidebar_state="expanded"
76
+ )
77
+
78
+ # Custom CSS for better UI
79
+ st.markdown("""
80
+ <style>
81
+ .main-header {
82
+ font-size: 2.5rem;
83
+ font-weight: 700;
84
+ color: #1E3A8A;
85
+ margin-bottom: 1rem;
86
+ }
87
+ .sub-header {
88
+ font-size: 1.5rem;
89
+ font-weight: 600;
90
+ color: #2563EB;
91
+ margin-top: 1rem;
92
+ margin-bottom: 0.5rem;
93
+ }
94
+ .card {
95
+ padding: 1.5rem;
96
+ border-radius: 0.5rem;
97
+ background-color: #F8FAFC;
98
+ border: 1px solid #E2E8F0;
99
+ margin-bottom: 1rem;
100
+ }
101
+ .positive {
102
+ color: #059669;
103
+ font-weight: 600;
104
+ }
105
+ .negative {
106
+ color: #DC2626;
107
+ font-weight: 600;
108
+ }
109
+ .neutral {
110
+ color: #6B7280;
111
+ font-weight: 600;
112
+ }
113
+ .topic-tag {
114
+ display: inline-block;
115
+ padding: 0.25rem 0.5rem;
116
+ border-radius: 2rem;
117
+ background-color: #E5E7EB;
118
+ color: #1F2937;
119
+ font-size: 0.75rem;
120
+ margin-right: 0.5rem;
121
+ margin-bottom: 0.5rem;
122
+ }
123
+ .audio-container {
124
+ width: 100%;
125
+ padding: 1rem;
126
+ background-color: #F3F4F6;
127
+ border-radius: 0.5rem;
128
+ margin-top: 1rem;
129
+ }
130
+ .info-text {
131
+ font-size: 0.9rem;
132
+ color: #4B5563;
133
+ }
134
+ .article-title {
135
+ font-size: 1.2rem;
136
+ font-weight: 600;
137
+ color: #111827;
138
+ margin-bottom: 0.5rem;
139
+ margin-top: 0.5rem;
140
+ }
141
+ .article-summary {
142
+ font-size: 0.9rem;
143
+ color: #374151;
144
+ margin-bottom: 0.5rem;
145
+ }
146
+ .article-meta {
147
+ font-size: 0.8rem;
148
+ color: #6B7280;
149
+ margin-bottom: 0.5rem;
150
+ }
151
+ .section-divider {
152
+ height: 1px;
153
+ background-color: #E5E7EB;
154
+ margin: 1.5rem 0;
155
+ }
156
+ .chart-container {
157
+ background-color: white;
158
+ padding: 1rem;
159
+ border-radius: 0.5rem;
160
+ border: 1px solid #E2E8F0;
161
+ }
162
+ </style>
163
+ """, unsafe_allow_html=True)
164
+
165
+ # Function to make API requests
166
+ def make_api_request(endpoint: str, data: Dict[str, Any] = None, method: str = "POST") -> Dict[str, Any]:
167
+ """Make API request to the backend."""
168
+ url = f"{API_BASE_URL}{endpoint}"
169
+
170
+ try:
171
+ if method == "GET":
172
+ response = requests.get(url)
173
+ else:
174
+ response = requests.post(url, json=data)
175
+
176
+ response.raise_for_status()
177
+ return response.json()
178
+ except requests.exceptions.ConnectionError:
179
+ st.error("⚠️ Connection Error: Cannot connect to the API server. Please ensure the API server is running at " + API_BASE_URL)
180
+ return {}
181
+ except requests.exceptions.Timeout:
182
+ st.error("⚠️ Timeout Error: The request took too long to complete. Please try again with a different company name.")
183
+ return {}
184
+ except requests.exceptions.HTTPError as e:
185
+ if e.response.status_code == 404:
186
+ st.error("⚠️ No articles found for this company. Please try another company name.")
187
+ elif e.response.status_code == 500:
188
+ # Try to get detailed error message
189
+ try:
190
+ error_detail = e.response.json().get("detail", "Unknown server error")
191
+ st.error(f"⚠️ Server Error: {error_detail}")
192
+ except:
193
+ st.error("⚠️ Internal Server Error: Something went wrong on the server. Please try again later.")
194
+ else:
195
+ st.error(f"⚠️ HTTP Error: {str(e)}")
196
+ return {}
197
+ except Exception as e:
198
+ st.error(f"⚠️ Error: {str(e)}")
199
+ return {}
200
+
201
+ # Function to create sentiment color
202
+ def get_sentiment_color(sentiment: str) -> str:
203
+ """Return CSS class for sentiment."""
204
+ if sentiment == "Positive":
205
+ return "positive"
206
+ elif sentiment == "Negative":
207
+ return "negative"
208
+ else:
209
+ return "neutral"
210
+
211
+ # Function to create visualization for sentiment distribution
212
+ def plot_sentiment_distribution(sentiment_data: Dict[str, int]):
213
+ """Create and display a bar chart for sentiment distribution."""
214
+ labels = ["Positive", "Neutral", "Negative"]
215
+ values = [sentiment_data[label] for label in labels]
216
+ colors = ["#059669", "#6B7280", "#DC2626"]
217
+
218
+ fig, ax = plt.subplots(figsize=(10, 6))
219
+ ax.bar(labels, values, color=colors)
220
+ ax.set_title("Sentiment Distribution", fontsize=16, fontweight='bold')
221
+ ax.set_ylabel("Number of Articles", fontsize=12)
222
+ ax.grid(axis='y', linestyle='--', alpha=0.7)
223
+
224
+ # Add value labels on top of bars
225
+ for i, v in enumerate(values):
226
+ ax.text(i, v + 0.1, str(v), ha='center', fontweight='bold')
227
+
228
+ return fig
229
+
230
+ # Function to display article information
231
+ def display_article(article: Dict[str, Any], index: int):
232
+ """Display article information in a card layout."""
233
+ st.markdown(f"<div class='card'>", unsafe_allow_html=True)
234
+
235
+ # Article title and sentiment
236
+ sentiment = article.get("Sentiment", "Neutral")
237
+ sentiment_class = get_sentiment_color(sentiment)
238
+
239
+ st.markdown(f"<h3 class='article-title'>{index+1}. {article['Title']}</h3>", unsafe_allow_html=True)
240
+ st.markdown(f"<span class='{sentiment_class}'>{sentiment}</span>", unsafe_allow_html=True)
241
+
242
+ # Article summary
243
+ st.markdown("<div class='article-summary'>", unsafe_allow_html=True)
244
+ st.markdown(f"{article.get('Summary', 'No summary available.')}", unsafe_allow_html=True)
245
+ st.markdown("</div>", unsafe_allow_html=True)
246
+
247
+ # Topics
248
+ if "Topics" in article and article["Topics"]:
249
+ st.markdown("<div>", unsafe_allow_html=True)
250
+ for topic in article["Topics"]:
251
+ st.markdown(f"<span class='topic-tag'>{topic}</span>", unsafe_allow_html=True)
252
+ st.markdown("</div>", unsafe_allow_html=True)
253
+
254
+ st.markdown("</div>", unsafe_allow_html=True)
255
+
256
+ # App layout
257
+ st.markdown("<h1 class='main-header'>📰 News Summarization & Text-to-Speech</h1>", unsafe_allow_html=True)
258
+ st.markdown("""
259
+ <p class='info-text'>
260
+ This application extracts news articles about a company, performs sentiment analysis, conducts comparative analysis,
261
+ and generates a text-to-speech output in Hindi. Enter a company name to get started.
262
+ </p>
263
+ """, unsafe_allow_html=True)
264
+
265
+ # Sidebar
266
+ st.sidebar.image("https://cdn-icons-png.flaticon.com/512/2593/2593073.png", width=100)
267
+ st.sidebar.title("News Analysis Settings")
268
+
269
+ # Company selection
270
+ company_input_method = st.sidebar.radio(
271
+ "Select company input method:",
272
+ options=["Text Input", "Choose from List"]
273
+ )
274
+
275
+ if company_input_method == "Text Input":
276
+ company_name = st.sidebar.text_input("Enter Company Name:", placeholder="e.g., Tesla")
277
+ else:
278
+ companies = ["Apple", "Google", "Microsoft", "Amazon", "Tesla", "Meta", "Netflix", "Uber", "Airbnb", "Twitter"]
279
+ company_name = st.sidebar.selectbox("Select Company:", companies)
280
+
281
+ # Analysis settings
282
+ max_articles = st.sidebar.slider("Maximum Articles to Analyze:", min_value=5, max_value=20, value=10)
283
+ st.sidebar.markdown("---")
284
+
285
+ # Analysis button
286
+ analyze_button = st.sidebar.button("Analyze Company News", type="primary")
287
+
288
+ # Audio playback settings
289
+ st.sidebar.markdown("## Audio Settings")
290
+ audio_speed = st.sidebar.select_slider("TTS Speech Speed:", options=["Slow", "Normal", "Fast"], value="Normal")
291
+ st.sidebar.markdown("---")
292
+
293
+ # Add option to see JSON in example format
294
+ st.sidebar.markdown("## Developer Options")
295
+ show_json = st.sidebar.checkbox("Show JSON output in example format")
296
+ st.sidebar.markdown("---")
297
+
298
+ # About section
299
+ with st.sidebar.expander("About This App"):
300
+ st.markdown("""
301
+ This application performs:
302
+ - News extraction from multiple sources
303
+ - Sentiment analysis of the content
304
+ - Topic identification and comparative analysis
305
+ - Text-to-speech conversion to Hindi
306
+
307
+ Built with Streamlit, FastAPI, and various NLP tools.
308
+ """)
309
+
310
+ # Main content area
311
+ if analyze_button and company_name:
312
+ with st.spinner(f"Analyzing news for {company_name}... This may take a minute"):
313
+ # Perform complete analysis
314
+ response = make_api_request(
315
+ "/api/complete_analysis",
316
+ {"company_name": company_name}
317
+ )
318
+
319
+ if not response:
320
+ st.error("Failed to retrieve data. Please try again.")
321
+ elif "detail" in response:
322
+ st.error(response["detail"])
323
+ else:
324
+ # Display company header
325
+ st.markdown(f"<h2 class='sub-header'>Analysis Results for {response['Company']}</h2>", unsafe_allow_html=True)
326
+
327
+ # Display sentiment summary
328
+ col1, col2 = st.columns([2, 1])
329
+
330
+ with col1:
331
+ st.markdown("<div class='card'>", unsafe_allow_html=True)
332
+ st.markdown("<h3 class='sub-header'>Sentiment Overview</h3>", unsafe_allow_html=True)
333
+ st.markdown(f"{response['Final Sentiment Analysis']}")
334
+ st.markdown("</div>", unsafe_allow_html=True)
335
+
336
+ with col2:
337
+ sentiment_data = response["Comparative Sentiment Score"]["Sentiment Distribution"]
338
+ fig = plot_sentiment_distribution(sentiment_data)
339
+ st.pyplot(fig)
340
+
341
+ st.markdown("<div class='section-divider'></div>", unsafe_allow_html=True)
342
+
343
+ # Display Hindi TTS audio
344
+ if "Audio" in response and response["Audio"]:
345
+ st.markdown("<h3 class='sub-header'>Hindi Audio Summary</h3>", unsafe_allow_html=True)
346
+
347
+ audio_message = response["Audio"]
348
+
349
+ if audio_message == "Failed to generate audio":
350
+ st.warning("Hindi audio could not be generated. However, you can still read the Hindi text below.")
351
+ else:
352
+ try:
353
+ # Check if the response contains the actual audio file path
354
+ audio_file_path = response.get("_audio_file_path")
355
+
356
+ if audio_file_path:
357
+ # Extract the filename
358
+ audio_filename = os.path.basename(audio_file_path)
359
+ audio_url = f"{API_BASE_URL}/api/audio/{audio_filename}"
360
+ else:
361
+ # If no path is provided, just display a message
362
+ st.info("Audio is available but the path was not provided.")
363
+ audio_url = None
364
+
365
+ if audio_url:
366
+ # Attempt to download the audio file
367
+ audio_response = requests.get(audio_url)
368
+ if audio_response.status_code == 200:
369
+ # Save temporarily
370
+ temp_audio_path = f"temp_audio_{os.path.basename(audio_url)}"
371
+ with open(temp_audio_path, "wb") as f:
372
+ f.write(audio_response.content)
373
+
374
+ # Play from local file
375
+ st.markdown("<div class='audio-container'>", unsafe_allow_html=True)
376
+ st.audio(temp_audio_path, format="audio/mp3")
377
+
378
+ # Display audio download link
379
+ st.markdown(f"<a href='{audio_url}' download='hindi_summary.mp3'>Download Hindi Audio</a>", unsafe_allow_html=True)
380
+
381
+ # Clean up temp file (optional)
382
+ # os.remove(temp_audio_path) # Uncomment to delete after use
383
+ else:
384
+ st.warning(f"Unable to load audio file (HTTP {audio_response.status_code}). You can still read the Hindi text below.")
385
+ else:
386
+ st.info("Hindi audio summary would be available here.")
387
+ except Exception as e:
388
+ st.warning(f"Error playing audio: {str(e)}. You can still read the Hindi text below.")
389
+
390
+ # Display the Hindi text with better formatting
391
+ with st.expander("Show Hindi Text"):
392
+ hindi_text = response.get("Hindi Summary", "Hindi text not available.")
393
+
394
+ # Format the text for better readability
395
+ paragraphs = hindi_text.split("। ")
396
+
397
+ for paragraph in paragraphs:
398
+ if paragraph.strip():
399
+ # Add a period if it doesn't end with one
400
+ if not paragraph.strip().endswith("।"):
401
+ paragraph += "।"
402
+ st.markdown(f"<p style='font-size: 16px; margin-bottom: 10px;'>{paragraph}</p>", unsafe_allow_html=True)
403
+
404
+ st.markdown("</div>", unsafe_allow_html=True)
405
+
406
+ st.markdown("<div class='section-divider'></div>", unsafe_allow_html=True)
407
+
408
+ # Display articles
409
+ st.markdown("<h3 class='sub-header'>News Articles</h3>", unsafe_allow_html=True)
410
+ articles = response.get("Articles", [])
411
+
412
+ if not articles:
413
+ st.info("No articles found for this company.")
414
+ else:
415
+ for i, article in enumerate(articles):
416
+ display_article(article, i)
417
+
418
+ st.markdown("<div class='section-divider'></div>", unsafe_allow_html=True)
419
+
420
+ # Display comparative analysis
421
+ st.markdown("<h3 class='sub-header'>Comparative Analysis</h3>", unsafe_allow_html=True)
422
+
423
+ # Display topic overlap
424
+ topic_data = response["Comparative Sentiment Score"]["Topic Overlap"]
425
+
426
+ col1, col2 = st.columns(2)
427
+
428
+ with col1:
429
+ st.markdown("<div class='card'>", unsafe_allow_html=True)
430
+ st.markdown("<h4>Common Topics</h4>", unsafe_allow_html=True)
431
+
432
+ common_topics = topic_data.get("Common Topics Across All", [])
433
+ if common_topics:
434
+ for topic in common_topics:
435
+ st.markdown(f"<span class='topic-tag'>{topic}</span>", unsafe_allow_html=True)
436
+ else:
437
+ st.info("No common topics found across articles.")
438
+
439
+ st.markdown("</div>", unsafe_allow_html=True)
440
+
441
+ with col2:
442
+ st.markdown("<div class='card'>", unsafe_allow_html=True)
443
+ st.markdown("<h4>Coverage Comparison</h4>", unsafe_allow_html=True)
444
+
445
+ comparisons = response["Comparative Sentiment Score"].get("Coverage Differences", [])
446
+ if comparisons:
447
+ for i, comparison in enumerate(comparisons[:3]): # Show only top 3 comparisons
448
+ st.markdown(f"<p><strong>{i+1}.</strong> {comparison.get('Comparison', '')}</p>", unsafe_allow_html=True)
449
+ st.markdown(f"<p class='info-text'>{comparison.get('Impact', '')}</p>", unsafe_allow_html=True)
450
+ else:
451
+ st.info("No comparative insights available.")
452
+
453
+ st.markdown("</div>", unsafe_allow_html=True)
454
+
455
+ # Display full comparison in expander
456
+ with st.expander("View All Comparisons"):
457
+ comparisons = response["Comparative Sentiment Score"].get("Coverage Differences", [])
458
+ for i, comparison in enumerate(comparisons):
459
+ st.markdown(f"<p><strong>{i+1}.</strong> {comparison.get('Comparison', '')}</p>", unsafe_allow_html=True)
460
+ st.markdown(f"<p class='info-text'>{comparison.get('Impact', '')}</p>", unsafe_allow_html=True)
461
+ st.markdown("<hr>", unsafe_allow_html=True)
462
+
463
+ # Show JSON in example format if requested
464
+ if show_json:
465
+ st.markdown("<div class='section-divider'></div>", unsafe_allow_html=True)
466
+ st.markdown("<h3 class='sub-header'>Example JSON Format</h3>", unsafe_allow_html=True)
467
+
468
+ # Get the formatted JSON
469
+ json_output = generate_example_output(company_name)
470
+
471
+ # Display the JSON in a code block
472
+ st.code(json_output, language="json")
473
+ else:
474
+ # Display placeholder
475
+ st.markdown("<div class='card'>", unsafe_allow_html=True)
476
+ st.markdown("<h3 class='sub-header'>Enter a Company Name to Begin Analysis</h3>", unsafe_allow_html=True)
477
+ st.markdown("""
478
+ <p class='info-text'>
479
+ This application will:
480
+ </p>
481
+ <ul class='info-text'>
482
+ <li>Extract news articles from multiple sources</li>
483
+ <li>Analyze sentiment (positive, negative, neutral)</li>
484
+ <li>Identify key topics in each article</li>
485
+ <li>Perform comparative analysis across articles</li>
486
+ <li>Generate Hindi speech output summarizing the findings</li>
487
+ </ul>
488
+ """, unsafe_allow_html=True)
489
+ st.markdown("</div>", unsafe_allow_html=True)
490
+
491
+ # Sample output image
492
+ st.image("https://miro.medium.com/max/1400/1*Ger-949PgQnaje2oa9XMdw.png", caption="Sample sentiment analysis visualization")
493
+
494
+ # Footer
495
+ st.markdown("<div class='section-divider'></div>", unsafe_allow_html=True)
496
+ st.markdown("<p class='info-text' style='text-align: center;'>News Summarization & Text-to-Speech Application | Developed with Streamlit and FastAPI</p>", unsafe_allow_html=True)
generate_json_output.py ADDED
@@ -0,0 +1,55 @@
1
+ #!/usr/bin/env python
2
+
3
+ import requests
4
+ import json
5
+ import sys
6
+
7
+ def generate_json_output(company_name, api_url="http://localhost:8000"):
8
+ """
9
+ Generate output in the example format for the given company.
10
+
11
+ Args:
12
+ company_name (str): Name of the company to analyze
13
+ api_url (str): Base URL of the API
14
+
15
+ Returns:
16
+ str: Formatted JSON string
17
+ """
18
+ try:
19
+ # Make API request to get the analysis data
20
+ url = f"{api_url}/api/complete_analysis"
21
+ response = requests.post(url, json={"company_name": company_name})
22
+ response.raise_for_status()
23
+ data = response.json()
24
+
25
+ # Format the data to match the example output format exactly
26
+ formatted_output = {
27
+ "Company": data["Company"],
28
+ "Articles": data["Articles"],
29
+ "Comparative Sentiment Score": {
30
+ "Sentiment Distribution": data["Comparative Sentiment Score"]["Sentiment Distribution"],
31
+ "Coverage Differences": data["Comparative Sentiment Score"]["Coverage Differences"],
32
+ "Topic Overlap": data["Comparative Sentiment Score"]["Topic Overlap"]
33
+ },
34
+ "Final Sentiment Analysis": data["Final Sentiment Analysis"],
35
+ "Audio": "[Play Hindi Speech]" if data.get("Audio") else "No audio available"
36
+ }
37
+
38
+ # Convert to JSON string with proper formatting
39
+ return json.dumps(formatted_output, indent=2)
40
+
41
+ except Exception as e:
42
+ return json.dumps({
43
+ "error": str(e),
44
+ "message": "Failed to generate example output"
45
+ }, indent=2)
46
+
47
+ if __name__ == "__main__":
48
+ # Get company name from command line arguments or prompt for it
49
+ if len(sys.argv) > 1:
50
+ company_name = sys.argv[1]
51
+ else:
52
+ company_name = input("Enter company name: ")
53
+
54
+ print(f"Input:\nCompany Name: {company_name}")
55
+ print("Output:", generate_json_output(company_name))
healthcheck.py ADDED
@@ -0,0 +1,171 @@
1
+ """
2
+ Healthcheck script to verify the functionality of all components of the application.
3
+ Run this script to check if all dependencies are correctly installed and working.
4
+ """
5
+
6
+ import sys
7
+ import os
8
+ import time
9
+ import traceback
10
+
11
+ def run_checks():
12
+ print("Starting health check for News Summarization and TTS Application...")
13
+ checks_passed = 0
14
+ checks_failed = 0
15
+
16
+ # Check 1: Verify imports
17
+ print("\n1. Checking imports...")
18
+ try:
19
+ # Standard libraries
20
+ import json
21
+ import re
22
+
23
+ # Web and API dependencies
24
+ import requests
25
+ import fastapi
26
+ import uvicorn
27
+ import streamlit
28
+
29
+ # Data processing
30
+ import pandas
31
+ import numpy
32
+ import bs4
33
+
34
+ # NLP
35
+ import nltk
36
+ import networkx
37
+ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
38
+
39
+ # ML and Transformers
40
+ import torch
41
+ import transformers
42
+ from transformers import pipeline
43
+
44
+ # TTS and Translation
45
+ import deep_translator
46
+ from deep_translator import GoogleTranslator
47
+ import gtts
48
+ import pyttsx3
49
+
50
+ print("✅ All imports successful.")
51
+ checks_passed += 1
52
+ except ImportError as e:
53
+ print(f"❌ Import error: {str(e)}")
54
+ print(f"Traceback: {traceback.format_exc()}")
55
+ checks_failed += 1
56
+
57
+ # Check 2: Verify NLTK data
58
+ print("\n2. Checking NLTK data...")
59
+ try:
60
+ import nltk
61
+ nltk.data.find('tokenizers/punkt')
62
+ nltk.data.find('corpora/stopwords')
63
+ print("✅ NLTK data verified.")
64
+ checks_passed += 1
65
+ except LookupError as e:
66
+ print(f"❌ NLTK data error: {str(e)}")
67
+ print("Trying to download necessary NLTK data...")
68
+ try:
69
+ nltk.download('punkt')
70
+ nltk.download('stopwords')
71
+ print("✅ NLTK data downloaded successfully.")
72
+ checks_passed += 1
73
+ except Exception as e:
74
+ print(f"❌ Failed to download NLTK data: {str(e)}")
75
+ checks_failed += 1
76
+
77
+ # Check 3: Test translation
78
+ print("\n3. Testing translation service...")
79
+ try:
80
+ from deep_translator import GoogleTranslator
81
+ translator = GoogleTranslator(source='en', target='hi')
82
+ text = "Hello, this is a test."
83
+ translated = translator.translate(text)
84
+ print(f"Original text: {text}")
85
+ print(f"Translated text: {translated}")
86
+ if translated and len(translated) > 0:
87
+ print("✅ Translation service working.")
88
+ checks_passed += 1
89
+ else:
90
+ print("❌ Translation returned empty result.")
91
+ checks_failed += 1
92
+ except Exception as e:
93
+ print(f"❌ Translation error: {str(e)}")
94
+ print(f"Traceback: {traceback.format_exc()}")
95
+ checks_failed += 1
96
+
97
+ # Check 4: Test TTS
98
+ print("\n4. Testing Text-to-Speech service...")
99
+ try:
100
+ from gtts import gTTS
101
+ test_text = "परीक्षण पाठ" # "Test text" in Hindi
102
+ test_file = 'test_audio.mp3'
103
+
104
+ # Try gTTS
105
+ tts = gTTS(text=test_text, lang='hi', slow=False)
106
+ tts.save(test_file)
107
+
108
+ if os.path.exists(test_file) and os.path.getsize(test_file) > 0:
109
+ print("✅ gTTS service working.")
110
+ # Clean up test file
111
+ try:
112
+ os.remove(test_file)
113
+ except:
114
+ pass
115
+ checks_passed += 1
116
+ else:
117
+ print("❌ gTTS failed to generate a valid audio file.")
118
+ checks_failed += 1
119
+ except Exception as e:
120
+ print(f"❌ Text-to-Speech error: {str(e)}")
121
+ print(f"Traceback: {traceback.format_exc()}")
122
+ checks_failed += 1
123
+
124
+ # Check 5: Test sentiment analysis
125
+ print("\n5. Testing sentiment analysis...")
126
+ try:
127
+ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
128
+ analyzer = SentimentIntensityAnalyzer()
129
+ test_text = "This product is excellent and I love it!"
130
+ scores = analyzer.polarity_scores(test_text)
131
+ print(f"Sentiment scores for '{test_text}': {scores}")
132
+ if 'compound' in scores:
133
+ print("✅ Sentiment analysis working.")
134
+ checks_passed += 1
135
+ else:
136
+ print("❌ Sentiment analysis returned unexpected result.")
137
+ checks_failed += 1
138
+ except Exception as e:
139
+ print(f"❌ Sentiment analysis error: {str(e)}")
140
+ print(f"Traceback: {traceback.format_exc()}")
141
+ checks_failed += 1
142
+
143
+ # Check 6: Test Transformers
144
+ print("\n6. Testing Transformer models...")
145
+ try:
146
+ from transformers import pipeline
147
+ sentiment_task = pipeline("sentiment-analysis", return_all_scores=False)
148
+ result = sentiment_task("I love using this application!")
149
+ print(f"Transformer sentiment analysis result: {result}")
150
+ print("✅ Transformer models working.")
151
+ checks_passed += 1
152
+ except Exception as e:
153
+ print(f"❌ Transformer models error: {str(e)}")
154
+ print(f"Traceback: {traceback.format_exc()}")
155
+ checks_failed += 1
156
+
157
+ # Summary
158
+ print("\n" + "="*50)
159
+ print(f"Health Check Summary: {checks_passed} checks passed, {checks_failed} checks failed")
160
+
161
+ if checks_failed == 0:
162
+ print("\n✅ All systems operational! The application should run correctly.")
163
+ return True
164
+ else:
165
+ print("\n❌ Some checks failed. Please review the errors above.")
166
+ return False
167
+
168
+ if __name__ == "__main__":
169
+ success = run_checks()
170
+ if not success:
171
+ sys.exit(1)
requirements.txt ADDED
@@ -0,0 +1,39 @@
1
+ # Core dependencies
2
+ streamlit==1.27.0
3
+ fastapi==0.103.1
4
+ uvicorn==0.23.2
5
+ requests==2.31.0
6
+ beautifulsoup4==4.12.2
7
+ pandas==2.1.0
8
+ numpy==1.25.2
9
+ scipy==1.10.1
10
+
11
+ # NLP and Sentiment Analysis
12
+ transformers==4.33.1
13
+ torch==2.0.1
14
+ nltk==3.8.1
15
+ vaderSentiment==3.3.2
16
+
17
+ # Text-to-Speech
18
+ gTTS==2.3.2
19
+ pyttsx3==2.90
20
+ deep-translator==1.11.4
21
+
22
+ # Data Processing and Visualization
23
+ matplotlib==3.7.3
24
+ seaborn==0.12.2
25
+ scikit-learn==1.3.0
26
+ networkx==3.1
27
+
28
+ # API and Web
29
+ aiohttp==3.8.5
30
+ httpx==0.24.1
31
+ pydantic==2.3.0
32
+ python-dotenv==1.0.0
33
+ python-multipart==0.0.6
34
+
35
+ # HuggingFace Spaces
36
+ huggingface-hub==0.16.4
37
+
38
+ # Added from the code block
39
+ pydub==0.25.1
utils.py ADDED
@@ -0,0 +1,1132 @@
1
+ import requests
2
+ import re
3
+ import os
4
+ import json
5
+ import time
6
+ from typing import List, Dict, Any, Tuple, Optional
7
+ from bs4 import BeautifulSoup
8
+ import pandas as pd
9
+ import numpy as np
10
+ from nltk.corpus import stopwords
11
+ from nltk.tokenize import sent_tokenize, word_tokenize
12
+ from nltk.cluster.util import cosine_distance
13
+ import networkx as nx
14
+ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
15
+ from collections import Counter
16
+ from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
17
+ from deep_translator import GoogleTranslator
18
+ from gtts import gTTS
19
+ import pyttsx3
20
+
21
+ # Download necessary NLTK data
22
+ import nltk
23
+ try:
24
+ nltk.data.find('tokenizers/punkt')
25
+ nltk.data.find('corpora/stopwords')
26
+ except LookupError:
27
+ nltk.download('punkt')
28
+ nltk.download('stopwords')
29
+
30
+ # Initialize sentiment analyzer
31
+ vader_analyzer = SentimentIntensityAnalyzer()
32
+
33
+ # Initialize advanced sentiment model
34
+ sentiment_model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
35
+ sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)
36
+ sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_name)
37
+ advanced_sentiment = pipeline("sentiment-analysis", model=sentiment_model, tokenizer=sentiment_tokenizer)
38
+
39
+ # Initialize translator
40
+ translator = GoogleTranslator(source='en', target='hi')
41
+
42
+ class NewsArticle:
43
+ def __init__(self, title: str, url: str, content: str, summary: str = "", source: str = "",
44
+ date: str = "", sentiment: str = "", topics: List[str] = None):
45
+ self.title = title
46
+ self.url = url
47
+ self.content = content
48
+ self.summary = summary if summary else self.generate_summary(content)
49
+ self.source = source
50
+ self.date = date
51
+ self.sentiment = sentiment if sentiment else self.analyze_sentiment(content, title)
52
+ self.topics = topics if topics else self.extract_topics(content)
53
+
54
+ def to_dict(self) -> Dict[str, Any]:
55
+ return {
56
+ "title": self.title,
57
+ "url": self.url,
58
+ "content": self.content,
59
+ "summary": self.summary,
60
+ "source": self.source,
61
+ "date": self.date,
62
+ "sentiment": self.sentiment,
63
+ "topics": self.topics
64
+ }
65
+
66
+ @staticmethod
67
+ def analyze_sentiment(text: str, title: str = "") -> str:
68
+ """
69
+ Analyze sentiment using a combination of methods for more accurate results.
70
+ We give more weight to the title sentiment and use the advanced model when possible.
71
+ """
72
+ # Set thresholds for VADER sentiment
73
+ threshold_positive = 0.05 # Default 0.05
74
+ threshold_negative = -0.05 # Default -0.05
75
+
76
+ # Use VADER for basic sentiment analysis on both title and content
77
+ try:
78
+ title_scores = vader_analyzer.polarity_scores(title) if title else {'compound': 0}
79
+ content_scores = vader_analyzer.polarity_scores(text)
80
+
81
+ # Weight the title more heavily (title sentiment is often more reliable)
82
+ title_weight = 0.6 if title else 0
83
+ content_weight = 1.0 - title_weight
84
+
85
+ compound_score = (title_weight * title_scores['compound']) + (content_weight * content_scores['compound'])
86
+
87
+ # Try to use the advanced model for additional insight (for short texts)
88
+ advanced_result = None
89
+ advanced_score = 0
90
+
91
+ try:
92
+ # Use title + first part of content for advanced model
93
+ sample_text = title + ". " + text[:300] if title else text[:300]
94
+ advanced_result = advanced_sentiment(sample_text)[0]
95
+
96
+ # Map advanced model results to a -1 to 1 scale similar to VADER
97
+ label = advanced_result['label']
98
+ confidence = advanced_result['score']
99
+
100
+ # Map the 1-5 star rating to a -1 to 1 scale
101
+ if label == '1 star' or label == '2 stars':
102
+ advanced_score = -confidence
103
+ elif label == '4 stars' or label == '5 stars':
104
+ advanced_score = confidence
105
+ else: # 3 stars is neutral
106
+ advanced_score = 0
107
+
108
+ # Combine VADER and advanced model scores
109
+ # Give more weight to advanced model when confidence is high
110
+ if confidence > 0.8:
111
+ compound_score = (0.4 * compound_score) + (0.6 * advanced_score)
112
+ else:
113
+ compound_score = (0.7 * compound_score) + (0.3 * advanced_score)
114
+
115
+ except Exception as e:
116
+ print(f"Advanced sentiment analysis failed: {str(e)}")
117
+ # Continue with just VADER if advanced model fails
118
+ pass
119
+
120
+ # Fine-grained sentiment mapping
121
+ if compound_score >= 0.3:
122
+ return "Positive"
123
+ elif compound_score >= threshold_positive:
124
+ return "Slightly Positive"
125
+ elif compound_score <= -0.3:
126
+ return "Negative"
127
+ elif compound_score <= threshold_negative:
128
+ return "Slightly Negative"
129
+ else:
130
+ return "Neutral"
131
+
132
+ except Exception as e:
133
+ print(f"Sentiment analysis error: {str(e)}")
134
+ return "Neutral" # Default fallback
135
+
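# Worked sketch of the weighting above, using made-up scores (not taken from any
# real article): title compound = 0.60, content compound = 0.10.
#   blended = 0.6 * 0.60 + 0.4 * 0.10 = 0.40
# If the advanced model returns "4 stars" with confidence 0.90 (> 0.8), that maps
# to +0.90, so:
#   final = 0.4 * 0.40 + 0.6 * 0.90 = 0.70  ->  "Positive" (>= 0.3)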
136
+ @staticmethod
137
+ def generate_summary(text: str, num_sentences: int = 5) -> str:
138
+ # Generate summary using extractive summarization
139
+ if not text or len(text) < 100:
140
+ return text
141
+
142
+ # Tokenize sentences
143
+ sentences = sent_tokenize(text)
144
+ if len(sentences) <= num_sentences:
145
+ return text
146
+
147
+ # Calculate sentence similarity and rank them
148
+ similarity_matrix = build_similarity_matrix(sentences)
149
+ scores = nx.pagerank(nx.from_numpy_array(similarity_matrix))
150
+
151
+ # Select top sentences
152
+ ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
153
+ summary_sentences = [ranked_sentences[i][1] for i in range(min(num_sentences, len(ranked_sentences)))]
154
+
155
+ # Maintain original order
156
+ original_order = []
157
+ for sentence in sentences:
158
+ if sentence in summary_sentences and sentence not in original_order:
159
+ original_order.append(sentence)
160
+ if len(original_order) >= num_sentences:
161
+ break
162
+
163
+ return " ".join(original_order)
164
+
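# Usage sketch for the extractive summariser above (assumes the NLTK punkt data
# fetched at import time is available):
#   summary = NewsArticle.generate_summary(article_text, num_sentences=3)
# Sentences are scored with PageRank over the similarity graph built by
# build_similarity_matrix below, and the top ones are returned in original order.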
165
+ @staticmethod
166
+ def extract_topics(text: str, num_topics: int = 5) -> List[str]:
167
+ # Extract key topics from text based on term frequency
168
+ stop_words = set(stopwords.words('english'))
169
+ words = word_tokenize(text.lower())
170
+
171
+ # Filter out stopwords and short words
172
+ filtered_words = [word for word in words if word.isalpha() and word not in stop_words and len(word) > 3]
173
+
174
+ # Count word frequencies
175
+ word_counts = Counter(filtered_words)
176
+
177
+ # Return most common words as topics
178
+ topics = [word for word, _ in word_counts.most_common(num_topics)]
179
+ return topics
180
+
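# Small, self-contained sketch of the frequency-based topic extraction above.
# The sentence is invented; with real articles the topics are simply the most
# frequent non-stopword tokens longer than three characters.
def _example_extract_topics() -> List[str]:
    text = "Tesla deliveries rose while Tesla margins fell, and deliveries beat estimates."
    return NewsArticle.extract_topics(text, num_topics=3)  # e.g. ['tesla', 'deliveries', ...]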
181
+ def build_similarity_matrix(sentences: List[str]) -> np.ndarray:
182
+ """Build similarity matrix for sentences based on cosine similarity."""
183
+ # Number of sentences
184
+ n = len(sentences)
185
+
186
+ # Initialize similarity matrix
187
+ similarity_matrix = np.zeros((n, n))
188
+
189
+ # Calculate similarity between each pair of sentences
190
+ for i in range(n):
191
+ for j in range(n):
192
+ if i != j:
193
+ similarity_matrix[i][j] = sentence_similarity(sentences[i], sentences[j])
194
+
195
+ return similarity_matrix
196
+
197
+ def sentence_similarity(sent1: str, sent2: str) -> float:
198
+ """Calculate similarity between two sentences using cosine similarity."""
199
+ # Tokenize sentences
200
+ words1 = [word.lower() for word in word_tokenize(sent1) if word.isalpha()]
201
+ words2 = [word.lower() for word in word_tokenize(sent2) if word.isalpha()]
202
+
203
+ # Get all unique words
204
+ all_words = list(set(words1 + words2))
205
+
206
+ # Create word vectors
207
+ vector1 = [1 if word in words1 else 0 for word in all_words]
208
+ vector2 = [1 if word in words2 else 0 for word in all_words]
209
+
210
+ # Calculate cosine similarity
211
+ if not any(vector1) or not any(vector2):
212
+ return 0.0
213
+
214
+ return 1 - cosine_distance(vector1, vector2)
215
+
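# Worked sketch of the binary bag-of-words cosine used above. For
#   "Tesla stock rose"  vs  "Tesla stock fell"
# the shared vocabulary is {tesla, stock, rose, fell}, the vectors are
# [1, 1, 1, 0] and [1, 1, 0, 1], and cosine similarity = 2 / (sqrt(3) * sqrt(3)) = 2/3.
def _example_sentence_similarity() -> float:
    return sentence_similarity("Tesla stock rose", "Tesla stock fell")  # ~0.67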
216
+ def search_news(company_name: str, num_articles: int = 10) -> List[NewsArticle]:
217
+ """Search for news articles about a given company."""
218
+ # List to store articles
219
+ articles = []
220
+
221
+ # Define search queries and news sources
222
+ search_queries = [
223
+ f"{company_name} news",
224
+ f"{company_name} financial news",
225
+ f"{company_name} business news",
226
+ f"{company_name} recent news",
227
+ f"{company_name} company news",
228
+ f"{company_name} stock",
229
+ f"{company_name} market"
230
+ ]
231
+
232
+ # Updated news sources with more reliable sources
233
+ news_sources = [
234
+ {
235
+ "base_url": "https://finance.yahoo.com/quote/",
236
+ "article_patterns": ["news", "finance", "articles"],
237
+ "direct_access": True
238
+ },
239
+ {
240
+ "base_url": "https://www.reuters.com/search/news?blob=",
241
+ "article_patterns": ["article", "business", "companies", "markets"],
242
+ "direct_access": False
243
+ },
244
+ {
245
+ "base_url": "https://www.marketwatch.com/search?q=",
246
+ "article_patterns": ["story", "articles", "news"],
247
+ "direct_access": False
248
+ },
249
+ {
250
+ "base_url": "https://www.fool.com/search?q=",
251
+ "article_patterns": ["article", "investing", "stock"],
252
+ "direct_access": False
253
+ },
254
+ {
255
+ "base_url": "https://seekingalpha.com/search?q=",
256
+ "article_patterns": ["article", "news", "stock", "analysis"],
257
+ "direct_access": False
258
+ },
259
+ {
260
+ "base_url": "https://www.zacks.com/search.php?q=",
261
+ "article_patterns": ["stock", "research", "analyst"],
262
+ "direct_access": False
263
+ },
264
+ {
265
+ "base_url": "https://economictimes.indiatimes.com/search?q=",
266
+ "article_patterns": ["articleshow", "news", "industry"],
267
+ "direct_access": False
268
+ },
269
+ {
270
+ "base_url": "https://www.bloomberg.com/search?query=",
271
+ "article_patterns": ["news", "articles"],
272
+ "direct_access": False
273
+ }
274
+ ]
275
+
276
+ print(f"Starting search for news about {company_name}...")
277
+
278
+ # Search each source with each query until we have enough articles
279
+ for query in search_queries:
280
+ if len(articles) >= num_articles:
281
+ break
282
+
283
+ for source in news_sources:
284
+ if len(articles) >= num_articles:
285
+ break
286
+
287
+ try:
288
+ source_base = source["base_url"]
289
+ article_patterns = source["article_patterns"]
290
+ direct_access = source["direct_access"]
291
+
292
+ # Construct search URL
293
+ if direct_access:
294
+ # Try to fetch the stock symbol for Yahoo Finance
295
+ if "yahoo" in source_base:
296
+ try:
297
+ # First try the company name directly (for known tickers)
298
+ search_url = f"{source_base}{company_name}/news"
299
+ print(f"Trying direct ticker access: {search_url}")
300
+
301
+ # Fetch to check if valid
302
+ headers = {
303
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
304
+ }
305
+ test_response = requests.get(search_url, headers=headers, timeout=10)
306
+
307
+ # If we got a 404, try searching for the symbol first
308
+ if test_response.status_code == 404:
309
+ print("Company name not a valid ticker, searching for symbol...")
310
+ symbol_url = f"https://finance.yahoo.com/lookup?s={company_name}"
311
+ symbol_response = requests.get(symbol_url, headers=headers, timeout=10)
312
+
313
+ if symbol_response.status_code == 200:
314
+ symbol_soup = BeautifulSoup(symbol_response.text, 'html.parser')
315
+ # Try to find the first stock symbol result
316
+ symbol_row = symbol_soup.select_one("tr.data-row0")
317
+ if symbol_row:
318
+ symbol_cell = symbol_row.select_one("td:first-child a")
319
+ if symbol_cell:
320
+ symbol = symbol_cell.text.strip()
321
+ search_url = f"{source_base}{symbol}/news"
322
+ print(f"Found symbol {symbol}, using URL: {search_url}")
323
+ except Exception as e:
324
+ print(f"Error getting stock symbol: {str(e)}")
325
+ search_url = f"{source_base}{company_name}/news"
326
+ else:
327
+ search_url = f"{source_base}{company_name}/news"
328
+ else:
329
+ search_url = f"{source_base}{query.replace(' ', '+')}"
330
+
331
+ print(f"Searching {search_url}")
332
+
333
+ # Fetch search results with retry mechanism
334
+ max_retries = 3
335
+ retry_count = 0
336
+ response = None
337
+
338
+ while retry_count < max_retries:
339
+ try:
340
+ headers = {
341
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
342
+ "Accept": "text/html,application/xhtml+xml,application/xml",
343
+ "Accept-Language": "en-US,en;q=0.9",
344
+ "Referer": "https://www.google.com/"
345
+ }
346
+ response = requests.get(search_url, headers=headers, timeout=15)
347
+ if response.status_code == 200:
348
+ break
349
+ retry_count += 1
350
+ print(f"Retry {retry_count}/{max_retries} for {search_url} (status: {response.status_code})")
351
+ time.sleep(1) # Short delay before retry
352
+ except Exception as e:
353
+ retry_count += 1
354
+ print(f"Request error (attempt {retry_count}/{max_retries}): {str(e)}")
355
+ time.sleep(1)
356
+
357
+ if not response or response.status_code != 200:
358
+ print(f"Failed to fetch results from {search_url} after {max_retries} attempts")
359
+ continue
360
+
361
+ soup = BeautifulSoup(response.text, 'html.parser')
362
+
363
+ # Extract article links - using more flexible patterns
364
+ links = soup.find_all('a', href=True)
365
+ article_links = []
366
+
367
+ # Domain for resolving relative URLs
368
+ domain = response.url.split('/')[0] + '//' + response.url.split('/')[2]
369
+ print(f"Domain for resolving URLs: {domain}")
370
+
371
+ for link in links:
372
+ href = link['href']
373
+ link_text = link.text.strip()
374
+
375
+ # Skip empty links or navigation elements
376
+ if not link_text or len(link_text) < 10 or href.startswith('#'):
377
+ continue
378
+
379
+ # Check if the link matches any of our article patterns
380
+ is_article_link = False
381
+ for pattern in article_patterns:
382
+ if pattern in href.lower():
383
+ is_article_link = True
384
+ break
385
+
386
+ # Check for the company name in link text or URL (less restrictive now)
387
+ contains_company = (
388
+ company_name.lower() in link_text.lower() or
389
+ company_name.lower() in href.lower()
390
+ )
391
+
392
+ if is_article_link or contains_company:
393
+ # Convert relative URLs to absolute
394
+ if href.startswith('/'):
395
+ href = f"{domain}{href}"
396
+ elif not href.startswith(('http://', 'https://')):
397
+ href = f"{domain}/{href}"
398
+
399
+ # Avoid duplicates
400
+ if href not in article_links:
401
+ article_links.append(href)
402
+ print(f"Found potential article: {link_text[:50]}... at {href}")
403
+
404
+ print(f"Found {len(article_links)} potential article links from {search_url}")
405
+
406
+ # Process each article link
407
+ for link in article_links[:5]: # Increased from 3 to 5
408
+ if len(articles) >= num_articles:
409
+ break
410
+
411
+ try:
412
+ print(f"Fetching article: {link}")
413
+ article_response = requests.get(link, headers=headers, timeout=15)
414
+
415
+ if article_response.status_code != 200:
416
+ print(f"Failed to fetch article: {article_response.status_code}")
417
+ continue
418
+
419
+ article_soup = BeautifulSoup(article_response.text, 'html.parser')
420
+
421
+ # Extract article title - more robust method
422
+ title = None
423
+
424
+ # Try different elements that could contain the title
425
+ for title_tag in ['h1', 'h2', '.headline', '.title', 'title']:
426
+ if title:
427
+ break
428
+
429
+ if title_tag.startswith('.'):
430
+ elements = article_soup.select(title_tag)
431
+ else:
432
+ elements = article_soup.find_all(title_tag)
433
+
434
+ for element in elements:
435
+ candidate = element.text.strip()
436
+ if len(candidate) > 5 and len(candidate) < 200: # Reasonable title length
437
+ title = candidate
438
+ break
439
+
440
+ if not title:
441
+ print("Could not find a suitable title")
442
+ continue
443
+
444
+ # Check if title contains company name (case insensitive)
445
+ if company_name.lower() not in title.lower():
446
+ # Try alternative check - sometimes the title doesn't explicitly mention the company
447
+ meta_description = article_soup.find('meta', attrs={'name': 'description'}) or \
448
+ article_soup.find('meta', attrs={'property': 'og:description'})
449
+
450
+ if meta_description and 'content' in meta_description.attrs:
451
+ meta_text = meta_description['content']
452
+ if company_name.lower() not in meta_text.lower():
453
+ # One more check in the page content
454
+ page_text = article_soup.get_text().lower()
455
+ company_mentions = page_text.count(company_name.lower())
456
+ if company_mentions < 2: # Require at least 2 mentions
457
+ print(f"Article doesn't seem to be about {company_name}: {title}")
458
+ continue
459
+
460
+ # Extract article content - improved method
461
+ content = ""
462
+
463
+ # Try multiple content extraction strategies
464
+ content_containers = []
465
+
466
+ # 1. Look for article/main content containers
467
+ for container in ['article', 'main', '.article-body', '.story-body', '.story-content',
468
+ '.article-content', '.content-body', '.entry-content']:
469
+ if container.startswith('.'):
470
+ elements = article_soup.select(container)
471
+ else:
472
+ elements = article_soup.find_all(container)
473
+
474
+ content_containers.extend(elements)
475
+
476
+ # 2. If no specific containers, fallback to div with article-like classes
477
+ if not content_containers:
478
+ for div in article_soup.find_all('div', class_=True):
479
+ classes = div.get('class', [])
480
+ for cls in classes:
481
+ if any(term in cls.lower() for term in ['article', 'story', 'content', 'body', 'text']):
482
+ content_containers.append(div)
483
+ break
484
+
485
+ # 3. Extract paragraphs from containers
486
+ processed_paragraphs = set() # To avoid duplicates
487
+
488
+ for container in content_containers:
489
+ for p in container.find_all('p'):
490
+ p_text = p.text.strip()
491
+ # Avoid very short or duplicate paragraphs
492
+ if len(p_text) > 30 and p_text not in processed_paragraphs:
493
+ content += p_text + " "
494
+ processed_paragraphs.add(p_text)
495
+
496
+ # 4. If still no content, try all paragraphs
497
+ if not content:
498
+ for p in article_soup.find_all('p'):
499
+ p_text = p.text.strip()
500
+ if len(p_text) > 30 and p_text not in processed_paragraphs:
501
+ content += p_text + " "
502
+ processed_paragraphs.add(p_text)
503
+
504
+ content = content.strip()
505
+
506
+ # Skip if content is too short
507
+ if len(content) < 300: # Reduced from 500 to be less restrictive
508
+ print(f"Article content too short: {len(content)} characters")
509
+ continue
510
+
511
+ # Extract source name - more robust method
512
+ source = None
513
+
514
+ # Try to get from meta tags
515
+ meta_site_name = article_soup.find('meta', attrs={'property': 'og:site_name'})
516
+ if meta_site_name and 'content' in meta_site_name.attrs:
517
+ source = meta_site_name['content']
518
+ else:
519
+ # Extract from URL
520
+ try:
521
+ from urllib.parse import urlparse
522
+ parsed_url = urlparse(link)
523
+ source = parsed_url.netloc
524
+ except Exception:
525
+ source = response.url.split('/')[2]
526
+
527
+ # Extract date - improved method
528
+ date = ""
529
+
530
+ # Try multiple date extraction strategies
531
+ # 1. Look for time element
532
+ date_tag = article_soup.find('time')
+ if date_tag and date_tag.get('datetime'): date = date_tag['datetime']
533
+
534
+ # 2. Look for meta tags with date
535
+ if not date and (not date_tag or not date_tag.get('datetime')):
536
+ for meta_name in ['article:published_time', 'date', 'publish-date', 'article:modified_time']:
537
+ meta_date = article_soup.find('meta', attrs={'property': meta_name}) or \
538
+ article_soup.find('meta', attrs={'name': meta_name})
539
+
540
+ if meta_date and 'content' in meta_date.attrs:
541
+ date = meta_date['content']
542
+ break
543
+
544
+ # 3. Look for spans/divs with date-related classes
545
+ if not date:
546
+ date_classes = ['date', 'time', 'published', 'posted', 'datetime']
547
+ for cls in date_classes:
548
+ elements = article_soup.find_all(['span', 'div', 'p'], class_=lambda x: x and cls.lower() in x.lower())
549
+ if elements:
550
+ date = elements[0].text.strip()
551
+ break
552
+
553
+ # If we got this far, we have a valid article
554
+ print(f"Successfully extracted article: {title}")
555
+
556
+ # Create article object and add to list
557
+ article = NewsArticle(
558
+ title=title,
559
+ url=link,
560
+ content=content,
561
+ source=source,
562
+ date=date
563
+ )
564
+
565
+ # Check if similar article already exists to avoid duplicates
566
+ is_duplicate = False
567
+ for existing_article in articles:
568
+ if sentence_similarity(existing_article.title, title) > 0.7: # Lowered threshold
569
+ is_duplicate = True
570
+ print(f"Found duplicate article: {title}")
571
+ break
572
+
573
+ if not is_duplicate:
574
+ articles.append(article)
575
+ print(f"Added article: {title}")
576
+
577
+ except Exception as e:
578
+ print(f"Error processing article {link}: {str(e)}")
579
+ continue
580
+
581
+ except Exception as e:
582
+ print(f"Error searching {source_base} with query {query}: {str(e)}")
583
+ continue
584
+
585
+ # If we couldn't find any articles, create a placeholder article to prevent downstream errors
586
+ if not articles and num_articles > 0:
587
+ print(f"No articles found for {company_name}. Creating a dummy article to prevent errors.")
588
+
589
+ dummy_article = NewsArticle(
590
+ title=f"{company_name} Information",
591
+ url="#",
592
+ content=f"Information about {company_name} was not found or could not be retrieved. This is a placeholder.",
593
+ source="System",
594
+ date="",
595
+ sentiment="Neutral",
596
+ topics=["information", "company", "placeholder"]
597
+ )
598
+
599
+ articles.append(dummy_article)
600
+
601
+ # Return collected articles
602
+ print(f"Returning {len(articles)} articles for {company_name}")
603
+ return articles[:num_articles]
604
+
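# Usage sketch for the scraper above. Needs live network access, and the results
# depend entirely on what the listed sources return at query time; "Tesla" is just
# a placeholder company name.
def _example_search_news() -> None:
    for art in search_news("Tesla", num_articles=3):
        print(f"{art.source}: {art.title} [{art.sentiment}]")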
605
+ def analyze_article_sentiment(article: NewsArticle) -> Dict[str, Any]:
606
+ """Perform detailed sentiment analysis on an article."""
607
+ # Use VADER for paragraph-level sentiment
608
+ paragraphs = article.content.split('\n')
609
+ paragraph_sentiments = []
610
+
611
+ overall_scores = {
612
+ 'pos': 0,
613
+ 'neg': 0,
614
+ 'neu': 0,
615
+ 'compound': 0
616
+ }
617
+
618
+ for paragraph in paragraphs:
619
+ if len(paragraph.strip()) < 20: # Skip short paragraphs
620
+ continue
621
+
622
+ scores = vader_analyzer.polarity_scores(paragraph)
623
+ paragraph_sentiments.append({
624
+ 'text': paragraph[:100] + '...' if len(paragraph) > 100 else paragraph,
625
+ 'scores': scores
626
+ })
627
+
628
+ overall_scores['pos'] += scores['pos']
629
+ overall_scores['neg'] += scores['neg']
630
+ overall_scores['neu'] += scores['neu']
631
+ overall_scores['compound'] += scores['compound']
632
+
633
+ num_paragraphs = len(paragraph_sentiments)
634
+ if num_paragraphs > 0:
635
+ overall_scores['pos'] /= num_paragraphs
636
+ overall_scores['neg'] /= num_paragraphs
637
+ overall_scores['neu'] /= num_paragraphs
638
+ overall_scores['compound'] /= num_paragraphs
639
+
640
+ # Use advanced model for overall sentiment
641
+ try:
642
+ # Truncate content if too long
643
+ truncated_content = article.content[:512] if len(article.content) > 512 else article.content
644
+ advanced_result = advanced_sentiment(truncated_content)[0]
645
+ advanced_sentiment_label = advanced_result['label']
646
+ advanced_confidence = advanced_result['score']
647
+ except Exception as e:
648
+ print(f"Error with advanced sentiment analysis: {str(e)}")
649
+ advanced_sentiment_label = "Error"
650
+ advanced_confidence = 0.0
651
+
652
+ # Determine final sentiment
653
+ if overall_scores['compound'] >= 0.05:
654
+ final_sentiment = "Positive"
655
+ elif overall_scores['compound'] <= -0.05:
656
+ final_sentiment = "Negative"
657
+ else:
658
+ final_sentiment = "Neutral"
659
+
660
+ return {
661
+ 'article_title': article.title,
662
+ 'overall_sentiment': final_sentiment,
663
+ 'vader_scores': overall_scores,
664
+ 'advanced_sentiment': {
665
+ 'label': advanced_sentiment_label,
666
+ 'confidence': advanced_confidence
667
+ },
668
+ 'paragraph_analysis': paragraph_sentiments,
669
+ 'positive_ratio': overall_scores['pos'],
670
+ 'negative_ratio': overall_scores['neg'],
671
+ 'neutral_ratio': overall_scores['neu']
672
+ }
673
+
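# Sketch of the per-article breakdown above, on a hand-written two-paragraph
# article (no scraping involved). Building a NewsArticle also runs the
# transformer sentiment pipeline loaded at import, so the first call is slow.
def _example_analyze_article_sentiment() -> Dict[str, Any]:
    art = NewsArticle(
        title="Tesla posts strong quarterly results",
        url="#",
        content="Tesla beat expectations this quarter.\nInvestors welcomed the strong numbers.",
    )
    return analyze_article_sentiment(art)  # keys include 'overall_sentiment' and 'vader_scores'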
674
+ def perform_comparative_analysis(articles: List[NewsArticle]) -> Dict[str, Any]:
675
+ """Perform comparative analysis across multiple articles."""
676
+ # Sentiment distribution with expanded categories
677
+ sentiment_counts = {
678
+ "Positive": 0,
679
+ "Slightly Positive": 0,
680
+ "Neutral": 0,
681
+ "Slightly Negative": 0,
682
+ "Negative": 0
683
+ }
684
+
685
+ for article in articles:
686
+ if article.sentiment in sentiment_counts:
687
+ sentiment_counts[article.sentiment] += 1
688
+ else:
689
+ # Fallback for any unexpected sentiment values
690
+ sentiment_counts["Neutral"] += 1
691
+
692
+ # Topic analysis
693
+ all_topics = []
694
+ for article in articles:
695
+ all_topics.extend(article.topics)
696
+
697
+ topic_counts = Counter(all_topics)
698
+ common_topics = [topic for topic, count in topic_counts.most_common(10)]
699
+
700
+ # Identify unique topics per article
701
+ unique_topics_by_article = {}
702
+ for i, article in enumerate(articles):
703
+ other_articles_topics = []
704
+ for j, other_article in enumerate(articles):
705
+ if i != j:
706
+ other_articles_topics.extend(other_article.topics)
707
+
708
+ unique_topics = [topic for topic in article.topics if topic not in other_articles_topics]
709
+ unique_topics_by_article[i] = unique_topics
710
+
711
+ # Generate comparisons
712
+ comparisons = []
713
+
714
+ # If we have more than one article, generate meaningful comparisons
715
+ if len(articles) > 1:
716
+ for i in range(len(articles) - 1):
717
+ for j in range(i + 1, len(articles)):
718
+ article1 = articles[i]
719
+ article2 = articles[j]
720
+
721
+ # Compare sentiments - more nuanced now with new categories
722
+ if article1.sentiment != article2.sentiment:
723
+ # Group sentiments for better comparison
724
+ sent1_group = get_sentiment_group(article1.sentiment)
725
+ sent2_group = get_sentiment_group(article2.sentiment)
726
+
727
+ if sent1_group != sent2_group:
728
+ comparison = {
729
+ "Articles": [article1.title, article2.title],
730
+ "Comparison": f"'{article1.title}' presents a {sent1_group.lower()} view ({article1.sentiment}), while '{article2.title}' has a {sent2_group.lower()} view ({article2.sentiment}).",
731
+ "Impact": "This difference in sentiment highlights varying perspectives on the company's situation."
732
+ }
733
+ comparisons.append(comparison)
734
+ else:
735
+ # Even if in same group, note the difference if one is stronger
736
+ if ("Slightly" in article1.sentiment and "Slightly" not in article2.sentiment) or \
737
+ ("Slightly" in article2.sentiment and "Slightly" not in article1.sentiment):
738
+ stronger = article1 if "Slightly" not in article1.sentiment else article2
739
+ weaker = article2 if stronger == article1 else article1
740
+
741
+ comparison = {
742
+ "Articles": [stronger.title, weaker.title],
743
+ "Comparison": f"'{stronger.title}' expresses a stronger {sent1_group.lower()} sentiment ({stronger.sentiment}) than '{weaker.title}' ({weaker.sentiment}).",
744
+ "Impact": "The difference in intensity suggests varying degrees of confidence about the company."
745
+ }
746
+ comparisons.append(comparison)
747
+
748
+ # Compare topics
749
+ common_topics_between_two = set(article1.topics).intersection(set(article2.topics))
750
+ if common_topics_between_two:
751
+ comparison = {
752
+ "Articles": [article1.title, article2.title],
753
+ "Comparison": f"Both articles discuss {', '.join(common_topics_between_two)}.",
754
+ "Impact": "The common topics indicate key areas of focus around the company."
755
+ }
756
+ comparisons.append(comparison)
757
+
758
+ # Compare unique topics
759
+ unique_to_article1 = set(article1.topics) - set(article2.topics)
760
+ unique_to_article2 = set(article2.topics) - set(article1.topics)
761
+
762
+ if unique_to_article1 and unique_to_article2:
763
+ comparison = {
764
+ "Articles": [article1.title, article2.title],
765
+ "Comparison": f"'{article1.title}' uniquely covers {', '.join(unique_to_article1)}, while '{article2.title}' focuses on {', '.join(unique_to_article2)}.",
766
+ "Impact": "Different sources emphasize varying aspects of the company, offering a broader perspective."
767
+ }
768
+ comparisons.append(comparison)
769
+ else:
770
+ # If we only have one article, create a dummy comparison
771
+ if articles:
772
+ article = articles[0]
773
+ topics_str = ", ".join(article.topics[:3]) if article.topics else "no specific topics"
774
+ sentiment_group = get_sentiment_group(article.sentiment)
775
+
776
+ comparisons = [
777
+ {
778
+ "Comparison": f"Only found one article: '{article.title}' with a {article.sentiment.lower()} sentiment ({sentiment_group} overall).",
779
+ "Impact": f"Limited coverage focused on {topics_str}. More articles would provide a more balanced view."
780
+ },
781
+ {
782
+ "Comparison": f"The article discusses {topics_str} in relation to {article.source}.",
783
+ "Impact": "Single source reporting limits perspective. Consider searching for additional sources."
784
+ }
785
+ ]
786
+
787
+ # Generate overall sentiment analysis
788
+ # Combine slightly positive with positive and slightly negative with negative for summary
789
+ pos_count = sentiment_counts["Positive"] + sentiment_counts["Slightly Positive"]
790
+ neg_count = sentiment_counts["Negative"] + sentiment_counts["Slightly Negative"]
791
+ neu_count = sentiment_counts["Neutral"]
792
+ total = pos_count + neg_count + neu_count
793
+
794
+ # For display, we'll keep detailed counts but summarize the analysis text
795
+ if total == 0:
796
+ final_analysis = "No sentiment data available."
797
+ else:
798
+ pos_ratio = pos_count / total
799
+ neg_ratio = neg_count / total
800
+
801
+ # Show more details on the sentiment breakdown
802
+ sentiment_detail = []
803
+ if sentiment_counts["Positive"] > 0:
804
+ sentiment_detail.append(f"{sentiment_counts['Positive']} strongly positive")
805
+ if sentiment_counts["Slightly Positive"] > 0:
806
+ sentiment_detail.append(f"{sentiment_counts['Slightly Positive']} slightly positive")
807
+ if sentiment_counts["Neutral"] > 0:
808
+ sentiment_detail.append(f"{sentiment_counts['Neutral']} neutral")
809
+ if sentiment_counts["Slightly Negative"] > 0:
810
+ sentiment_detail.append(f"{sentiment_counts['Slightly Negative']} slightly negative")
811
+ if sentiment_counts["Negative"] > 0:
812
+ sentiment_detail.append(f"{sentiment_counts['Negative']} strongly negative")
813
+
814
+ sentiment_breakdown = ", ".join(sentiment_detail)
815
+
816
+ if pos_ratio > 0.6:
817
+ final_analysis = f"The company has primarily positive coverage ({pos_count}/{total} articles positive: {sentiment_breakdown}). This suggests a favorable market perception."
818
+ elif neg_ratio > 0.6:
819
+ final_analysis = f"The company has primarily negative coverage ({neg_count}/{total} articles negative: {sentiment_breakdown}). This could indicate challenges or controversies."
820
+ elif pos_ratio > neg_ratio:
821
+ final_analysis = f"The company has mixed coverage with a positive lean ({sentiment_breakdown})."
822
+ elif neg_ratio > pos_ratio:
823
+ final_analysis = f"The company has mixed coverage with a negative lean ({sentiment_breakdown})."
824
+ else:
825
+ final_analysis = f"The company has balanced coverage ({sentiment_breakdown})."
826
+
827
+ # If we only have the dummy article, customize the final analysis
828
+ if len(articles) == 1 and articles[0].url == "#":
829
+ final_analysis = "Limited news data available. The analysis is based on a placeholder article."
830
+
831
+ return {
832
+ "Sentiment Distribution": sentiment_counts,
833
+ "Common Topics": common_topics,
834
+ "Topic Overlap": {
835
+ "Common Topics Across All": common_topics[:5],
836
+ "Unique Topics By Article": unique_topics_by_article
837
+ },
838
+ "Coverage Differences": comparisons[:10], # Limit to top 10 comparisons
839
+ "Final Sentiment Analysis": final_analysis
840
+ }
841
+
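# Sketch of how the comparative report above is consumed; `articles` is any list
# of NewsArticle objects already in memory (for example from search_news).
def _example_comparative_analysis(articles: List[NewsArticle]) -> None:
    report = perform_comparative_analysis(articles)
    print(report["Sentiment Distribution"])
    print(report["Final Sentiment Analysis"])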
842
+ def get_sentiment_group(sentiment: str) -> str:
843
+ """Group sentiments into broader categories for comparison."""
844
+ if sentiment in ["Positive", "Slightly Positive"]:
845
+ return "Positive"
846
+ elif sentiment in ["Negative", "Slightly Negative"]:
847
+ return "Negative"
848
+ else:
849
+ return "Neutral"
850
+
851
+ def translate_to_hindi(text: str) -> str:
852
+ """Translate text to Hindi using deep_translator."""
853
+ try:
854
+ # Split text into chunks if too long (Google Translator has a limit)
855
+ max_chunk_size = 4500 # deep_translator's GoogleTranslator has a limit of 5000 chars
856
+ chunks = [text[i:i+max_chunk_size] for i in range(0, len(text), max_chunk_size)]
857
+
858
+ translated_chunks = []
859
+ for chunk in chunks:
860
+ # Translate the chunk
861
+ translated = translator.translate(chunk)
862
+ translated_chunks.append(translated)
863
+ time.sleep(0.5) # Short delay to avoid rate limiting
864
+
865
+ return ''.join(translated_chunks)
866
+ except Exception as e:
867
+ print(f"Translation error: {str(e)}")
868
+ # Fallback to simple placeholder for Hindi text if translation fails
869
+ return "अनुवाद त्रुटि हुई।" # "Translation error occurred" in Hindi
870
+
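# Usage sketch for the chunked translation above. Requires network access to the
# Google endpoint used by deep_translator; the 0.5 s sleep between chunks only
# matters for texts longer than 4500 characters.
def _example_translate() -> str:
    return translate_to_hindi("The company reported strong quarterly results.")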
871
+ def text_to_speech(text: str, output_file: str = 'output.mp3') -> str:
872
+ """Convert text to speech in Hindi."""
873
+ try:
874
+ # Ensure output directory exists
875
+ output_dir = os.path.dirname(output_file)
876
+ if output_dir:
877
+ os.makedirs(output_dir, exist_ok=True)
878
+ print(f"Ensuring output directory exists: {output_dir}")
879
+
880
+ # If text is too short, add some padding to avoid TTS errors
881
+ if len(text.strip()) < 5:
882
+ text = text + " " + "नमस्कार" * 3 # Add some padding text
883
+ print("Text was too short, adding padding")
884
+
885
+ print(f"Attempting to generate TTS for text of length {len(text)} characters")
886
+
887
+ # For long texts, split into chunks for better TTS quality
888
+ if len(text) > 3000:
889
+ print("Text is long, splitting into chunks for better TTS quality")
890
+
891
+ # Split at sentence boundaries
892
+ sentences = re.split(r'(।|\.|\?|\!)', text)
893
+ chunks = []
894
+ current_chunk = ""
895
+
896
+ # Combine sentences into chunks of appropriate size
897
+ for i in range(0, len(sentences), 2):
898
+ if i+1 < len(sentences): # Make sure we have the punctuation part
899
+ sentence = sentences[i] + sentences[i+1]
900
+ else:
901
+ sentence = sentences[i]
902
+
903
+ if len(current_chunk) + len(sentence) < 3000:
904
+ current_chunk += sentence
905
+ else:
906
+ if current_chunk:
907
+ chunks.append(current_chunk)
908
+ current_chunk = sentence
909
+
910
+ if current_chunk: # Add the last chunk
911
+ chunks.append(current_chunk)
912
+
913
+ print(f"Split text into {len(chunks)} chunks for TTS processing")
914
+
915
+ # Process each chunk and combine into one audio file
916
+ temp_files = []
917
+ for i, chunk in enumerate(chunks):
918
+ temp_output = f"{output_file}.part{i}.mp3"
919
+ try:
920
+ # Try gTTS for each chunk
921
+ tts = gTTS(text=chunk, lang='hi', slow=False)
922
+ tts.save(temp_output)
923
+ if os.path.exists(temp_output) and os.path.getsize(temp_output) > 0:
924
+ temp_files.append(temp_output)
925
+ else:
926
+ print(f"Failed to create chunk {i} with gTTS")
927
+ raise Exception(f"gTTS failed for chunk {i}")
928
+ except Exception as e:
929
+ print(f"Error with gTTS for chunk {i}: {str(e)}")
930
+ break
931
+
932
+ # If we have temp files, combine them
933
+ if temp_files:
934
+ try:
935
+ # Use pydub to concatenate audio files
936
+ from pydub import AudioSegment
937
+ combined = AudioSegment.empty()
938
+ for temp_file in temp_files:
939
+ audio = AudioSegment.from_mp3(temp_file)
940
+ combined += audio
941
+
942
+ combined.export(output_file, format="mp3")
943
+
944
+ # Clean up temp files
945
+ for temp_file in temp_files:
946
+ try:
947
+ os.remove(temp_file)
948
+ except OSError:
949
+ pass
950
+
951
+ print(f"Successfully combined {len(temp_files)} audio chunks into {output_file}")
952
+ return output_file
953
+ except Exception as e:
954
+ print(f"Error combining audio files: {str(e)}")
955
+ # Try to return the first chunk at least
956
+ if os.path.exists(temp_files[0]):
957
+ import shutil
958
+ shutil.copy(temp_files[0], output_file)
959
+ print(f"Returning first chunk as fallback: {output_file}")
960
+ return output_file
961
+
962
+ # Method 1: Use gTTS for Hindi text-to-speech (for shorter texts or if chunking failed)
963
+ try:
964
+ print("Trying to use gTTS...")
965
+ tts = gTTS(text=text, lang='hi', slow=False)
966
+ tts.save(output_file)
967
+
968
+ # Verify the file was created and is not empty
969
+ if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
970
+ print(f"Successfully created audio file with gTTS: {output_file} (size: {os.path.getsize(output_file)} bytes)")
971
+ return output_file
972
+ else:
973
+ print(f"gTTS created a file but it may be empty or invalid: {output_file}")
974
+ raise Exception("Generated audio file is empty or invalid")
975
+
976
+ except Exception as e:
977
+ print(f"gTTS error: {str(e)}")
978
+
979
+ # Method 2: Fallback to pyttsx3
980
+ try:
981
+ print("Falling back to pyttsx3...")
982
+ engine = pyttsx3.init()
983
+ # Try to find a Hindi voice, or use default
984
+ voices = engine.getProperty('voices')
985
+ found_hindi_voice = False
986
+
987
+ for voice in voices:
988
+ print(f"Checking voice: {voice.name}")
989
+ if 'hindi' in voice.name.lower():
990
+ print(f"Found Hindi voice: {voice.name}")
991
+ engine.setProperty('voice', voice.id)
992
+ found_hindi_voice = True
993
+ break
994
+
995
+ if not found_hindi_voice:
996
+ print("No Hindi voice found, using default voice")
997
+
998
+ engine.save_to_file(text, output_file)
999
+ engine.runAndWait()
1000
+
1001
+ # Verify the file was created and is not empty
1002
+ if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
1003
+ print(f"Successfully created audio file with pyttsx3: {output_file} (size: {os.path.getsize(output_file)} bytes)")
1004
+ return output_file
1005
+ else:
1006
+ print(f"pyttsx3 created a file but it may be empty or invalid: {output_file}")
1007
+ raise Exception("Generated audio file is empty or invalid")
1008
+
1009
+ except Exception as e2:
1010
+ print(f"pyttsx3 error: {str(e2)}")
1011
+
1012
+ # If all TTS methods fail, create a simple notification sound as fallback
1013
+ try:
1014
+ print("Both TTS methods failed. Creating a simple audio notification instead.")
1015
+ # Generate a simple beep sound as a fallback (1 second, 440Hz)
1016
+ import numpy as np
1017
+ from scipy.io import wavfile
1018
+
1019
+ sample_rate = 44100
1020
+ duration = 1 # seconds
1021
+ t = np.linspace(0, duration, int(sample_rate * duration))
1022
+
1023
+ # Generate a simple tone
1024
+ frequency = 440 # Hz (A4 note)
1025
+ data = np.sin(2 * np.pi * frequency * t) * 32767
1026
+ data = data.astype(np.int16)
1027
+
1028
+ # Convert output_file from mp3 to wav
1029
+ wav_output_file = output_file.replace('.mp3', '.wav')
1030
+ wavfile.write(wav_output_file, sample_rate, data)
1031
+
1032
+ print(f"Created simple audio notification: {wav_output_file}")
1033
+ return wav_output_file
1034
+
1035
+ except Exception as e3:
1036
+ print(f"Failed to create fallback audio: {str(e3)}")
1037
+ return ""
1038
+
1039
+ return ""
1040
+ except Exception as e:
1041
+ print(f"TTS error: {str(e)}")
1042
+ return ""
1043
+
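# Usage sketch for the TTS helper above. gTTS needs network access; pyttsx3 is the
# offline fallback tried next, and a plain tone is the last resort. The output
# path below is illustrative; the directory is created if it does not exist.
def _example_tts() -> str:
    hindi_text = "कंपनी के बारे में समाचार"  # "news about the company" in Hindi
    return text_to_speech(hindi_text, output_file="audio_files/example.mp3")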
1044
+ def prepare_final_report(company_name: str, articles: List[NewsArticle],
1045
+ comparative_analysis: Dict[str, Any]) -> Dict[str, Any]:
1046
+ """Prepare final report in the required format."""
1047
+ article_data = []
1048
+
1049
+ for article in articles:
1050
+ article_data.append({
1051
+ "Title": article.title,
1052
+ "Summary": article.summary,
1053
+ "Sentiment": article.sentiment,
1054
+ "Topics": article.topics
1055
+ })
1056
+
1057
+ # Prepare a more detailed summary for TTS with actual content from articles
1058
+ summary_text = f"{company_name} के बारे में समाचार विश्लेषण। "
1059
+
1060
+ # Add information about the number of articles found
1061
+ summary_text += f"कुल {len(articles)} लेख मिले। "
1062
+
1063
+ # Add sentiment distribution
1064
+ sentiment_counts = comparative_analysis["Sentiment Distribution"]
1065
+ pos_count = sentiment_counts["Positive"] + sentiment_counts["Slightly Positive"]
1066
+ neg_count = sentiment_counts["Negative"] + sentiment_counts["Slightly Negative"]
1067
+ neu_count = sentiment_counts["Neutral"]
1068
+
1069
+ if pos_count > 0 or neg_count > 0 or neu_count > 0:
1070
+ sentiment_detail = []
1071
+ if sentiment_counts["Positive"] > 0:
1072
+ sentiment_detail.append(f"{sentiment_counts['Positive']} पूर्ण सकारात्मक")
1073
+ if sentiment_counts["Slightly Positive"] > 0:
1074
+ sentiment_detail.append(f"{sentiment_counts['Slightly Positive']} हल्का सकारात्मक")
1075
+ if sentiment_counts["Neutral"] > 0:
1076
+ sentiment_detail.append(f"{sentiment_counts['Neutral']} तटस्थ")
1077
+ if sentiment_counts["Slightly Negative"] > 0:
1078
+ sentiment_detail.append(f"{sentiment_counts['Slightly Negative']} हल्का नकारात्मक")
1079
+ if sentiment_counts["Negative"] > 0:
1080
+ sentiment_detail.append(f"{sentiment_counts['Negative']} पूर्ण नकारात्मक")
1081
+
1082
+ summary_text += f"भावना विश्लेषण: {', '.join(sentiment_detail)}। "
1083
+
1084
+ # Add common topics with more detail
1085
+ common_topics = comparative_analysis["Common Topics"][:5]
1086
+ if common_topics:
1087
+ summary_text += f"मुख्य विषय हैं: {', '.join(common_topics)}। "
1088
+
1089
+ # Add more context about the common topics
1090
+ summary_text += "इन विषयों के बारे में लेखों में यह कहा गया है: "
1091
+
1092
+ # Find sentences related to common topics in the articles
1093
+ topic_sentences = []
1094
+ for topic in common_topics[:3]: # Focus on top 3 topics
1095
+ found = False
1096
+ for article in articles:
1097
+ if topic in article.content.lower():
1098
+ # Find sentences containing this topic
1099
+ sentences = sent_tokenize(article.content)
1100
+ for sentence in sentences:
1101
+ if topic in sentence.lower() and len(sentence) < 150:
1102
+ topic_sentences.append(f"{topic} के बारे में: {sentence}")
1103
+ found = True
1104
+ break
1105
+ if found:
1106
+ break
1107
+
1108
+ if topic_sentences:
1109
+ summary_text += " ".join(topic_sentences[:3]) + " "
1110
+
1111
+ # Add article summaries
1112
+ summary_text += "लेखों का सारांश: "
1113
+ for i, article in enumerate(articles[:3]): # Include up to 3 articles
1114
+ summary_text += f"लेख {i+1}: {article.title}. {article.summary[:200]}... "
1115
+
1116
+ # Add sentiment for this specific article
1117
+ summary_text += f"इस लेख की भावना: {article.sentiment}. "
1118
+
1119
+ # Add final sentiment analysis
1120
+ summary_text += comparative_analysis["Final Sentiment Analysis"]
1121
+
1122
+ # Translate the detailed summary to Hindi
1123
+ hindi_summary = translate_to_hindi(summary_text)
1124
+
1125
+ # Format the response according to the required format
1126
+ return {
1127
+ "Company": company_name,
1128
+ "Articles": article_data,
1129
+ "Comparative Sentiment Score": comparative_analysis,
1130
+ "Final Sentiment Analysis": comparative_analysis["Final Sentiment Analysis"],
1131
+ "Hindi Summary": hindi_summary
1132
+ }
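# End-to-end sketch showing one plausible way the helpers in this module combine:
# scrape, compare, build the report, then voice the Hindi summary. Everything here
# needs network access, and the company name is only an example.
def _example_full_pipeline(company_name: str = "Tesla") -> Tuple[Dict[str, Any], str]:
    articles = search_news(company_name, num_articles=5)
    analysis = perform_comparative_analysis(articles)
    report = prepare_final_report(company_name, articles, analysis)
    audio_path = text_to_speech(report["Hindi Summary"], output_file="audio_files/summary.mp3")
    return report, audio_path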