Upload 11 files
- .gitattributes +1 -35
- .gitignore +50 -0
- Dockerfile +49 -0
- README.md +143 -13
- Spacefile +8 -0
- api.py +332 -0
- app.py +496 -0
- generate_json_output.py +55 -0
- healthcheck.py +171 -0
- requirements.txt +39 -0
- utils.py +1132 -0
.gitattributes
CHANGED
@@ -1,35 +1 @@
-*.
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.map filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,50 @@
# Python
__pycache__/
*.py[cod]
*$py.class
*.so
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# Virtual Environment
venv/
ENV/
env/
.env

# IDE
.idea/
.vscode/
*.swp
*.swo
.DS_Store

# Logs
logs/
*.log

# Audio files
audio_files/
*.mp3
*.wav

# Jupyter
.ipynb_checkpoints

# Model caches
.cache/
.local/
Dockerfile
ADDED
@@ -0,0 +1,49 @@
FROM python:3.9-slim

WORKDIR /app

# Install dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Install additional dependencies needed for NLP tasks and TTS
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    software-properties-common \
    git \
    ffmpeg \
    espeak \
    libespeak-dev \
    alsa-utils \
    python3-pyaudio \
    libasound2-dev \
    && rm -rf /var/lib/apt/lists/*

# Copy app files
COPY . .

# Create directory for audio files
RUN mkdir -p audio_files

# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

# Download NLTK data
RUN python -c "import nltk; nltk.download('punkt'); nltk.download('stopwords')"

# Expose ports
EXPOSE 8000
EXPOSE 8501

# Create a shell script to run both services
RUN echo '#!/bin/bash\n\
uvicorn api:app --host 0.0.0.0 --port 8000 &\n\
streamlit run app.py --server.port 8501 --server.address 0.0.0.0\n'\
> /app/start.sh

RUN chmod +x /app/start.sh

# Start the application
CMD ["/app/start.sh"]
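The start script created above runs the FastAPI backend (uvicorn on port 8000) in the background and Streamlit (port 8501) in the foreground, so a single container serves both the API and the UI; the Spacefile later in this commit points Hugging Face Spaces at port 8501.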
README.md
CHANGED
@@ -1,13 +1,143 @@
# News Summarization and Text-to-Speech Application

A web-based application that extracts news articles related to companies, performs sentiment analysis, conducts comparative analysis, and generates a text-to-speech output in Hindi.

## Features

- **News Extraction**: Scrapes at least 10 unique news articles about a given company using BeautifulSoup
- **Sentiment Analysis**: Analyzes the sentiment of each article (positive, negative, neutral)
- **Comparative Analysis**: Compares sentiment across articles to derive insights
- **Text-to-Speech**: Converts summarized content to Hindi speech
- **User Interface**: Simple web interface built with Streamlit
- **API Communication**: Backend and frontend communicate through APIs

## Project Structure

```
.
├── app.py             # Main Streamlit application
├── api.py             # API endpoints
├── utils.py           # Utility functions for scraping, sentiment analysis, etc.
├── healthcheck.py     # Script to verify all dependencies and services
├── requirements.txt   # Project dependencies
├── Dockerfile         # Docker configuration for deployment
├── Spacefile          # Hugging Face Spaces configuration
└── README.md          # Project documentation
```

## Setup Instructions

1. **Clone the repository**:
   ```
   git clone https://github.com/yourusername/news-summarization-tts.git
   cd news-summarization-tts
   ```

2. **Create a virtual environment** (recommended):
   ```
   python -m venv venv
   source venv/bin/activate  # On Windows: venv\Scripts\activate
   ```

3. **Install dependencies**:
   ```
   pip install -r requirements.txt
   ```

4. **Install system dependencies** (for text-to-speech functionality):
   - On Ubuntu/Debian:
     ```
     sudo apt-get install espeak ffmpeg
     ```
   - On Windows:
     Download and install espeak from http://espeak.sourceforge.net/download.html

5. **Run the healthcheck** (to verify all dependencies are working):
   ```
   python healthcheck.py
   ```

6. **Run the API server**:
   ```
   uvicorn api:app --reload
   ```

7. **Run the Streamlit application** (in a separate terminal):
   ```
   streamlit run app.py
   ```

## Models Used

- **News Summarization**: Extractive summarization using NLTK and NetworkX
- **Sentiment Analysis**: VADER for sentiment analysis and Hugging Face Transformers
- **Translation**: Google Translate API via the deep-translator library
- **Text-to-Speech**: Google Text-to-Speech (gTTS), with pyttsx3 as fallback, for Hindi conversion

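To illustrate the summarization approach named above, here is a minimal TextRank-style sketch using NLTK and NetworkX. It is only an illustration of the technique; the project's actual implementation lives in `utils.py` and defines its own helpers, so the `extractive_summary` function below is hypothetical.

```python
import networkx as nx
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize  # needs the 'punkt' and 'stopwords' NLTK data

def extractive_summary(text: str, num_sentences: int = 3) -> str:
    """Rank sentences with PageRank over a word-overlap similarity graph and keep the top few."""
    sentences = sent_tokenize(text)
    if len(sentences) <= num_sentences:
        return text

    stop_words = set(stopwords.words("english"))

    def content_words(sentence: str) -> set:
        return {w.lower() for w in word_tokenize(sentence)
                if w.isalnum() and w.lower() not in stop_words}

    # Similarity graph: one node per sentence, edge weight = number of shared content words
    graph = nx.Graph()
    graph.add_nodes_from(range(len(sentences)))
    for i in range(len(sentences)):
        for j in range(i + 1, len(sentences)):
            overlap = len(content_words(sentences[i]) & content_words(sentences[j]))
            if overlap:
                graph.add_edge(i, j, weight=overlap)

    scores = nx.pagerank(graph, weight="weight")
    top = sorted(sorted(scores, key=scores.get, reverse=True)[:num_sentences])
    return " ".join(sentences[i] for i in top)
```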
## API Documentation

### Endpoints

- `POST /api/get_news`: Fetches news articles about a company
  - Request body: `{"company_name": "Tesla"}`
  - Returns a list of articles with metadata

- `POST /api/analyze_sentiment`: Performs sentiment analysis on articles
  - Request body: `{"articles": [article_list]}`
  - Returns sentiment analysis for each article

- `POST /api/generate_speech`: Converts text to Hindi speech
  - Request body: `{"text": "summarized_text"}`
  - Returns a URL to the generated audio file

- `POST /api/complete_analysis`: Performs complete analysis including fetching news, sentiment analysis, and generating speech
  - Request body: `{"company_name": "Tesla"}`
  - Returns complete analysis results

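As a quick usage illustration (not part of the repository code), the complete-analysis endpoint can be called from Python roughly like this, assuming the API server is running locally on port 8000:

```python
import json
import requests

resp = requests.post(
    "http://localhost:8000/api/complete_analysis",  # assumes the server from api.py is running locally
    json={"company_name": "Tesla"},
    timeout=300,                                     # scraping, analysis and TTS can take a while
)
resp.raise_for_status()
report = resp.json()

print(report["Final Sentiment Analysis"])
print(json.dumps(report["Comparative Sentiment Score"]["Sentiment Distribution"], indent=2))
```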
## Assumptions & Limitations

- The application scrapes publicly available news articles that don't require JavaScript rendering
- Sentiment analysis accuracy depends on the model used and may not capture context-specific nuances
- Hindi translation and TTS quality may vary for technical terms
- The application requires an internet connection to fetch news articles and use cloud-based services

## Troubleshooting

If you encounter any issues:

1. Run the healthcheck script to verify all dependencies are working:
   ```
   python healthcheck.py
   ```

2. Check that you have all the required system dependencies installed (espeak, ffmpeg).

3. If you encounter issues with specific components:
   - The translation service requires an internet connection
   - Text-to-speech uses gTTS by default, but falls back to pyttsx3 if needed
   - Transformer models may take time to download on first run

## Deployment

This application is deployed on Hugging Face Spaces: [Link to deployment]

### Using Docker

You can also run the application using Docker:

```
docker build -t news-summarization-tts .
docker run -p 8501:8501 -p 8000:8000 news-summarization-tts
```

## Future Improvements

- Add support for more languages
- Implement advanced NLP techniques for better summarization
- Improve the user interface with more interactive visualizations
- Add historical data analysis for tracking sentiment over time
- Enhance TTS quality with dedicated Hindi speech models

## License

MIT
Spacefile
ADDED
@@ -0,0 +1,8 @@
# Spacefile Docs: https://huggingface.co/docs/hub/spaces-config-reference
title: News Summarization and TTS
emoji: 📰
colorFrom: blue
colorTo: indigo
sdk: docker
app_port: 8501
pinned: false
api.py
ADDED
@@ -0,0 +1,332 @@
from fastapi import FastAPI, HTTPException, Response, File, UploadFile, Form
from fastapi.responses import FileResponse, JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import List, Dict, Any, Optional
import os
import json
import uuid
import asyncio
import uvicorn
from utils import (search_news, analyze_article_sentiment, perform_comparative_analysis,
                   translate_to_hindi, text_to_speech, prepare_final_report, NewsArticle)

# Initialize FastAPI app
app = FastAPI(
    title="News Summarization and TTS API",
    description="API for extracting news, performing sentiment analysis, and generating Hindi TTS audio",
    version="1.0.0"
)

# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allow all origins
    allow_credentials=True,
    allow_methods=["*"],  # Allow all methods
    allow_headers=["*"],  # Allow all headers
)

# Define request/response models
class CompanyRequest(BaseModel):
    company_name: str

class TextToSpeechRequest(BaseModel):
    text: str
    output_filename: Optional[str] = None

class SentimentAnalysisRequest(BaseModel):
    articles: List[Dict[str, Any]]

class NewsResponse(BaseModel):
    articles: List[Dict[str, Any]]

class SentimentResponse(BaseModel):
    sentiment_analysis: Dict[str, Any]

class TextToSpeechResponse(BaseModel):
    audio_file: str
    text: str

# Create a directory for audio files if it doesn't exist
os.makedirs("audio_files", exist_ok=True)

# API endpoints
@app.get("/")
async def root():
    """Root endpoint to check if API is running."""
    return {"message": "News Summarization and TTS API is running"}

@app.post("/api/get_news", response_model=NewsResponse)
async def get_news(request: CompanyRequest):
    """Fetch news articles about a specific company."""
    try:
        company_name = request.company_name
        articles = search_news(company_name)

        if not articles:
            raise HTTPException(status_code=404, detail=f"No news articles found for {company_name}")

        # Convert NewsArticle objects to dictionaries
        article_data = [article.to_dict() for article in articles]

        return {"articles": article_data}

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

@app.post("/api/analyze_sentiment", response_model=SentimentResponse)
async def analyze_sentiment(request: SentimentAnalysisRequest):
    """Analyze sentiment of provided articles."""
    try:
        # Convert dictionaries back to NewsArticle objects
        articles = []
        for article_dict in request.articles:
            article = NewsArticle(
                title=article_dict["title"],
                url=article_dict["url"],
                content=article_dict["content"],
                summary=article_dict.get("summary", ""),
                source=article_dict.get("source", ""),
                date=article_dict.get("date", ""),
                sentiment=article_dict.get("sentiment", ""),
                topics=article_dict.get("topics", [])
            )
            articles.append(article)

        # Perform detailed sentiment analysis for each article
        detailed_sentiment = [analyze_article_sentiment(article) for article in articles]

        # Perform comparative analysis
        comparative_analysis = perform_comparative_analysis(articles)

        return {
            "sentiment_analysis": {
                "detailed_sentiment": detailed_sentiment,
                "comparative_analysis": comparative_analysis
            }
        }

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

@app.post("/api/generate_speech", response_model=TextToSpeechResponse)
async def generate_speech(request: TextToSpeechRequest):
    """Convert text to Hindi speech."""
    try:
        text = request.text

        # Generate a unique filename if not provided
        output_filename = request.output_filename
        if not output_filename:
            unique_id = uuid.uuid4().hex
            output_filename = f"audio_files/{unique_id}.mp3"
        elif not output_filename.startswith("audio_files/"):
            output_filename = f"audio_files/{output_filename}"

        # Translate text to Hindi
        hindi_text = translate_to_hindi(text)

        # Convert text to speech
        audio_file = text_to_speech(hindi_text, output_filename)

        if not audio_file:
            raise HTTPException(status_code=500, detail="Failed to generate audio file")

        return {
            "audio_file": audio_file,
            "text": hindi_text
        }

    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

@app.post("/api/complete_analysis")
async def complete_analysis(request: CompanyRequest):
    """Perform complete analysis for a company."""
    try:
        company_name = request.company_name

        # Log the start of analysis
        print(f"Starting complete analysis for company: {company_name}")

        # Step 1: Get news articles
        print("Step 1: Fetching news articles...")
        articles = search_news(company_name, num_articles=5)  # Increased from default 3 to 5
        print(f"Found {len(articles)} articles for {company_name}")

        if not articles:
            raise HTTPException(status_code=404, detail=f"No news articles found for {company_name}")

        # Step 2: Perform comparative analysis
        print("Step 2: Performing comparative analysis...")
        comparative_analysis = perform_comparative_analysis(articles)
        print("Comparative analysis completed")

        # Step 3: Prepare final report
        print("Step 3: Preparing final report...")
        final_report = prepare_final_report(company_name, articles, comparative_analysis)
        print("Final report prepared")

        # Step 4: Generate Hindi TTS
        print("Step 4: Generating Hindi TTS...")
        unique_id = uuid.uuid4().hex
        output_filename = f"audio_files/{unique_id}.mp3"

        # Use the Hindi summary for TTS
        hindi_text = final_report["Hindi Summary"]
        print(f"Converting Hindi text to speech (length: {len(hindi_text)} characters)")

        audio_file = text_to_speech(hindi_text, output_filename)

        # Format the response to match the example output exactly
        formatted_response = {
            "Company": company_name,
            "Articles": final_report["Articles"],
            "Comparative Sentiment Score": {
                "Sentiment Distribution": comparative_analysis["Sentiment Distribution"],
                "Coverage Differences": comparative_analysis["Coverage Differences"],
                "Topic Overlap": {
                    "Common Topics": comparative_analysis["Topic Overlap"]["Common Topics Across All"],
                }
            },
            "Final Sentiment Analysis": comparative_analysis["Final Sentiment Analysis"],
        }

        # Format the unique topics by article to match the expected output exactly
        unique_topics = comparative_analysis["Topic Overlap"]["Unique Topics By Article"]
        for article_idx, topics in unique_topics.items():
            article_num = int(article_idx) + 1
            formatted_response["Comparative Sentiment Score"]["Topic Overlap"][f"Unique Topics in Article {article_num}"] = topics

        # If we don't have more than 1 article, create some example comparisons to match format
        if len(articles) <= 1:
            formatted_response["Comparative Sentiment Score"]["Coverage Differences"] = [
                {
                    "Comparison": f"Only one article about {company_name} was found, limiting comparative analysis.",
                    "Impact": "Unable to compare coverage across multiple sources for more comprehensive insights."
                }
            ]

        # Add audio information
        if not audio_file:
            print("Warning: Failed to generate audio file")
            formatted_response["Audio"] = "Failed to generate audio"
        else:
            print(f"Audio file generated: {audio_file}")
            formatted_response["Audio"] = "[Play Hindi Speech]"
            # Store the actual audio file path in a hidden field
            formatted_response["_audio_file_path"] = audio_file

        # Add the Hindi Summary to the response as well (needed for rendering in Streamlit)
        formatted_response["Hindi Summary"] = final_report["Hindi Summary"]

        print("Complete analysis finished successfully")
        return formatted_response

    except HTTPException as he:
        # Re-raise HTTP exceptions
        print(f"HTTP Exception: {he.detail}")
        raise

    except Exception as e:
        # For any other exception, provide detailed error information
        import traceback
        error_trace = traceback.format_exc()
        error_message = f"Error processing request: {str(e)}"
        print(f"ERROR: {error_message}")
        print(f"Traceback: {error_trace}")

        # Return a more user-friendly error message
        user_message = "An error occurred during analysis. "

        if "timeout" in str(e).lower():
            user_message += "There was a timeout when connecting to news sources. Please try again or try another company name."
        elif "connection" in str(e).lower():
            user_message += "There was a connection issue with one of the news sources. Please check your internet connection."
        elif "not found" in str(e).lower():
            user_message += f"No information could be found for {company_name}. Please try another company name."
        else:
            user_message += "Please try again with a different company name or check the server logs for more details."

        raise HTTPException(status_code=500, detail=user_message)

@app.get("/api/audio/{file_name}")
async def get_audio(file_name: str):
    """Serve audio files."""
    file_path = f"audio_files/{file_name}"

    # Make sure the audio_files directory exists
    os.makedirs("audio_files", exist_ok=True)

    if not os.path.exists(file_path):
        print(f"Audio file not found: {file_path}")
        # Check if any audio files exist in the directory
        audio_files = os.listdir("audio_files") if os.path.exists("audio_files") else []
        print(f"Available audio files: {audio_files}")
        raise HTTPException(status_code=404, detail=f"Audio file {file_name} not found")

    try:
        # Verify the file can be opened and is not corrupt
        with open(file_path, "rb") as f:
            file_size = os.path.getsize(file_path)
            print(f"Serving audio file: {file_path} (size: {file_size} bytes)")
            if file_size == 0:
                raise HTTPException(status_code=500, detail="Audio file is empty")
    except Exception as e:
        print(f"Error accessing audio file {file_path}: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Error accessing audio file: {str(e)}")

    # Set appropriate headers for audio file
    headers = {
        "Cache-Control": "no-cache, no-store, must-revalidate",
        "Pragma": "no-cache",
        "Expires": "0",
        "Content-Disposition": f"attachment; filename={file_name}"
    }

    # Determine the correct media type based on file extension
    media_type = "audio/mpeg"
    if file_name.lower().endswith(".wav"):
        media_type = "audio/wav"

    return FileResponse(
        path=file_path,
        media_type=media_type,
        headers=headers,
        filename=file_name
    )

@app.post("/api/example_format")
async def get_example_format(request: CompanyRequest):
    """
    Get analysis results in the example format specified.
    This endpoint provides results that exactly match the requested output format.
    """
    try:
        # Get the base analysis
        company_name = request.company_name
        result = await complete_analysis(request)

        # Format it to match the example output
        formatted_output = {
            "Company": result["Company"],
            "Articles": result["Articles"],
            "Comparative Sentiment Score": {
                "Sentiment Distribution": result["Comparative Sentiment Score"]["Sentiment Distribution"],
                "Coverage Differences": result["Comparative Sentiment Score"]["Coverage Differences"],
                "Topic Overlap": result["Comparative Sentiment Score"]["Topic Overlap"]
            },
            "Final Sentiment Analysis": result["Final Sentiment Analysis"],
            "Audio": "[Play Hindi Speech]" if result.get("Audio") else "No audio available"
        }

        return formatted_output

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error generating example format: {str(e)}")

if __name__ == "__main__":
    uvicorn.run("api:app", host="0.0.0.0", port=8000, reload=True)
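As a usage sketch (not included in this commit), the speech endpoints defined above can be exercised in two steps — generate the Hindi audio, then fetch it from `/api/audio/{file_name}` — assuming the server is running locally on port 8000:

```python
import os
import requests

BASE = "http://localhost:8000"  # assumed local deployment of api.py

# Step 1: translate and synthesize Hindi speech for a short text
gen = requests.post(f"{BASE}/api/generate_speech",
                    json={"text": "Tesla reported strong quarterly results."},
                    timeout=120)
gen.raise_for_status()
audio_path = gen.json()["audio_file"]  # e.g. "audio_files/<uuid>.mp3"

# Step 2: download the generated file through the audio-serving endpoint
audio = requests.get(f"{BASE}/api/audio/{os.path.basename(audio_path)}", timeout=60)
audio.raise_for_status()
with open("hindi_summary.mp3", "wb") as f:
    f.write(audio.content)
print("Saved hindi_summary.mp3")
```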
app.py
ADDED
@@ -0,0 +1,496 @@
import streamlit as st
import requests
import pandas as pd
import json
import os
import matplotlib.pyplot as plt
import seaborn as sns
import base64
from io import BytesIO
from PIL import Image, ImageEnhance
import time
from typing import Dict, Any, List

# API Base URL - Change this to match your deployment
API_BASE_URL = "http://localhost:8000"

# New function to generate the example output format
def generate_example_output(company_name: str) -> str:
    """
    Generate output in the example format for the given company.
    Returns the formatted JSON as a string.
    """
    try:
        # Make API request to get the analysis data
        url = f"{API_BASE_URL}/api/complete_analysis"
        response = requests.post(url, json={"company_name": company_name})
        response.raise_for_status()
        data = response.json()

        # Format the data to match the example output format exactly
        formatted_output = {
            "Company": data["Company"],
            "Articles": data["Articles"],
            "Comparative Sentiment Score": {
                "Sentiment Distribution": data["Comparative Sentiment Score"]["Sentiment Distribution"],
                "Coverage Differences": data["Comparative Sentiment Score"]["Coverage Differences"],
                "Topic Overlap": data["Comparative Sentiment Score"]["Topic Overlap"]
            },
            "Final Sentiment Analysis": data["Final Sentiment Analysis"],
            "Audio": "[Play Hindi Speech]" if data.get("Audio") else "No audio available"
        }

        # Convert to JSON string with proper formatting
        return json.dumps(formatted_output, indent=2)

    except Exception as e:
        return json.dumps({
            "error": str(e),
            "message": "Failed to generate example output"
        }, indent=2)

# Function to run in terminal mode
def run_terminal_mode():
    """Run the app in terminal mode to output JSON."""
    print("News Analysis Terminal Mode")
    company_name = input("Enter company name: ")
    print(f"Analyzing {company_name}...")
    output = generate_example_output(company_name)
    print(output)

# Check if run directly or imported
if __name__ == "__main__":
    # Check if terminal mode is requested via command line args
    import sys
    if len(sys.argv) > 1 and sys.argv[1] == "--terminal":
        run_terminal_mode()
    else:
        # Continue with the Streamlit app

        # App title and description
        st.set_page_config(
            page_title="News Summarization & TTS",
            page_icon="📰",
            layout="wide",
            initial_sidebar_state="expanded"
        )

        # Custom CSS for better UI
        st.markdown("""
        <style>
            .main-header {
                font-size: 2.5rem;
                font-weight: 700;
                color: #1E3A8A;
                margin-bottom: 1rem;
            }
            .sub-header {
                font-size: 1.5rem;
                font-weight: 600;
                color: #2563EB;
                margin-top: 1rem;
                margin-bottom: 0.5rem;
            }
            .card {
                padding: 1.5rem;
                border-radius: 0.5rem;
                background-color: #F8FAFC;
                border: 1px solid #E2E8F0;
                margin-bottom: 1rem;
            }
            .positive {
                color: #059669;
                font-weight: 600;
            }
            .negative {
                color: #DC2626;
                font-weight: 600;
            }
            .neutral {
                color: #6B7280;
                font-weight: 600;
            }
            .topic-tag {
                display: inline-block;
                padding: 0.25rem 0.5rem;
                border-radius: 2rem;
                background-color: #E5E7EB;
                color: #1F2937;
                font-size: 0.75rem;
                margin-right: 0.5rem;
                margin-bottom: 0.5rem;
            }
            .audio-container {
                width: 100%;
                padding: 1rem;
                background-color: #F3F4F6;
                border-radius: 0.5rem;
                margin-top: 1rem;
            }
            .info-text {
                font-size: 0.9rem;
                color: #4B5563;
            }
            .article-title {
                font-size: 1.2rem;
                font-weight: 600;
                color: #111827;
                margin-bottom: 0.5rem;
                margin-top: 0.5rem;
            }
            .article-summary {
                font-size: 0.9rem;
                color: #374151;
                margin-bottom: 0.5rem;
            }
            .article-meta {
                font-size: 0.8rem;
                color: #6B7280;
                margin-bottom: 0.5rem;
            }
            .section-divider {
                height: 1px;
                background-color: #E5E7EB;
                margin: 1.5rem 0;
            }
            .chart-container {
                background-color: white;
                padding: 1rem;
                border-radius: 0.5rem;
                border: 1px solid #E2E8F0;
            }
        </style>
        """, unsafe_allow_html=True)

        # Function to make API requests
        def make_api_request(endpoint: str, data: Dict[str, Any] = None, method: str = "POST") -> Dict[str, Any]:
            """Make API request to the backend."""
            url = f"{API_BASE_URL}{endpoint}"

            try:
                if method == "GET":
                    response = requests.get(url)
                else:
                    response = requests.post(url, json=data)

                response.raise_for_status()
                return response.json()
            except requests.exceptions.ConnectionError:
                st.error("⚠️ Connection Error: Cannot connect to the API server. Please ensure the API server is running at " + API_BASE_URL)
                return {}
            except requests.exceptions.Timeout:
                st.error("⚠️ Timeout Error: The request took too long to complete. Please try again with a different company name.")
                return {}
            except requests.exceptions.HTTPError as e:
                if e.response.status_code == 404:
                    st.error("⚠️ No articles found for this company. Please try another company name.")
                elif e.response.status_code == 500:
                    # Try to get detailed error message
                    try:
                        error_detail = e.response.json().get("detail", "Unknown server error")
                        st.error(f"⚠️ Server Error: {error_detail}")
                    except:
                        st.error("⚠️ Internal Server Error: Something went wrong on the server. Please try again later.")
                else:
                    st.error(f"⚠️ HTTP Error: {str(e)}")
                return {}
            except Exception as e:
                st.error(f"⚠️ Error: {str(e)}")
                return {}

        # Function to create sentiment color
        def get_sentiment_color(sentiment: str) -> str:
            """Return CSS class for sentiment."""
            if sentiment == "Positive":
                return "positive"
            elif sentiment == "Negative":
                return "negative"
            else:
                return "neutral"

        # Function to create visualization for sentiment distribution
        def plot_sentiment_distribution(sentiment_data: Dict[str, int]):
            """Create and display a bar chart for sentiment distribution."""
            labels = ["Positive", "Neutral", "Negative"]
            values = [sentiment_data[label] for label in labels]
            colors = ["#059669", "#6B7280", "#DC2626"]

            fig, ax = plt.subplots(figsize=(10, 6))
            ax.bar(labels, values, color=colors)
            ax.set_title("Sentiment Distribution", fontsize=16, fontweight='bold')
            ax.set_ylabel("Number of Articles", fontsize=12)
            ax.grid(axis='y', linestyle='--', alpha=0.7)

            # Add value labels on top of bars
            for i, v in enumerate(values):
                ax.text(i, v + 0.1, str(v), ha='center', fontweight='bold')

            return fig

        # Function to display article information
        def display_article(article: Dict[str, Any], index: int):
            """Display article information in a card layout."""
            st.markdown("<div class='card'>", unsafe_allow_html=True)

            # Article title and sentiment
            sentiment = article.get("Sentiment", "Neutral")
            sentiment_class = get_sentiment_color(sentiment)

            st.markdown(f"<h3 class='article-title'>{index+1}. {article['Title']}</h3>", unsafe_allow_html=True)
            st.markdown(f"<span class='{sentiment_class}'>{sentiment}</span>", unsafe_allow_html=True)

            # Article summary
            st.markdown("<div class='article-summary'>", unsafe_allow_html=True)
            st.markdown(f"{article.get('Summary', 'No summary available.')}", unsafe_allow_html=True)
            st.markdown("</div>", unsafe_allow_html=True)

            # Topics
            if "Topics" in article and article["Topics"]:
                st.markdown("<div>", unsafe_allow_html=True)
                for topic in article["Topics"]:
                    st.markdown(f"<span class='topic-tag'>{topic}</span>", unsafe_allow_html=True)
                st.markdown("</div>", unsafe_allow_html=True)

            st.markdown("</div>", unsafe_allow_html=True)

        # App layout
        st.markdown("<h1 class='main-header'>📰 News Summarization & Text-to-Speech</h1>", unsafe_allow_html=True)
        st.markdown("""
        <p class='info-text'>
        This application extracts news articles about a company, performs sentiment analysis, conducts comparative analysis,
        and generates a text-to-speech output in Hindi. Enter a company name to get started.
        </p>
        """, unsafe_allow_html=True)

        # Sidebar
        st.sidebar.image("https://cdn-icons-png.flaticon.com/512/2593/2593073.png", width=100)
        st.sidebar.title("News Analysis Settings")

        # Company selection
        company_input_method = st.sidebar.radio(
            "Select company input method:",
            options=["Text Input", "Choose from List"]
        )

        if company_input_method == "Text Input":
            company_name = st.sidebar.text_input("Enter Company Name:", placeholder="e.g., Tesla")
        else:
            companies = ["Apple", "Google", "Microsoft", "Amazon", "Tesla", "Meta", "Netflix", "Uber", "Airbnb", "Twitter"]
            company_name = st.sidebar.selectbox("Select Company:", companies)

        # Analysis settings
        max_articles = st.sidebar.slider("Maximum Articles to Analyze:", min_value=5, max_value=20, value=10)
        st.sidebar.markdown("---")

        # Analysis button
        analyze_button = st.sidebar.button("Analyze Company News", type="primary")

        # Audio playback settings
        st.sidebar.markdown("## Audio Settings")
        audio_speed = st.sidebar.select_slider("TTS Speech Speed:", options=["Slow", "Normal", "Fast"], value="Normal")
        st.sidebar.markdown("---")

        # Add option to see JSON in example format
        st.sidebar.markdown("## Developer Options")
        show_json = st.sidebar.checkbox("Show JSON output in example format")
        st.sidebar.markdown("---")

        # About section
        with st.sidebar.expander("About This App"):
            st.markdown("""
            This application performs:
            - News extraction from multiple sources
            - Sentiment analysis of the content
            - Topic identification and comparative analysis
            - Text-to-speech conversion to Hindi

            Built with Streamlit, FastAPI, and various NLP tools.
            """)

        # Main content area
        if analyze_button and company_name:
            with st.spinner(f"Analyzing news for {company_name}... This may take a minute"):
                # Perform complete analysis
                response = make_api_request(
                    "/api/complete_analysis",
                    {"company_name": company_name}
                )

            if not response:
                st.error("Failed to retrieve data. Please try again.")
            elif "detail" in response:
                st.error(response["detail"])
            else:
                # Display company header
                st.markdown(f"<h2 class='sub-header'>Analysis Results for {response['Company']}</h2>", unsafe_allow_html=True)

                # Display sentiment summary
                col1, col2 = st.columns([2, 1])

                with col1:
                    st.markdown("<div class='card'>", unsafe_allow_html=True)
                    st.markdown("<h3 class='sub-header'>Sentiment Overview</h3>", unsafe_allow_html=True)
                    st.markdown(f"{response['Final Sentiment Analysis']}")
                    st.markdown("</div>", unsafe_allow_html=True)

                with col2:
                    sentiment_data = response["Comparative Sentiment Score"]["Sentiment Distribution"]
                    fig = plot_sentiment_distribution(sentiment_data)
                    st.pyplot(fig)

                st.markdown("<div class='section-divider'></div>", unsafe_allow_html=True)

                # Display Hindi TTS audio
                if "Audio" in response and response["Audio"]:
                    st.markdown("<h3 class='sub-header'>Hindi Audio Summary</h3>", unsafe_allow_html=True)

                    audio_message = response["Audio"]

                    if audio_message == "Failed to generate audio":
                        st.warning("Hindi audio could not be generated. However, you can still read the Hindi text below.")
                    else:
                        try:
                            # Check if the response contains the actual audio file path
                            audio_file_path = response.get("_audio_file_path")

                            if audio_file_path:
                                # Extract the filename
                                audio_filename = os.path.basename(audio_file_path)
                                audio_url = f"{API_BASE_URL}/api/audio/{audio_filename}"
                            else:
                                # If no path is provided, just display a message
                                st.info("Audio is available but the path was not provided.")
                                audio_url = None

                            if audio_url:
                                # Attempt to download the audio file
                                audio_response = requests.get(audio_url)
                                if audio_response.status_code == 200:
                                    # Save temporarily
                                    temp_audio_path = f"temp_audio_{os.path.basename(audio_url)}"
                                    with open(temp_audio_path, "wb") as f:
                                        f.write(audio_response.content)

                                    # Play from local file
                                    st.markdown("<div class='audio-container'>", unsafe_allow_html=True)
                                    st.audio(temp_audio_path, format="audio/mp3")

                                    # Display audio download link
                                    st.markdown(f"<a href='{audio_url}' download='hindi_summary.mp3'>Download Hindi Audio</a>", unsafe_allow_html=True)

                                    # Clean up temp file (optional)
                                    # os.remove(temp_audio_path)  # Uncomment to delete after use
                                else:
                                    st.warning(f"Unable to load audio file (HTTP {audio_response.status_code}). You can still read the Hindi text below.")
                            else:
                                st.info("Hindi audio summary would be available here.")
                        except Exception as e:
                            st.warning(f"Error playing audio: {str(e)}. You can still read the Hindi text below.")

                    # Display the Hindi text with better formatting
                    with st.expander("Show Hindi Text"):
                        hindi_text = response.get("Hindi Summary", "Hindi text not available.")

                        # Format the text for better readability
                        paragraphs = hindi_text.split("। ")

                        for paragraph in paragraphs:
                            if paragraph.strip():
                                # Add a period if it doesn't end with one
                                if not paragraph.strip().endswith("।"):
                                    paragraph += "।"
                                st.markdown(f"<p style='font-size: 16px; margin-bottom: 10px;'>{paragraph}</p>", unsafe_allow_html=True)

                    st.markdown("</div>", unsafe_allow_html=True)

                st.markdown("<div class='section-divider'></div>", unsafe_allow_html=True)

                # Display articles
                st.markdown("<h3 class='sub-header'>News Articles</h3>", unsafe_allow_html=True)
                articles = response.get("Articles", [])

                if not articles:
                    st.info("No articles found for this company.")
                else:
                    for i, article in enumerate(articles):
                        display_article(article, i)

                st.markdown("<div class='section-divider'></div>", unsafe_allow_html=True)

                # Display comparative analysis
                st.markdown("<h3 class='sub-header'>Comparative Analysis</h3>", unsafe_allow_html=True)

                # Display topic overlap
                topic_data = response["Comparative Sentiment Score"]["Topic Overlap"]

                col1, col2 = st.columns(2)

                with col1:
                    st.markdown("<div class='card'>", unsafe_allow_html=True)
                    st.markdown("<h4>Common Topics</h4>", unsafe_allow_html=True)

                    common_topics = topic_data.get("Common Topics Across All", [])
                    if common_topics:
                        for topic in common_topics:
                            st.markdown(f"<span class='topic-tag'>{topic}</span>", unsafe_allow_html=True)
                    else:
                        st.info("No common topics found across articles.")

                    st.markdown("</div>", unsafe_allow_html=True)

                with col2:
                    st.markdown("<div class='card'>", unsafe_allow_html=True)
                    st.markdown("<h4>Coverage Comparison</h4>", unsafe_allow_html=True)

                    comparisons = response["Comparative Sentiment Score"].get("Coverage Differences", [])
                    if comparisons:
                        for i, comparison in enumerate(comparisons[:3]):  # Show only top 3 comparisons
                            st.markdown(f"<p><strong>{i+1}.</strong> {comparison.get('Comparison', '')}</p>", unsafe_allow_html=True)
                            st.markdown(f"<p class='info-text'>{comparison.get('Impact', '')}</p>", unsafe_allow_html=True)
                    else:
                        st.info("No comparative insights available.")

                    st.markdown("</div>", unsafe_allow_html=True)

                # Display full comparison in expander
                with st.expander("View All Comparisons"):
                    comparisons = response["Comparative Sentiment Score"].get("Coverage Differences", [])
                    for i, comparison in enumerate(comparisons):
                        st.markdown(f"<p><strong>{i+1}.</strong> {comparison.get('Comparison', '')}</p>", unsafe_allow_html=True)
                        st.markdown(f"<p class='info-text'>{comparison.get('Impact', '')}</p>", unsafe_allow_html=True)
                        st.markdown("<hr>", unsafe_allow_html=True)

                # Show JSON in example format if requested
                if show_json:
                    st.markdown("<div class='section-divider'></div>", unsafe_allow_html=True)
                    st.markdown("<h3 class='sub-header'>Example JSON Format</h3>", unsafe_allow_html=True)

                    # Get the formatted JSON
                    json_output = generate_example_output(company_name)

                    # Display the JSON in a code block
                    st.code(json_output, language="json")
        else:
            # Display placeholder
            st.markdown("<div class='card'>", unsafe_allow_html=True)
            st.markdown("<h3 class='sub-header'>Enter a Company Name to Begin Analysis</h3>", unsafe_allow_html=True)
            st.markdown("""
            <p class='info-text'>
            This application will:
            </p>
            <ul class='info-text'>
                <li>Extract news articles from multiple sources</li>
                <li>Analyze sentiment (positive, negative, neutral)</li>
                <li>Identify key topics in each article</li>
                <li>Perform comparative analysis across articles</li>
                <li>Generate Hindi speech output summarizing the findings</li>
            </ul>
            """, unsafe_allow_html=True)
            st.markdown("</div>", unsafe_allow_html=True)

            # Sample output image
            st.image("https://miro.medium.com/max/1400/1*Ger-949PgQnaje2oa9XMdw.png", caption="Sample sentiment analysis visualization")

        # Footer
        st.markdown("<div class='section-divider'></div>", unsafe_allow_html=True)
        st.markdown("<p class='info-text' style='text-align: center;'>News Summarization & Text-to-Speech Application | Developed with Streamlit and FastAPI</p>", unsafe_allow_html=True)
generate_json_output.py
ADDED
@@ -0,0 +1,55 @@
#!/usr/bin/env python

import requests
import json
import sys

def generate_json_output(company_name, api_url="http://localhost:8000"):
    """
    Generate output in the example format for the given company.

    Args:
        company_name (str): Name of the company to analyze
        api_url (str): Base URL of the API

    Returns:
        str: Formatted JSON string
    """
    try:
        # Make API request to get the analysis data
        url = f"{api_url}/api/complete_analysis"
        response = requests.post(url, json={"company_name": company_name})
        response.raise_for_status()
        data = response.json()

        # Format the data to match the example output format exactly
        formatted_output = {
            "Company": data["Company"],
            "Articles": data["Articles"],
            "Comparative Sentiment Score": {
                "Sentiment Distribution": data["Comparative Sentiment Score"]["Sentiment Distribution"],
                "Coverage Differences": data["Comparative Sentiment Score"]["Coverage Differences"],
                "Topic Overlap": data["Comparative Sentiment Score"]["Topic Overlap"]
            },
            "Final Sentiment Analysis": data["Final Sentiment Analysis"],
            "Audio": "[Play Hindi Speech]" if data.get("Audio") else "No audio available"
        }

        # Convert to JSON string with proper formatting
        return json.dumps(formatted_output, indent=2)

    except Exception as e:
        return json.dumps({
            "error": str(e),
            "message": "Failed to generate example output"
        }, indent=2)

if __name__ == "__main__":
    # Get company name from command line arguments or prompt for it
    if len(sys.argv) > 1:
        company_name = sys.argv[1]
    else:
        company_name = input("Enter company name: ")

    print(f"Input:\nCompany Name: {company_name}")
    print("Output:", generate_json_output(company_name))
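For example, `python generate_json_output.py Tesla` prints the same JSON structure returned by `/api/complete_analysis`; with no argument, the script prompts for a company name.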
healthcheck.py
ADDED
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Healthcheck script to verify the functionality of all components of the application.
|
3 |
+
Run this script to check if all dependencies are correctly installed and working.
|
4 |
+
"""
|
5 |
+
|
6 |
+
import sys
|
7 |
+
import os
|
8 |
+
import time
|
9 |
+
import traceback
|
10 |
+
|
11 |
+
def run_checks():
    print("Starting health check for News Summarization and TTS Application...")
    checks_passed = 0
    checks_failed = 0

    # Check 1: Verify imports
    print("\n1. Checking imports...")
    try:
        # Standard libraries
        import json
        import re

        # Web and API dependencies
        import requests
        import fastapi
        import uvicorn
        import streamlit

        # Data processing
        import pandas
        import numpy
        import bs4

        # NLP
        import nltk
        import networkx
        from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

        # ML and Transformers
        import torch
        import transformers
        from transformers import pipeline

        # TTS and Translation
        import deep_translator
        from deep_translator import GoogleTranslator
        import gtts
        import pyttsx3

        print("✅ All imports successful.")
        checks_passed += 1
    except ImportError as e:
        print(f"❌ Import error: {str(e)}")
        print(f"Traceback: {traceback.format_exc()}")
        checks_failed += 1

    # Check 2: Verify NLTK data
    print("\n2. Checking NLTK data...")
    try:
        import nltk
        nltk.data.find('tokenizers/punkt')
        nltk.data.find('corpora/stopwords')
        print("✅ NLTK data verified.")
        checks_passed += 1
    except LookupError as e:
        print(f"❌ NLTK data error: {str(e)}")
        print("Trying to download necessary NLTK data...")
        try:
            nltk.download('punkt')
            nltk.download('stopwords')
            print("✅ NLTK data downloaded successfully.")
            checks_passed += 1
        except Exception as e:
            print(f"❌ Failed to download NLTK data: {str(e)}")
            checks_failed += 1

    # Check 3: Test translation
    print("\n3. Testing translation service...")
    try:
        from deep_translator import GoogleTranslator
        translator = GoogleTranslator(source='en', target='hi')
        text = "Hello, this is a test."
        translated = translator.translate(text)
        print(f"Original text: {text}")
        print(f"Translated text: {translated}")
        if translated and len(translated) > 0:
            print("✅ Translation service working.")
            checks_passed += 1
        else:
            print("❌ Translation returned empty result.")
            checks_failed += 1
    except Exception as e:
        print(f"❌ Translation error: {str(e)}")
        print(f"Traceback: {traceback.format_exc()}")
        checks_failed += 1

    # Check 4: Test TTS
    print("\n4. Testing Text-to-Speech service...")
    try:
        from gtts import gTTS
        test_text = "परीक्षण पाठ"  # "Test text" in Hindi
        test_file = 'test_audio.mp3'

        # Try gTTS
        tts = gTTS(text=test_text, lang='hi', slow=False)
        tts.save(test_file)

        if os.path.exists(test_file) and os.path.getsize(test_file) > 0:
            print("✅ gTTS service working.")
            # Clean up test file
            try:
                os.remove(test_file)
            except:
                pass
            checks_passed += 1
        else:
            print("❌ gTTS failed to generate a valid audio file.")
            checks_failed += 1
    except Exception as e:
        print(f"❌ Text-to-Speech error: {str(e)}")
        print(f"Traceback: {traceback.format_exc()}")
        checks_failed += 1

    # Check 5: Test sentiment analysis
    print("\n5. Testing sentiment analysis...")
    try:
        from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
        analyzer = SentimentIntensityAnalyzer()
        test_text = "This product is excellent and I love it!"
        scores = analyzer.polarity_scores(test_text)
        print(f"Sentiment scores for '{test_text}': {scores}")
        if 'compound' in scores:
            print("✅ Sentiment analysis working.")
            checks_passed += 1
        else:
            print("❌ Sentiment analysis returned unexpected result.")
            checks_failed += 1
    except Exception as e:
        print(f"❌ Sentiment analysis error: {str(e)}")
        print(f"Traceback: {traceback.format_exc()}")
        checks_failed += 1

    # Check 6: Test Transformers
    print("\n6. Testing Transformer models...")
    try:
        from transformers import pipeline
        sentiment_task = pipeline("sentiment-analysis", return_all_scores=False)
        result = sentiment_task("I love using this application!")
        print(f"Transformer sentiment analysis result: {result}")
        print("✅ Transformer models working.")
        checks_passed += 1
    except Exception as e:
        print(f"❌ Transformer models error: {str(e)}")
        print(f"Traceback: {traceback.format_exc()}")
        checks_failed += 1

    # Summary
    print("\n" + "="*50)
    print(f"Health Check Summary: {checks_passed} checks passed, {checks_failed} checks failed")

    if checks_failed == 0:
        print("\n✅ All systems operational! The application should run correctly.")
        return True
    else:
        print("\n❌ Some checks failed. Please review the errors above.")
        return False

if __name__ == "__main__":
    success = run_checks()
    if not success:
        sys.exit(1)
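The health check is meant to be run as a standalone script before the Space starts its services. A minimal sketch of wiring it into a startup step (this wrapper is hypothetical and not part of the upload; it only assumes healthcheck.py sits in the working directory and exits with status 1 on failure, as above):

```python
# startup_check.py - hypothetical wrapper around healthcheck.py
import subprocess
import sys

# Run the health check in a separate process so a crash in any heavy
# dependency (torch, transformers, pyttsx3) does not take down the caller.
result = subprocess.run([sys.executable, "healthcheck.py"])

if result.returncode != 0:
    # run_checks() returned False, so healthcheck.py exited with status 1
    print("Dependency check failed; refusing to start the app.")
    sys.exit(result.returncode)

print("All checks passed; starting the Streamlit/FastAPI services...")
```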
requirements.txt
ADDED
@@ -0,0 +1,39 @@
# Core dependencies
streamlit==1.27.0
fastapi==0.103.1
uvicorn==0.23.2
requests==2.31.0
beautifulsoup4==4.12.2
pandas==2.1.0
numpy==1.25.2
scipy==1.10.1

# NLP and Sentiment Analysis
transformers==4.33.1
torch==2.0.1
nltk==3.8.1
vaderSentiment==3.3.2

# Text-to-Speech
gTTS==2.3.2
pyttsx3==2.90
deep-translator==1.11.4

# Data Processing and Visualization
matplotlib==3.7.3
seaborn==0.12.2
scikit-learn==1.3.0
networkx==3.1

# API and Web
aiohttp==3.8.5
httpx==0.24.1
pydantic==2.3.0
python-dotenv==1.0.0
python-multipart==0.0.6

# HuggingFace Spaces
huggingface-hub==0.16.4

# Added from the code block
pydub==0.25.1
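Since every dependency above is pinned to an exact version, a small sketch like the following can confirm the environment matches the pins before deployment (this helper is hypothetical, not part of the upload; it only relies on the standard-library importlib.metadata):

```python
# verify_pins.py - hypothetical helper to compare installed versions against requirements.txt
from importlib.metadata import version, PackageNotFoundError

def check_requirements(path="requirements.txt"):
    for line in open(path):
        line = line.strip()
        # Skip comments, blank lines, and anything that is not an exact pin
        if not line or line.startswith("#") or "==" not in line:
            continue
        name, _, pinned = line.partition("==")
        try:
            installed = version(name)
        except PackageNotFoundError:
            print(f"MISSING  {name} (want {pinned})")
            continue
        status = "OK      " if installed == pinned else "MISMATCH"
        print(f"{status} {name}: installed {installed}, pinned {pinned}")

if __name__ == "__main__":
    check_requirements()
```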
utils.py
ADDED
@@ -0,0 +1,1132 @@
import requests
import re
import os
import json
import time
from typing import List, Dict, Any, Tuple, Optional
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.cluster.util import cosine_distance
import networkx as nx
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from collections import Counter
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
from deep_translator import GoogleTranslator
from gtts import gTTS
import pyttsx3

# Download necessary NLTK data
import nltk
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('punkt')
    nltk.download('stopwords')

# Initialize sentiment analyzer
vader_analyzer = SentimentIntensityAnalyzer()

# Initialize advanced sentiment model
sentiment_model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)
sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_name)
advanced_sentiment = pipeline("sentiment-analysis", model=sentiment_model, tokenizer=sentiment_tokenizer)

# Initialize translator
translator = GoogleTranslator(source='en', target='hi')

+
class NewsArticle:
|
43 |
+
def __init__(self, title: str, url: str, content: str, summary: str = "", source: str = "",
|
44 |
+
date: str = "", sentiment: str = "", topics: List[str] = None):
|
45 |
+
self.title = title
|
46 |
+
self.url = url
|
47 |
+
self.content = content
|
48 |
+
self.summary = summary if summary else self.generate_summary(content)
|
49 |
+
self.source = source
|
50 |
+
self.date = date
|
51 |
+
self.sentiment = sentiment if sentiment else self.analyze_sentiment(content, title)
|
52 |
+
self.topics = topics if topics else self.extract_topics(content)
|
53 |
+
|
54 |
+
def to_dict(self) -> Dict[str, Any]:
|
55 |
+
return {
|
56 |
+
"title": self.title,
|
57 |
+
"url": self.url,
|
58 |
+
"content": self.content,
|
59 |
+
"summary": self.summary,
|
60 |
+
"source": self.source,
|
61 |
+
"date": self.date,
|
62 |
+
"sentiment": self.sentiment,
|
63 |
+
"topics": self.topics
|
64 |
+
}
|
65 |
+
|
66 |
+
@staticmethod
|
67 |
+
def analyze_sentiment(text: str, title: str = "") -> str:
|
68 |
+
"""
|
69 |
+
Analyze sentiment using a combination of methods for more accurate results.
|
70 |
+
We give more weight to the title sentiment and use advanced model when possible.
|
71 |
+
"""
|
72 |
+
# Set thresholds for VADER sentiment
|
73 |
+
threshold_positive = 0.05 # Default 0.05
|
74 |
+
threshold_negative = -0.05 # Default -0.05
|
75 |
+
|
76 |
+
# Use VADER for basic sentiment analysis on both title and content
|
77 |
+
try:
|
78 |
+
title_scores = vader_analyzer.polarity_scores(title) if title else {'compound': 0}
|
79 |
+
content_scores = vader_analyzer.polarity_scores(text)
|
80 |
+
|
81 |
+
# Weight the title more heavily (title sentiment is often more reliable)
|
82 |
+
title_weight = 0.6 if title else 0
|
83 |
+
content_weight = 1.0 - title_weight
|
84 |
+
|
85 |
+
compound_score = (title_weight * title_scores['compound']) + (content_weight * content_scores['compound'])
|
86 |
+
|
87 |
+
# Try to use the advanced model for additional insight (for short texts)
|
88 |
+
advanced_result = None
|
89 |
+
advanced_score = 0
|
90 |
+
|
91 |
+
try:
|
92 |
+
# Use title + first part of content for advanced model
|
93 |
+
sample_text = title + ". " + text[:300] if title else text[:300]
|
94 |
+
advanced_result = advanced_sentiment(sample_text)[0]
|
95 |
+
|
96 |
+
# Map advanced model results to a -1 to 1 scale similar to VADER
|
97 |
+
label = advanced_result['label']
|
98 |
+
confidence = advanced_result['score']
|
99 |
+
|
100 |
+
# Map the 1-5 star rating to a -1 to 1 scale
|
101 |
+
if label == '1 star' or label == '2 stars':
|
102 |
+
advanced_score = -confidence
|
103 |
+
elif label == '4 stars' or label == '5 stars':
|
104 |
+
advanced_score = confidence
|
105 |
+
else: # 3 stars is neutral
|
106 |
+
advanced_score = 0
|
107 |
+
|
108 |
+
# Combine VADER and advanced model scores
|
109 |
+
# Give more weight to advanced model when confidence is high
|
110 |
+
if confidence > 0.8:
|
111 |
+
compound_score = (0.4 * compound_score) + (0.6 * advanced_score)
|
112 |
+
else:
|
113 |
+
compound_score = (0.7 * compound_score) + (0.3 * advanced_score)
|
114 |
+
|
115 |
+
except Exception as e:
|
116 |
+
print(f"Advanced sentiment analysis failed: {str(e)}")
|
117 |
+
# Continue with just VADER if advanced model fails
|
118 |
+
pass
|
119 |
+
|
120 |
+
# Fine-grained sentiment mapping
|
121 |
+
if compound_score >= 0.3:
|
122 |
+
return "Positive"
|
123 |
+
elif compound_score >= threshold_positive:
|
124 |
+
return "Slightly Positive"
|
125 |
+
elif compound_score <= -0.3:
|
126 |
+
return "Negative"
|
127 |
+
elif compound_score <= threshold_negative:
|
128 |
+
return "Slightly Negative"
|
129 |
+
else:
|
130 |
+
return "Neutral"
|
131 |
+
|
132 |
+
except Exception as e:
|
133 |
+
print(f"Sentiment analysis error: {str(e)}")
|
134 |
+
return "Neutral" # Default fallback
|
135 |
+
|
136 |
+
@staticmethod
|
137 |
+
def generate_summary(text: str, num_sentences: int = 5) -> str:
|
138 |
+
# Generate summary using extractive summarization
|
139 |
+
if not text or len(text) < 100:
|
140 |
+
return text
|
141 |
+
|
142 |
+
# Tokenize sentences
|
143 |
+
sentences = sent_tokenize(text)
|
144 |
+
if len(sentences) <= num_sentences:
|
145 |
+
return text
|
146 |
+
|
147 |
+
# Calculate sentence similarity and rank them
|
148 |
+
similarity_matrix = build_similarity_matrix(sentences)
|
149 |
+
scores = nx.pagerank(nx.from_numpy_array(similarity_matrix))
|
150 |
+
|
151 |
+
# Select top sentences
|
152 |
+
ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
|
153 |
+
summary_sentences = [ranked_sentences[i][1] for i in range(min(num_sentences, len(ranked_sentences)))]
|
154 |
+
|
155 |
+
# Maintain original order
|
156 |
+
original_order = []
|
157 |
+
for sentence in sentences:
|
158 |
+
if sentence in summary_sentences and sentence not in original_order:
|
159 |
+
original_order.append(sentence)
|
160 |
+
if len(original_order) >= num_sentences:
|
161 |
+
break
|
162 |
+
|
163 |
+
return " ".join(original_order)
|
164 |
+
|
165 |
+
@staticmethod
|
166 |
+
def extract_topics(text: str, num_topics: int = 5) -> List[str]:
|
167 |
+
# Extract key topics from text based on term frequency
|
168 |
+
stop_words = set(stopwords.words('english'))
|
169 |
+
words = word_tokenize(text.lower())
|
170 |
+
|
171 |
+
# Filter out stopwords and short words
|
172 |
+
filtered_words = [word for word in words if word.isalpha() and word not in stop_words and len(word) > 3]
|
173 |
+
|
174 |
+
# Count word frequencies
|
175 |
+
word_counts = Counter(filtered_words)
|
176 |
+
|
177 |
+
# Return most common words as topics
|
178 |
+
topics = [word for word, _ in word_counts.most_common(num_topics)]
|
179 |
+
return topics
|
180 |
+
|
def build_similarity_matrix(sentences: List[str]) -> np.ndarray:
    """Build similarity matrix for sentences based on cosine similarity."""
    # Number of sentences
    n = len(sentences)

    # Initialize similarity matrix
    similarity_matrix = np.zeros((n, n))

    # Calculate similarity between each pair of sentences
    for i in range(n):
        for j in range(n):
            if i != j:
                similarity_matrix[i][j] = sentence_similarity(sentences[i], sentences[j])

    return similarity_matrix

def sentence_similarity(sent1: str, sent2: str) -> float:
    """Calculate similarity between two sentences using cosine similarity."""
    # Tokenize sentences
    words1 = [word.lower() for word in word_tokenize(sent1) if word.isalpha()]
    words2 = [word.lower() for word in word_tokenize(sent2) if word.isalpha()]

    # Get all unique words
    all_words = list(set(words1 + words2))

    # Create word vectors
    vector1 = [1 if word in words1 else 0 for word in all_words]
    vector2 = [1 if word in words2 else 0 for word in all_words]

    # Calculate cosine similarity
    if not any(vector1) or not any(vector2):
        return 0.0

    return 1 - cosine_distance(vector1, vector2)

def search_news(company_name: str, num_articles: int = 10) -> List[NewsArticle]:
    """Search for news articles about a given company."""
    # List to store articles
    articles = []

    # Define search queries and news sources
    search_queries = [
        f"{company_name} news",
        f"{company_name} financial news",
        f"{company_name} business news",
        f"{company_name} recent news",
        f"{company_name} company news",
        f"{company_name} stock",
        f"{company_name} market"
    ]

    # Updated news sources with more reliable sources
    news_sources = [
        {
            "base_url": "https://finance.yahoo.com/quote/",
            "article_patterns": ["news", "finance", "articles"],
            "direct_access": True
        },
        {
            "base_url": "https://www.reuters.com/search/news?blob=",
            "article_patterns": ["article", "business", "companies", "markets"],
            "direct_access": False
        },
        {
            "base_url": "https://www.marketwatch.com/search?q=",
            "article_patterns": ["story", "articles", "news"],
            "direct_access": False
        },
        {
            "base_url": "https://www.fool.com/search?q=",
            "article_patterns": ["article", "investing", "stock"],
            "direct_access": False
        },
        {
            "base_url": "https://seekingalpha.com/search?q=",
            "article_patterns": ["article", "news", "stock", "analysis"],
            "direct_access": False
        },
        {
            "base_url": "https://www.zacks.com/search.php?q=",
            "article_patterns": ["stock", "research", "analyst"],
            "direct_access": False
        },
        {
            "base_url": "https://economictimes.indiatimes.com/search?q=",
            "article_patterns": ["articleshow", "news", "industry"],
            "direct_access": False
        },
        {
            "base_url": "https://www.bloomberg.com/search?query=",
            "article_patterns": ["news", "articles"],
            "direct_access": False
        }
    ]

    print(f"Starting search for news about {company_name}...")

    # Search each source with each query until we have enough articles
    for query in search_queries:
        if len(articles) >= num_articles:
            break

        for source in news_sources:
            if len(articles) >= num_articles:
                break

            try:
                source_base = source["base_url"]
                article_patterns = source["article_patterns"]
                direct_access = source["direct_access"]

                # Construct search URL
                if direct_access:
                    # Try to fetch the stock symbol for Yahoo Finance
                    if "yahoo" in source_base:
                        try:
                            # First try the company name directly (for known tickers)
                            search_url = f"{source_base}{company_name}/news"
                            print(f"Trying direct ticker access: {search_url}")

                            # Fetch to check if valid
                            headers = {
                                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
                            }
                            test_response = requests.get(search_url, headers=headers, timeout=10)

                            # If we got a 404, try searching for the symbol first
                            if test_response.status_code == 404:
                                print("Company name not a valid ticker, searching for symbol...")
                                symbol_url = f"https://finance.yahoo.com/lookup?s={company_name}"
                                symbol_response = requests.get(symbol_url, headers=headers, timeout=10)

                                if symbol_response.status_code == 200:
                                    symbol_soup = BeautifulSoup(symbol_response.text, 'html.parser')
                                    # Try to find the first stock symbol result
                                    symbol_row = symbol_soup.select_one("tr.data-row0")
                                    if symbol_row:
                                        symbol_cell = symbol_row.select_one("td:first-child a")
                                        if symbol_cell:
                                            symbol = symbol_cell.text.strip()
                                            search_url = f"{source_base}{symbol}/news"
                                            print(f"Found symbol {symbol}, using URL: {search_url}")
                        except Exception as e:
                            print(f"Error getting stock symbol: {str(e)}")
                            search_url = f"{source_base}{company_name}/news"
                    else:
                        search_url = f"{source_base}{company_name}/news"
                else:
                    search_url = f"{source_base}{query.replace(' ', '+')}"

                print(f"Searching {search_url}")

                # Fetch search results with retry mechanism
                max_retries = 3
                retry_count = 0
                response = None

                while retry_count < max_retries:
                    try:
                        headers = {
                            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
                            "Accept": "text/html,application/xhtml+xml,application/xml",
                            "Accept-Language": "en-US,en;q=0.9",
                            "Referer": "https://www.google.com/"
                        }
                        response = requests.get(search_url, headers=headers, timeout=15)
                        if response.status_code == 200:
                            break
                        retry_count += 1
                        print(f"Retry {retry_count}/{max_retries} for {search_url} (status: {response.status_code})")
                        time.sleep(1)  # Short delay before retry
                    except Exception as e:
                        retry_count += 1
                        print(f"Request error (attempt {retry_count}/{max_retries}): {str(e)}")
                        time.sleep(1)

                if not response or response.status_code != 200:
                    print(f"Failed to fetch results from {search_url} after {max_retries} attempts")
                    continue

                soup = BeautifulSoup(response.text, 'html.parser')

                # Extract article links - using more flexible patterns
                links = soup.find_all('a', href=True)
                article_links = []

                # Domain for resolving relative URLs
                domain = response.url.split('/')[0] + '//' + response.url.split('/')[2]
                print(f"Domain for resolving URLs: {domain}")

                for link in links:
                    href = link['href']
                    link_text = link.text.strip()

                    # Skip empty links or navigation elements
                    if not link_text or len(link_text) < 10 or href.startswith('#'):
                        continue

                    # Check if the link matches any of our article patterns
                    is_article_link = False
                    for pattern in article_patterns:
                        if pattern in href.lower():
                            is_article_link = True
                            break

                    # Check for the company name in link text or URL (less restrictive now)
                    contains_company = (
                        company_name.lower() in link_text.lower() or
                        company_name.lower() in href.lower()
                    )

                    if is_article_link or contains_company:
                        # Convert relative URLs to absolute
                        if href.startswith('/'):
                            href = f"{domain}{href}"
                        elif not href.startswith(('http://', 'https://')):
                            href = f"{domain}/{href}"

                        # Avoid duplicates
                        if href not in article_links:
                            article_links.append(href)
                            print(f"Found potential article: {link_text[:50]}... at {href}")

                print(f"Found {len(article_links)} potential article links from {search_url}")

                # Process each article link
                for link in article_links[:5]:  # Increased from 3 to 5
                    if len(articles) >= num_articles:
                        break

                    try:
                        print(f"Fetching article: {link}")
                        article_response = requests.get(link, headers=headers, timeout=15)

                        if article_response.status_code != 200:
                            print(f"Failed to fetch article: {article_response.status_code}")
                            continue

                        article_soup = BeautifulSoup(article_response.text, 'html.parser')

                        # Extract article title - more robust method
                        title = None

                        # Try different elements that could contain the title
                        for title_tag in ['h1', 'h2', '.headline', '.title', 'title']:
                            if title:
                                break

                            if title_tag.startswith('.'):
                                elements = article_soup.select(title_tag)
                            else:
                                elements = article_soup.find_all(title_tag)

                            for element in elements:
                                candidate = element.text.strip()
                                if len(candidate) > 5 and len(candidate) < 200:  # Reasonable title length
                                    title = candidate
                                    break

                        if not title:
                            print("Could not find a suitable title")
                            continue

                        # Check if title contains company name (case insensitive)
                        if company_name.lower() not in title.lower():
                            # Try alternative check - sometimes the title doesn't explicitly mention the company
                            meta_description = article_soup.find('meta', attrs={'name': 'description'}) or \
                                               article_soup.find('meta', attrs={'property': 'og:description'})

                            if meta_description and 'content' in meta_description.attrs:
                                meta_text = meta_description['content']
                                if company_name.lower() not in meta_text.lower():
                                    # One more check in the page content
                                    page_text = article_soup.get_text().lower()
                                    company_mentions = page_text.count(company_name.lower())
                                    if company_mentions < 2:  # Require at least 2 mentions
                                        print(f"Article doesn't seem to be about {company_name}: {title}")
                                        continue

                        # Extract article content - improved method
                        content = ""

                        # Try multiple content extraction strategies
                        content_containers = []

                        # 1. Look for article/main content containers
                        for container in ['article', 'main', '.article-body', '.story-body', '.story-content',
                                          '.article-content', '.content-body', '.entry-content']:
                            if container.startswith('.'):
                                elements = article_soup.select(container)
                            else:
                                elements = article_soup.find_all(container)

                            content_containers.extend(elements)

                        # 2. If no specific containers, fallback to div with article-like classes
                        if not content_containers:
                            for div in article_soup.find_all('div', class_=True):
                                classes = div.get('class', [])
                                for cls in classes:
                                    if any(term in cls.lower() for term in ['article', 'story', 'content', 'body', 'text']):
                                        content_containers.append(div)
                                        break

                        # 3. Extract paragraphs from containers
                        processed_paragraphs = set()  # To avoid duplicates

                        for container in content_containers:
                            for p in container.find_all('p'):
                                p_text = p.text.strip()
                                # Avoid very short or duplicate paragraphs
                                if len(p_text) > 30 and p_text not in processed_paragraphs:
                                    content += p_text + " "
                                    processed_paragraphs.add(p_text)

                        # 4. If still no content, try all paragraphs
                        if not content:
                            for p in article_soup.find_all('p'):
                                p_text = p.text.strip()
                                if len(p_text) > 30 and p_text not in processed_paragraphs:
                                    content += p_text + " "
                                    processed_paragraphs.add(p_text)

                        content = content.strip()

                        # Skip if content is too short
                        if len(content) < 300:  # Reduced from 500 to be less restrictive
                            print(f"Article content too short: {len(content)} characters")
                            continue

                        # Extract source name - more robust method
                        source = None

                        # Try to get from meta tags
                        meta_site_name = article_soup.find('meta', attrs={'property': 'og:site_name'})
                        if meta_site_name and 'content' in meta_site_name.attrs:
                            source = meta_site_name['content']
                        else:
                            # Extract from URL
                            try:
                                from urllib.parse import urlparse
                                parsed_url = urlparse(link)
                                source = parsed_url.netloc
                            except:
                                source = response.url.split('/')[2]

                        # Extract date - improved method
                        date = ""

                        # Try multiple date extraction strategies
                        # 1. Look for time element
                        date_tag = article_soup.find('time')

                        # 2. Look for meta tags with date
                        if not date and (not date_tag or not date_tag.get('datetime')):
                            for meta_name in ['article:published_time', 'date', 'publish-date', 'article:modified_time']:
                                meta_date = article_soup.find('meta', attrs={'property': meta_name}) or \
                                            article_soup.find('meta', attrs={'name': meta_name})

                                if meta_date and 'content' in meta_date.attrs:
                                    date = meta_date['content']
                                    break

                        # 3. Look for spans/divs with date-related classes
                        if not date:
                            date_classes = ['date', 'time', 'published', 'posted', 'datetime']
                            for cls in date_classes:
                                elements = article_soup.find_all(['span', 'div', 'p'], class_=lambda x: x and cls.lower() in x.lower())
                                if elements:
                                    date = elements[0].text.strip()
                                    break

                        # If we got this far, we have a valid article
                        print(f"Successfully extracted article: {title}")

                        # Create article object and add to list
                        article = NewsArticle(
                            title=title,
                            url=link,
                            content=content,
                            source=source,
                            date=date
                        )

                        # Check if similar article already exists to avoid duplicates
                        is_duplicate = False
                        for existing_article in articles:
                            if sentence_similarity(existing_article.title, title) > 0.7:  # Lowered threshold
                                is_duplicate = True
                                print(f"Found duplicate article: {title}")
                                break

                        if not is_duplicate:
                            articles.append(article)
                            print(f"Added article: {title}")

                    except Exception as e:
                        print(f"Error processing article {link}: {str(e)}")
                        continue

            except Exception as e:
                print(f"Error searching {source_base} with query {query}: {str(e)}")
                continue

    # If we couldn't find enough articles, create some dummy articles to prevent errors
    if not articles and num_articles > 0:
        print(f"No articles found for {company_name}. Creating a dummy article to prevent errors.")

        dummy_article = NewsArticle(
            title=f"{company_name} Information",
            url="#",
            content=f"Information about {company_name} was not found or could not be retrieved. This is a placeholder.",
            source="System",
            date="",
            sentiment="Neutral",
            topics=["information", "company", "placeholder"]
        )

        articles.append(dummy_article)

    # Return collected articles
    print(f"Returning {len(articles)} articles for {company_name}")
    return articles[:num_articles]

def analyze_article_sentiment(article: NewsArticle) -> Dict[str, Any]:
    """Perform detailed sentiment analysis on an article."""
    # Use VADER for paragraph-level sentiment
    paragraphs = article.content.split('\n')
    paragraph_sentiments = []

    overall_scores = {
        'pos': 0,
        'neg': 0,
        'neu': 0,
        'compound': 0
    }

    for paragraph in paragraphs:
        if len(paragraph.strip()) < 20:  # Skip short paragraphs
            continue

        scores = vader_analyzer.polarity_scores(paragraph)
        paragraph_sentiments.append({
            'text': paragraph[:100] + '...' if len(paragraph) > 100 else paragraph,
            'scores': scores
        })

        overall_scores['pos'] += scores['pos']
        overall_scores['neg'] += scores['neg']
        overall_scores['neu'] += scores['neu']
        overall_scores['compound'] += scores['compound']

    num_paragraphs = len(paragraph_sentiments)
    if num_paragraphs > 0:
        overall_scores['pos'] /= num_paragraphs
        overall_scores['neg'] /= num_paragraphs
        overall_scores['neu'] /= num_paragraphs
        overall_scores['compound'] /= num_paragraphs

    # Use advanced model for overall sentiment
    try:
        # Truncate content if too long
        truncated_content = article.content[:512] if len(article.content) > 512 else article.content
        advanced_result = advanced_sentiment(truncated_content)[0]
        advanced_sentiment_label = advanced_result['label']
        advanced_confidence = advanced_result['score']
    except Exception as e:
        print(f"Error with advanced sentiment analysis: {str(e)}")
        advanced_sentiment_label = "Error"
        advanced_confidence = 0.0

    # Determine final sentiment
    if overall_scores['compound'] >= 0.05:
        final_sentiment = "Positive"
    elif overall_scores['compound'] <= -0.05:
        final_sentiment = "Negative"
    else:
        final_sentiment = "Neutral"

    return {
        'article_title': article.title,
        'overall_sentiment': final_sentiment,
        'vader_scores': overall_scores,
        'advanced_sentiment': {
            'label': advanced_sentiment_label,
            'confidence': advanced_confidence
        },
        'paragraph_analysis': paragraph_sentiments,
        'positive_ratio': overall_scores['pos'],
        'negative_ratio': overall_scores['neg'],
        'neutral_ratio': overall_scores['neu']
    }

def perform_comparative_analysis(articles: List[NewsArticle]) -> Dict[str, Any]:
    """Perform comparative analysis across multiple articles."""
    # Sentiment distribution with expanded categories
    sentiment_counts = {
        "Positive": 0,
        "Slightly Positive": 0,
        "Neutral": 0,
        "Slightly Negative": 0,
        "Negative": 0
    }

    for article in articles:
        if article.sentiment in sentiment_counts:
            sentiment_counts[article.sentiment] += 1
        else:
            # Fallback for any unexpected sentiment values
            sentiment_counts["Neutral"] += 1

    # Topic analysis
    all_topics = []
    for article in articles:
        all_topics.extend(article.topics)

    topic_counts = Counter(all_topics)
    common_topics = [topic for topic, count in topic_counts.most_common(10)]

    # Identify unique topics per article
    unique_topics_by_article = {}
    for i, article in enumerate(articles):
        other_articles_topics = []
        for j, other_article in enumerate(articles):
            if i != j:
                other_articles_topics.extend(other_article.topics)

        unique_topics = [topic for topic in article.topics if topic not in other_articles_topics]
        unique_topics_by_article[i] = unique_topics

    # Generate comparisons
    comparisons = []

    # If we have more than one article, generate meaningful comparisons
    if len(articles) > 1:
        for i in range(len(articles) - 1):
            for j in range(i + 1, len(articles)):
                article1 = articles[i]
                article2 = articles[j]

                # Compare sentiments - more nuanced now with new categories
                if article1.sentiment != article2.sentiment:
                    # Group sentiments for better comparison
                    sent1_group = get_sentiment_group(article1.sentiment)
                    sent2_group = get_sentiment_group(article2.sentiment)

                    if sent1_group != sent2_group:
                        comparison = {
                            "Articles": [article1.title, article2.title],
                            "Comparison": f"'{article1.title}' presents a {sent1_group.lower()} view ({article1.sentiment}), while '{article2.title}' has a {sent2_group.lower()} view ({article2.sentiment}).",
                            "Impact": "This difference in sentiment highlights varying perspectives on the company's situation."
                        }
                        comparisons.append(comparison)
                    else:
                        # Even if in same group, note the difference if one is stronger
                        if "Slightly" in article1.sentiment and "Slightly" not in article2.sentiment or \
                           "Slightly" in article2.sentiment and "Slightly" not in article1.sentiment:
                            stronger = article1 if "Slightly" not in article1.sentiment else article2
                            weaker = article2 if stronger == article1 else article1

                            comparison = {
                                "Articles": [stronger.title, weaker.title],
                                "Comparison": f"'{stronger.title}' expresses a stronger {sent1_group.lower()} sentiment ({stronger.sentiment}) than '{weaker.title}' ({weaker.sentiment}).",
                                "Impact": "The difference in intensity suggests varying degrees of confidence about the company."
                            }
                            comparisons.append(comparison)

                # Compare topics
                common_topics_between_two = set(article1.topics).intersection(set(article2.topics))
                if common_topics_between_two:
                    comparison = {
                        "Articles": [article1.title, article2.title],
                        "Comparison": f"Both articles discuss {', '.join(common_topics_between_two)}.",
                        "Impact": "The common topics indicate key areas of focus around the company."
                    }
                    comparisons.append(comparison)

                # Compare unique topics
                unique_to_article1 = set(article1.topics) - set(article2.topics)
                unique_to_article2 = set(article2.topics) - set(article1.topics)

                if unique_to_article1 and unique_to_article2:
                    comparison = {
                        "Articles": [article1.title, article2.title],
                        "Comparison": f"'{article1.title}' uniquely covers {', '.join(unique_to_article1)}, while '{article2.title}' focuses on {', '.join(unique_to_article2)}.",
                        "Impact": "Different sources emphasize varying aspects of the company, offering a broader perspective."
                    }
                    comparisons.append(comparison)
    else:
        # If we only have one article, create a dummy comparison
        if articles:
            article = articles[0]
            topics_str = ", ".join(article.topics[:3]) if article.topics else "no specific topics"
            sentiment_group = get_sentiment_group(article.sentiment)

            comparisons = [
                {
                    "Comparison": f"Only found one article: '{article.title}' with a {article.sentiment.lower()} sentiment ({sentiment_group} overall).",
                    "Impact": f"Limited coverage focused on {topics_str}. More articles would provide a more balanced view."
                },
                {
                    "Comparison": f"The article discusses {topics_str} in relation to {article.source}.",
                    "Impact": "Single source reporting limits perspective. Consider searching for additional sources."
                }
            ]

    # Generate overall sentiment analysis
    # Combine slightly positive with positive and slightly negative with negative for summary
    pos_count = sentiment_counts["Positive"] + sentiment_counts["Slightly Positive"]
    neg_count = sentiment_counts["Negative"] + sentiment_counts["Slightly Negative"]
    neu_count = sentiment_counts["Neutral"]
    total = pos_count + neg_count + neu_count

    # For display, we'll keep detailed counts but summarize the analysis text
    if total == 0:
        final_analysis = "No sentiment data available."
    else:
        pos_ratio = pos_count / total
        neg_ratio = neg_count / total

        # Show more details on the sentiment breakdown
        sentiment_detail = []
        if sentiment_counts["Positive"] > 0:
            sentiment_detail.append(f"{sentiment_counts['Positive']} strongly positive")
        if sentiment_counts["Slightly Positive"] > 0:
            sentiment_detail.append(f"{sentiment_counts['Slightly Positive']} slightly positive")
        if sentiment_counts["Neutral"] > 0:
            sentiment_detail.append(f"{sentiment_counts['Neutral']} neutral")
        if sentiment_counts["Slightly Negative"] > 0:
            sentiment_detail.append(f"{sentiment_counts['Slightly Negative']} slightly negative")
        if sentiment_counts["Negative"] > 0:
            sentiment_detail.append(f"{sentiment_counts['Negative']} strongly negative")

        sentiment_breakdown = ", ".join(sentiment_detail)

        if pos_ratio > 0.6:
            final_analysis = f"The company has primarily positive coverage ({pos_count}/{total} articles positive: {sentiment_breakdown}). This suggests a favorable market perception."
        elif neg_ratio > 0.6:
            final_analysis = f"The company has primarily negative coverage ({neg_count}/{total} articles negative: {sentiment_breakdown}). This could indicate challenges or controversies."
        elif pos_ratio > neg_ratio:
            final_analysis = f"The company has mixed coverage with a positive lean ({sentiment_breakdown})."
        elif neg_ratio > pos_ratio:
            final_analysis = f"The company has mixed coverage with a negative lean ({sentiment_breakdown})."
        else:
            final_analysis = f"The company has balanced coverage ({sentiment_breakdown})."

    # If we only have the dummy article, customize the final analysis
    if len(articles) == 1 and articles[0].url == "#":
        final_analysis = "Limited news data available. The analysis is based on a placeholder article."

    return {
        "Sentiment Distribution": sentiment_counts,
        "Common Topics": common_topics,
        "Topic Overlap": {
            "Common Topics Across All": common_topics[:5],
            "Unique Topics By Article": unique_topics_by_article
        },
        "Coverage Differences": comparisons[:10],  # Limit to top 10 comparisons
        "Final Sentiment Analysis": final_analysis
    }

def get_sentiment_group(sentiment: str) -> str:
    """Group sentiments into broader categories for comparison."""
    if sentiment in ["Positive", "Slightly Positive"]:
        return "Positive"
    elif sentiment in ["Negative", "Slightly Negative"]:
        return "Negative"
    else:
        return "Neutral"

def translate_to_hindi(text: str) -> str:
    """Translate text to Hindi using deep_translator."""
    try:
        # Split text into chunks if too long (Google Translator has a limit)
        max_chunk_size = 4500  # deep_translator's GoogleTranslator has a limit of 5000 chars
        chunks = [text[i:i+max_chunk_size] for i in range(0, len(text), max_chunk_size)]

        translated_chunks = []
        for chunk in chunks:
            # Translate the chunk
            translated = translator.translate(chunk)
            translated_chunks.append(translated)
            time.sleep(0.5)  # Short delay to avoid rate limiting

        return ''.join(translated_chunks)
    except Exception as e:
        print(f"Translation error: {str(e)}")
        # Fallback to simple placeholder for Hindi text if translation fails
        return "अनुवाद त्रुटि हुई।"  # "Translation error occurred" in Hindi

def text_to_speech(text: str, output_file: str = 'output.mp3') -> str:
    """Convert text to speech in Hindi."""
    try:
        # Ensure output directory exists
        output_dir = os.path.dirname(output_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
            print(f"Ensuring output directory exists: {output_dir}")

        # If text is too short, add some padding to avoid TTS errors
        if len(text.strip()) < 5:
            text = text + " " + "नमस्कार" * 3  # Add some padding text
            print("Text was too short, adding padding")

        print(f"Attempting to generate TTS for text of length {len(text)} characters")

        # For long texts, split into chunks for better TTS quality
        if len(text) > 3000:
            print("Text is long, splitting into chunks for better TTS quality")

            # Split at sentence boundaries
            sentences = re.split(r'(।|\.|\?|\!)', text)
            chunks = []
            current_chunk = ""

            # Combine sentences into chunks of appropriate size
            for i in range(0, len(sentences), 2):
                if i+1 < len(sentences):  # Make sure we have the punctuation part
                    sentence = sentences[i] + sentences[i+1]
                else:
                    sentence = sentences[i]

                if len(current_chunk) + len(sentence) < 3000:
                    current_chunk += sentence
                else:
                    if current_chunk:
                        chunks.append(current_chunk)
                    current_chunk = sentence

            if current_chunk:  # Add the last chunk
                chunks.append(current_chunk)

            print(f"Split text into {len(chunks)} chunks for TTS processing")

            # Process each chunk and combine into one audio file
            temp_files = []
            for i, chunk in enumerate(chunks):
                temp_output = f"{output_file}.part{i}.mp3"
                try:
                    # Try gTTS for each chunk
                    tts = gTTS(text=chunk, lang='hi', slow=False)
                    tts.save(temp_output)
                    if os.path.exists(temp_output) and os.path.getsize(temp_output) > 0:
                        temp_files.append(temp_output)
                    else:
                        print(f"Failed to create chunk {i} with gTTS")
                        raise Exception(f"gTTS failed for chunk {i}")
                except Exception as e:
                    print(f"Error with gTTS for chunk {i}: {str(e)}")
                    break

            # If we have temp files, combine them
            if temp_files:
                try:
                    # Use pydub to concatenate audio files
                    from pydub import AudioSegment
                    combined = AudioSegment.empty()
                    for temp_file in temp_files:
                        audio = AudioSegment.from_mp3(temp_file)
                        combined += audio

                    combined.export(output_file, format="mp3")

                    # Clean up temp files
                    for temp_file in temp_files:
                        try:
                            os.remove(temp_file)
                        except:
                            pass

                    print(f"Successfully combined {len(temp_files)} audio chunks into {output_file}")
                    return output_file
                except Exception as e:
                    print(f"Error combining audio files: {str(e)}")
                    # Try to return the first chunk at least
                    if os.path.exists(temp_files[0]):
                        import shutil
                        shutil.copy(temp_files[0], output_file)
                        print(f"Returning first chunk as fallback: {output_file}")
                        return output_file

        # Method 1: Use gTTS for Hindi text-to-speech (for shorter texts or if chunking failed)
        try:
            print("Trying to use gTTS...")
            tts = gTTS(text=text, lang='hi', slow=False)
            tts.save(output_file)

            # Verify the file was created and is not empty
            if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
                print(f"Successfully created audio file with gTTS: {output_file} (size: {os.path.getsize(output_file)} bytes)")
                return output_file
            else:
                print(f"gTTS created a file but it may be empty or invalid: {output_file}")
                raise Exception("Generated audio file is empty or invalid")

        except Exception as e:
            print(f"gTTS error: {str(e)}")

            # Method 2: Fallback to pyttsx3
            try:
                print("Falling back to pyttsx3...")
                engine = pyttsx3.init()
                # Try to find a Hindi voice, or use default
                voices = engine.getProperty('voices')
                found_hindi_voice = False

                for voice in voices:
                    print(f"Checking voice: {voice.name}")
                    if 'hindi' in voice.name.lower():
                        print(f"Found Hindi voice: {voice.name}")
                        engine.setProperty('voice', voice.id)
                        found_hindi_voice = True
                        break

                if not found_hindi_voice:
                    print("No Hindi voice found, using default voice")

                engine.save_to_file(text, output_file)
                engine.runAndWait()

                # Verify the file was created and is not empty
                if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
                    print(f"Successfully created audio file with pyttsx3: {output_file} (size: {os.path.getsize(output_file)} bytes)")
                    return output_file
                else:
                    print(f"pyttsx3 created a file but it may be empty or invalid: {output_file}")
                    raise Exception("Generated audio file is empty or invalid")

            except Exception as e2:
                print(f"pyttsx3 error: {str(e2)}")

                # If all TTS methods fail, create a simple notification sound as fallback
                try:
                    print("Both TTS methods failed. Creating a simple audio notification instead.")
                    # Generate a simple beep sound as a fallback (1 second, 440Hz)
                    import numpy as np
                    from scipy.io import wavfile

                    sample_rate = 44100
                    duration = 1  # seconds
                    t = np.linspace(0, duration, int(sample_rate * duration))

                    # Generate a simple tone
                    frequency = 440  # Hz (A4 note)
                    data = np.sin(2 * np.pi * frequency * t) * 32767
                    data = data.astype(np.int16)

                    # Convert output_file from mp3 to wav
                    wav_output_file = output_file.replace('.mp3', '.wav')
                    wavfile.write(wav_output_file, sample_rate, data)

                    print(f"Created simple audio notification: {wav_output_file}")
                    return wav_output_file

                except Exception as e3:
                    print(f"Failed to create fallback audio: {str(e3)}")
                    return ""

            return ""
    except Exception as e:
        print(f"TTS error: {str(e)}")
        return ""

def prepare_final_report(company_name: str, articles: List[NewsArticle],
                         comparative_analysis: Dict[str, Any]) -> Dict[str, Any]:
    """Prepare final report in the required format."""
    article_data = []

    for article in articles:
        article_data.append({
            "Title": article.title,
            "Summary": article.summary,
            "Sentiment": article.sentiment,
            "Topics": article.topics
        })

    # Prepare a more detailed summary for TTS with actual content from articles
    summary_text = f"{company_name} के बारे में समाचार विश्लेषण। "

    # Add information about the number of articles found
    summary_text += f"कुल {len(articles)} लेख मिले। "

    # Add sentiment distribution
    sentiment_counts = comparative_analysis["Sentiment Distribution"]
    pos_count = sentiment_counts["Positive"] + sentiment_counts["Slightly Positive"]
    neg_count = sentiment_counts["Negative"] + sentiment_counts["Slightly Negative"]
    neu_count = sentiment_counts["Neutral"]

    if pos_count > 0 or neg_count > 0 or neu_count > 0:
        sentiment_detail = []
        if sentiment_counts["Positive"] > 0:
            sentiment_detail.append(f"{sentiment_counts['Positive']} पूर्ण सकारात्मक")
        if sentiment_counts["Slightly Positive"] > 0:
            sentiment_detail.append(f"{sentiment_counts['Slightly Positive']} हल्का सकारात्मक")
        if sentiment_counts["Neutral"] > 0:
            sentiment_detail.append(f"{sentiment_counts['Neutral']} तटस्थ")
        if sentiment_counts["Slightly Negative"] > 0:
            sentiment_detail.append(f"{sentiment_counts['Slightly Negative']} हल्का नकारात्मक")
        if sentiment_counts["Negative"] > 0:
            sentiment_detail.append(f"{sentiment_counts['Negative']} पूर्ण नकारात्मक")

        summary_text += f"भावना विश्लेषण: {', '.join(sentiment_detail)}। "

    # Add common topics with more detail
    common_topics = comparative_analysis["Common Topics"][:5]
    if common_topics:
        summary_text += f"मुख्य विषय हैं: {', '.join(common_topics)}। "

        # Add more context about the common topics
        summary_text += "इन विषयों के बारे में लेखों में यह कहा गया है: "

        # Find sentences related to common topics in the articles
        topic_sentences = []
        for topic in common_topics[:3]:  # Focus on top 3 topics
            found = False
            for article in articles:
                if topic in article.content.lower():
                    # Find sentences containing this topic
                    sentences = sent_tokenize(article.content)
                    for sentence in sentences:
                        if topic in sentence.lower() and len(sentence) < 150:
                            topic_sentences.append(f"{topic} के बारे में: {sentence}")
                            found = True
                            break
                if found:
                    break

        if topic_sentences:
            summary_text += " ".join(topic_sentences[:3]) + " "

    # Add article summaries
    summary_text += "लेखों का सारांश: "
    for i, article in enumerate(articles[:3]):  # Include up to 3 articles
        summary_text += f"लेख {i+1}: {article.title}. {article.summary[:200]}... "

        # Add sentiment for this specific article
        summary_text += f"इस लेख का भावना: {article.sentiment}. "

    # Add final sentiment analysis
    summary_text += comparative_analysis["Final Sentiment Analysis"]

    # Translate the detailed summary to Hindi
    hindi_summary = translate_to_hindi(summary_text)

    # Format the response according to the required format
    return {
        "Company": company_name,
        "Articles": article_data,
        "Comparative Sentiment Score": comparative_analysis,
        "Final Sentiment Analysis": comparative_analysis["Final Sentiment Analysis"],
        "Hindi Summary": hindi_summary
    }
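Taken together, the helpers in utils.py form one pipeline: scrape articles, compare sentiment and topics, build the report, then speak the Hindi summary. A minimal usage sketch (this driver is hypothetical and not part of the upload; the company name and output path are placeholders, and scraping results will vary):

```python
# pipeline_demo.py - hypothetical driver chaining the utils.py helpers end to end
from utils import (search_news, perform_comparative_analysis,
                   prepare_final_report, text_to_speech)

company = "Tesla"  # placeholder company name

# 1. Scrape and wrap articles (falls back to a placeholder article if nothing is found)
articles = search_news(company, num_articles=5)

# 2. Compare sentiment and topics across the collected articles
comparative = perform_comparative_analysis(articles)

# 3. Build the structured report, including the Hindi summary used for TTS
report = prepare_final_report(company, articles, comparative)
print(report["Final Sentiment Analysis"])

# 4. Speak the Hindi summary; returns the generated audio path, or "" on failure
audio_path = text_to_speech(report["Hindi Summary"], output_file="audio_files/summary.mp3")
print("Audio written to:", audio_path or "<no audio generated>")
```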