BSJ2004 committed
Commit 47af8ed · verified · 1 Parent(s): 565e727

Upload 11 files

Files changed (11)
  1. .gitattributes +1 -35
  2. .gitignore +50 -0
  3. Dockerfile +49 -0
  4. README.md +143 -13
  5. Spacefile +8 -0
  6. api.py +332 -0
  7. app.py +496 -0
  8. generate_json_output.py +55 -0
  9. healthcheck.py +171 -0
  10. requirements.txt +39 -0
  11. utils.py +1132 -0
.gitattributes CHANGED
@@ -1,35 +1 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.map filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,50 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual Environment
24
+ venv/
25
+ ENV/
26
+ env/
27
+ .env
28
+
29
+ # IDE
30
+ .idea/
31
+ .vscode/
32
+ *.swp
33
+ *.swo
34
+ .DS_Store
35
+
36
+ # Logs
37
+ logs/
38
+ *.log
39
+
40
+ # Audio files
41
+ audio_files/
42
+ *.mp3
43
+ *.wav
44
+
45
+ # Jupyter
46
+ .ipynb_checkpoints
47
+
48
+ # Model caches
49
+ .cache/
50
+ .local/
Dockerfile ADDED
@@ -0,0 +1,49 @@
1
+ FROM python:3.9-slim
2
+
3
+ WORKDIR /app
4
+
5
+ # Install dependencies
6
+ COPY requirements.txt .
7
+ RUN pip install --no-cache-dir -r requirements.txt
8
+
9
+ # Install additional dependencies needed for NLP tasks and TTS
10
+ RUN apt-get update && apt-get install -y \
11
+ build-essential \
12
+ curl \
13
+ software-properties-common \
14
+ git \
15
+ ffmpeg \
16
+ espeak \
17
+ libespeak-dev \
18
+ alsa-utils \
19
+ python3-pyaudio \
20
+ libasound2-dev \
21
+ && rm -rf /var/lib/apt/lists/*
22
+
23
+ # Copy app files
24
+ COPY . .
25
+
26
+ # Create directory for audio files
27
+ RUN mkdir -p audio_files
28
+
29
+ # Set environment variables
30
+ ENV PYTHONDONTWRITEBYTECODE=1
31
+ ENV PYTHONUNBUFFERED=1
32
+
33
+ # Download NLTK data
34
+ RUN python -c "import nltk; nltk.download('punkt'); nltk.download('stopwords')"
35
+
36
+ # Expose ports
37
+ EXPOSE 8000
38
+ EXPOSE 8501
39
+
40
+ # Create a shell script to run both services
41
+ RUN echo '#!/bin/bash\n\
42
+ uvicorn api:app --host 0.0.0.0 --port 8000 &\n\
43
+ streamlit run app.py --server.port 8501 --server.address 0.0.0.0\n'\
44
+ > /app/start.sh
45
+
46
+ RUN chmod +x /app/start.sh
47
+
48
+ # Start the application
49
+ CMD ["/app/start.sh"]
README.md CHANGED
@@ -1,13 +1,143 @@
1
- ---
2
- title: Text1123
3
- emoji: 🏢
4
- colorFrom: purple
5
- colorTo: indigo
6
- sdk: streamlit
7
- sdk_version: 1.43.2
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
+ # News Summarization and Text-to-Speech Application
2
+
3
+ A web-based application that extracts news articles related to companies, performs sentiment analysis, conducts comparative analysis, and generates a text-to-speech output in Hindi.
4
+
5
+ ## Features
6
+
7
+ - **News Extraction**: Scrapes at least 10 unique news articles about a given company using BeautifulSoup
8
+ - **Sentiment Analysis**: Analyzes the sentiment of each article (positive, negative, neutral)
9
+ - **Comparative Analysis**: Compares sentiment across articles to derive insights
10
+ - **Text-to-Speech**: Converts summarized content to Hindi speech
11
+ - **User Interface**: Simple web interface built with Streamlit
12
+ - **API Communication**: Backend and frontend communicate through APIs
13
+
14
+ ## Project Structure
15
+
16
+ ```
17
+ .
18
+ ├── app.py # Main Streamlit application
19
+ ├── api.py # API endpoints
20
+ ├── utils.py # Utility functions for scraping, sentiment analysis, etc.
21
+ ├── healthcheck.py # Script to verify all dependencies and services
22
+ ├── requirements.txt # Project dependencies
23
+ ├── Dockerfile # Docker configuration for deployment
24
+ ├── Spacefile # Hugging Face Spaces configuration
25
+ └── README.md # Project documentation
26
+ ```
27
+
28
+ ## Setup Instructions
29
+
30
+ 1. **Clone the repository**:
31
+ ```
32
+ git clone https://github.com/yourusername/news-summarization-tts.git
33
+ cd news-summarization-tts
34
+ ```
35
+
36
+ 2. **Create a virtual environment** (recommended):
37
+ ```
38
+ python -m venv venv
39
+ source venv/bin/activate # On Windows: venv\Scripts\activate
40
+ ```
41
+
42
+ 3. **Install dependencies**:
43
+ ```
44
+ pip install -r requirements.txt
45
+ ```
46
+
47
+ 4. **Install system dependencies** (for text-to-speech functionality):
48
+ - On Ubuntu/Debian:
49
+ ```
50
+ sudo apt-get install espeak ffmpeg
51
+ ```
52
+ - On Windows:
53
+ Download and install espeak from http://espeak.sourceforge.net/download.html
54
+
55
+ 5. **Run the healthcheck** (to verify all dependencies are working):
56
+ ```
57
+ python healthcheck.py
58
+ ```
59
+
60
+ 6. **Run the API server**:
61
+ ```
62
+ uvicorn api:app --reload
63
+ ```
64
+
65
+ 7. **Run the Streamlit application** (in a separate terminal):
66
+ ```
67
+ streamlit run app.py
68
+ ```
69
+
70
+ ## Models Used
71
+
72
+ - **News Summarization**: Extractive summarization using NLTK and NetworkX
73
+ - **Sentiment Analysis**: VADER for sentiment analysis and Hugging Face Transformers
74
+ - **Translation**: Google Translate API via deep-translator library
75
+ - **Text-to-Speech**: Google Text-to-Speech (gTTS) and pyttsx3 as fallback for Hindi conversion
76
+
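For readers curious how the extractive summarizer works, here is a minimal sketch of the approach named above (NLTK sentence tokenization, a cosine-similarity matrix, and NetworkX PageRank). The function and variable names are illustrative only and are not the ones used in `utils.py`; it assumes the NLTK `punkt` and `stopwords` data have already been downloaded.

```python
import numpy as np
import networkx as nx
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

def sentence_similarity(s1, s2, stop_words):
    """Bag-of-words cosine similarity between two sentences."""
    w1 = [w.lower() for w in word_tokenize(s1) if w.lower() not in stop_words]
    w2 = [w.lower() for w in word_tokenize(s2) if w.lower() not in stop_words]
    vocab = list(set(w1 + w2))
    v1 = np.array([w1.count(w) for w in vocab], dtype=float)
    v2 = np.array([w2.count(w) for w in vocab], dtype=float)
    if not v1.any() or not v2.any():
        return 0.0
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))

def summarize(text, num_sentences=3):
    """Return the top-ranked sentences of `text`, kept in original order."""
    stop_words = set(stopwords.words("english"))
    sentences = sent_tokenize(text)
    if len(sentences) <= num_sentences:
        return text
    # Pairwise similarity matrix over sentences.
    sim = np.zeros((len(sentences), len(sentences)))
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i != j:
                sim[i][j] = sentence_similarity(sentences[i], sentences[j], stop_words)
    # Rank sentences with PageRank over the similarity graph.
    scores = nx.pagerank(nx.from_numpy_array(sim))
    top = sorted(range(len(sentences)), key=lambda i: scores[i], reverse=True)[:num_sentences]
    return " ".join(sentences[i] for i in sorted(top))
```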
77
+ ## API Documentation
78
+
79
+ ### Endpoints
80
+
81
+ - `POST /api/get_news`: Fetches news articles about a company
82
+ - Request body: `{"company_name": "Tesla"}`
83
+ - Returns a list of articles with metadata
84
+
85
+ - `POST /api/analyze_sentiment`: Performs sentiment analysis on articles
86
+ - Request body: `{"articles": [article_list]}`
87
+ - Returns sentiment analysis for each article
88
+
89
+ - `POST /api/generate_speech`: Converts text to Hindi speech
90
+ - Request body: `{"text": "summarized_text"}`
91
+ - Returns a URL to the generated audio file
92
+
93
+ - `POST /api/complete_analysis`: Performs complete analysis including fetching news, sentiment analysis, and generating speech
94
+ - Request body: `{"company_name": "Tesla"}`
95
+ - Returns complete analysis results
96
+
97
+ ## Assumptions & Limitations
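As a quick reference, a call against the complete-analysis endpoint might look like the sketch below (assuming the API server from `api.py` is running locally on port 8000, as in the setup steps above; the company name is only an example):

```python
import requests

# Hypothetical client call; the endpoint and payload follow the documentation above.
resp = requests.post(
    "http://localhost:8000/api/complete_analysis",
    json={"company_name": "Tesla"},
    timeout=300,  # the first run can be slow while transformer models download
)
resp.raise_for_status()
report = resp.json()
print(report["Final Sentiment Analysis"])
```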
98
+
99
+ - The application scrapes publicly available news articles that don't require JavaScript rendering
100
+ - Sentiment analysis accuracy depends on the model used and may not capture context-specific nuances
101
+ - Hindi translation and TTS quality may vary based on technical terms
102
+ - The application requires an internet connection to fetch news articles and use cloud-based services
103
+
104
+ ## Troubleshooting
105
+
106
+ If you encounter any issues:
107
+
108
+ 1. Run the healthcheck script to verify all dependencies are working:
109
+ ```
110
+ python healthcheck.py
111
+ ```
112
+
113
+ 2. Check that you have all the required system dependencies installed (espeak, ffmpeg).
114
+
115
+ 3. If you encounter issues with specific components:
116
+ - Translation service requires an internet connection
117
+ - Text-to-speech uses gTTS by default, but falls back to pyttsx3 if needed
118
+ - Transformer models may take time to download on first run
119
+
120
+ ## Deployment
121
+
122
+ This application is deployed on Hugging Face Spaces: [Link to deployment]
123
+
124
+ ### Using Docker
125
+
126
+ You can also run the application using Docker:
127
+
128
+ ```
129
+ docker build -t news-summarization-tts .
130
+ docker run -p 8501:8501 -p 8000:8000 news-summarization-tts
131
+ ```
132
+
133
+ ## Future Improvements
134
+
135
+ - Add support for more languages
136
+ - Implement advanced NLP techniques for better summarization
137
+ - Improve the user interface with more interactive visualizations
138
+ - Add historical data analysis for tracking sentiment over time
139
+ - Enhance TTS quality with dedicated Hindi speech models
140
+
141
+ ## License
142
+
143
+ MIT
Spacefile ADDED
@@ -0,0 +1,8 @@
1
+ # Spacefile Docs: https://huggingface.co/docs/hub/spaces-config-reference
2
+ title: News Summarization and TTS
3
+ emoji: 📰
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: docker
7
+ app_port: 8501
8
+ pinned: false
api.py ADDED
@@ -0,0 +1,332 @@
1
+ from fastapi import FastAPI, HTTPException, Response, File, UploadFile, Form
2
+ from fastapi.responses import FileResponse, JSONResponse
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ from pydantic import BaseModel
5
+ from typing import List, Dict, Any, Optional
6
+ import os
7
+ import json
8
+ import uuid
9
+ import asyncio
10
+ import uvicorn
11
+ from utils import (search_news, analyze_article_sentiment, perform_comparative_analysis,
12
+ translate_to_hindi, text_to_speech, prepare_final_report, NewsArticle)
13
+
14
+ # Initialize FastAPI app
15
+ app = FastAPI(
16
+ title="News Summarization and TTS API",
17
+ description="API for extracting news, performing sentiment analysis, and generating Hindi TTS audio",
18
+ version="1.0.0"
19
+ )
20
+
21
+ # Add CORS middleware
22
+ app.add_middleware(
23
+ CORSMiddleware,
24
+ allow_origins=["*"], # Allow all origins
25
+ allow_credentials=True,
26
+ allow_methods=["*"], # Allow all methods
27
+ allow_headers=["*"], # Allow all headers
28
+ )
29
+
30
+ # Define request/response models
31
+ class CompanyRequest(BaseModel):
32
+ company_name: str
33
+
34
+ class TextToSpeechRequest(BaseModel):
35
+ text: str
36
+ output_filename: Optional[str] = None
37
+
38
+ class SentimentAnalysisRequest(BaseModel):
39
+ articles: List[Dict[str, Any]]
40
+
41
+ class NewsResponse(BaseModel):
42
+ articles: List[Dict[str, Any]]
43
+
44
+ class SentimentResponse(BaseModel):
45
+ sentiment_analysis: Dict[str, Any]
46
+
47
+ class TextToSpeechResponse(BaseModel):
48
+ audio_file: str
49
+ text: str
50
+
51
+ # Create a directory for audio files if it doesn't exist
52
+ os.makedirs("audio_files", exist_ok=True)
53
+
54
+ # API endpoints
55
+ @app.get("/")
56
+ async def root():
57
+ """Root endpoint to check if API is running."""
58
+ return {"message": "News Summarization and TTS API is running"}
59
+
60
+ @app.post("/api/get_news", response_model=NewsResponse)
61
+ async def get_news(request: CompanyRequest):
62
+ """Fetch news articles about a specific company."""
63
+ try:
64
+ company_name = request.company_name
65
+ articles = search_news(company_name)
66
+
67
+ if not articles:
68
+ raise HTTPException(status_code=404, detail=f"No news articles found for {company_name}")
69
+
70
+ # Convert NewsArticle objects to dictionaries
71
+ article_data = [article.to_dict() for article in articles]
72
+
73
+ return {"articles": article_data}
74
+
75
+ except Exception as e:
76
+ raise HTTPException(status_code=500, detail=str(e))
77
+
78
+ @app.post("/api/analyze_sentiment", response_model=SentimentResponse)
79
+ async def analyze_sentiment(request: SentimentAnalysisRequest):
80
+ """Analyze sentiment of provided articles."""
81
+ try:
82
+ # Convert dictionaries back to NewsArticle objects
83
+ articles = []
84
+ for article_dict in request.articles:
85
+ article = NewsArticle(
86
+ title=article_dict["title"],
87
+ url=article_dict["url"],
88
+ content=article_dict["content"],
89
+ summary=article_dict.get("summary", ""),
90
+ source=article_dict.get("source", ""),
91
+ date=article_dict.get("date", ""),
92
+ sentiment=article_dict.get("sentiment", ""),
93
+ topics=article_dict.get("topics", [])
94
+ )
95
+ articles.append(article)
96
+
97
+ # Perform detailed sentiment analysis for each article
98
+ detailed_sentiment = [analyze_article_sentiment(article) for article in articles]
99
+
100
+ # Perform comparative analysis
101
+ comparative_analysis = perform_comparative_analysis(articles)
102
+
103
+ return {
104
+ "sentiment_analysis": {
105
+ "detailed_sentiment": detailed_sentiment,
106
+ "comparative_analysis": comparative_analysis
107
+ }
108
+ }
109
+
110
+ except Exception as e:
111
+ raise HTTPException(status_code=500, detail=str(e))
112
+
113
+ @app.post("/api/generate_speech", response_model=TextToSpeechResponse)
114
+ async def generate_speech(request: TextToSpeechRequest):
115
+ """Convert text to Hindi speech."""
116
+ try:
117
+ text = request.text
118
+
119
+ # Generate a unique filename if not provided
120
+ output_filename = request.output_filename
121
+ if not output_filename:
122
+ unique_id = uuid.uuid4().hex
123
+ output_filename = f"audio_files/{unique_id}.mp3"
124
+ elif not output_filename.startswith("audio_files/"):
125
+ output_filename = f"audio_files/{output_filename}"
126
+
127
+ # Translate text to Hindi
128
+ hindi_text = translate_to_hindi(text)
129
+
130
+ # Convert text to speech
131
+ audio_file = text_to_speech(hindi_text, output_filename)
132
+
133
+ if not audio_file:
134
+ raise HTTPException(status_code=500, detail="Failed to generate audio file")
135
+
136
+ return {
137
+ "audio_file": audio_file,
138
+ "text": hindi_text
139
+ }
140
+
141
+ except Exception as e:
142
+ raise HTTPException(status_code=500, detail=str(e))
143
+
144
+ @app.post("/api/complete_analysis")
145
+ async def complete_analysis(request: CompanyRequest):
146
+ """Perform complete analysis for a company."""
147
+ try:
148
+ company_name = request.company_name
149
+
150
+ # Log the start of analysis
151
+ print(f"Starting complete analysis for company: {company_name}")
152
+
153
+ # Step 1: Get news articles
154
+ print("Step 1: Fetching news articles...")
155
+ articles = search_news(company_name, num_articles=5) # Increased from default 3 to 5
156
+ print(f"Found {len(articles)} articles for {company_name}")
157
+
158
+ if not articles:
159
+ raise HTTPException(status_code=404, detail=f"No news articles found for {company_name}")
160
+
161
+ # Step 2: Perform comparative analysis
162
+ print("Step 2: Performing comparative analysis...")
163
+ comparative_analysis = perform_comparative_analysis(articles)
164
+ print("Comparative analysis completed")
165
+
166
+ # Step 3: Prepare final report
167
+ print("Step 3: Preparing final report...")
168
+ final_report = prepare_final_report(company_name, articles, comparative_analysis)
169
+ print("Final report prepared")
170
+
171
+ # Step 4: Generate Hindi TTS
172
+ print("Step 4: Generating Hindi TTS...")
173
+ unique_id = uuid.uuid4().hex
174
+ output_filename = f"audio_files/{unique_id}.mp3"
175
+
176
+ # Use the Hindi summary for TTS
177
+ hindi_text = final_report["Hindi Summary"]
178
+ print(f"Converting Hindi text to speech (length: {len(hindi_text)} characters)")
179
+
180
+ audio_file = text_to_speech(hindi_text, output_filename)
181
+
182
+ # Format the response to match the example output exactly
183
+ formatted_response = {
184
+ "Company": company_name,
185
+ "Articles": final_report["Articles"],
186
+ "Comparative Sentiment Score": {
187
+ "Sentiment Distribution": comparative_analysis["Sentiment Distribution"],
188
+ "Coverage Differences": comparative_analysis["Coverage Differences"],
189
+ "Topic Overlap": {
190
+ "Common Topics": comparative_analysis["Topic Overlap"]["Common Topics Across All"],
191
+ }
192
+ },
193
+ "Final Sentiment Analysis": comparative_analysis["Final Sentiment Analysis"],
194
+ }
195
+
196
+ # Format the unique topics by article to match the expected output exactly
197
+ unique_topics = comparative_analysis["Topic Overlap"]["Unique Topics By Article"]
198
+ for article_idx, topics in unique_topics.items():
199
+ article_num = int(article_idx) + 1
200
+ formatted_response["Comparative Sentiment Score"]["Topic Overlap"][f"Unique Topics in Article {article_num}"] = topics
201
+
202
+ # If we don't have more than 1 article, create some example comparisons to match format
203
+ if len(articles) <= 1:
204
+ formatted_response["Comparative Sentiment Score"]["Coverage Differences"] = [
205
+ {
206
+ "Comparison": f"Only one article about {company_name} was found, limiting comparative analysis.",
207
+ "Impact": "Unable to compare coverage across multiple sources for more comprehensive insights."
208
+ }
209
+ ]
210
+
211
+ # Add audio information
212
+ if not audio_file:
213
+ print("Warning: Failed to generate audio file")
214
+ formatted_response["Audio"] = "Failed to generate audio"
215
+ else:
216
+ print(f"Audio file generated: {audio_file}")
217
+ formatted_response["Audio"] = f"[Play Hindi Speech]"
218
+ # Store the actual audio file path in a hidden field
219
+ formatted_response["_audio_file_path"] = audio_file
220
+
221
+ # Add the Hindi Summary to the response as well (needed for rendering in Streamlit)
222
+ formatted_response["Hindi Summary"] = final_report["Hindi Summary"]
223
+
224
+ print("Complete analysis finished successfully")
225
+ return formatted_response
226
+
227
+ except HTTPException as he:
228
+ # Re-raise HTTP exceptions
229
+ print(f"HTTP Exception: {he.detail}")
230
+ raise
231
+
232
+ except Exception as e:
233
+ # For any other exception, provide detailed error information
234
+ import traceback
235
+ error_trace = traceback.format_exc()
236
+ error_message = f"Error processing request: {str(e)}"
237
+ print(f"ERROR: {error_message}")
238
+ print(f"Traceback: {error_trace}")
239
+
240
+ # Return a more user-friendly error message
241
+ user_message = "An error occurred during analysis. "
242
+
243
+ if "timeout" in str(e).lower():
244
+ user_message += "There was a timeout when connecting to news sources. Please try again or try another company name."
245
+ elif "connection" in str(e).lower():
246
+ user_message += "There was a connection issue with one of the news sources. Please check your internet connection."
247
+ elif "not found" in str(e).lower():
248
+ user_message += f"No information could be found for {company_name}. Please try another company name."
249
+ else:
250
+ user_message += "Please try again with a different company name or check the server logs for more details."
251
+
252
+ raise HTTPException(status_code=500, detail=user_message)
253
+
254
+ @app.get("/api/audio/{file_name}")
255
+ async def get_audio(file_name: str):
256
+ """Serve audio files."""
257
+ file_path = f"audio_files/{file_name}"
258
+
259
+ # Make sure the audio_files directory exists
260
+ os.makedirs("audio_files", exist_ok=True)
261
+
262
+ if not os.path.exists(file_path):
263
+ print(f"Audio file not found: {file_path}")
264
+ # Check if any audio files exist in the directory
265
+ audio_files = os.listdir("audio_files") if os.path.exists("audio_files") else []
266
+ print(f"Available audio files: {audio_files}")
267
+ raise HTTPException(status_code=404, detail=f"Audio file {file_name} not found")
268
+
269
+ try:
270
+ # Verify the file can be opened and is not corrupt
271
+ with open(file_path, "rb") as f:
272
+ file_size = os.path.getsize(file_path)
273
+ print(f"Serving audio file: {file_path} (size: {file_size} bytes)")
274
+ if file_size == 0:
275
+ raise HTTPException(status_code=500, detail="Audio file is empty")
276
+ except Exception as e:
277
+ print(f"Error accessing audio file {file_path}: {str(e)}")
278
+ raise HTTPException(status_code=500, detail=f"Error accessing audio file: {str(e)}")
279
+
280
+ # Set appropriate headers for audio file
281
+ headers = {
282
+ "Cache-Control": "no-cache, no-store, must-revalidate",
283
+ "Pragma": "no-cache",
284
+ "Expires": "0",
285
+ "Content-Disposition": f"attachment; filename={file_name}"
286
+ }
287
+
288
+ # Determine the correct media type based on file extension
289
+ media_type = "audio/mpeg"
290
+ if file_name.lower().endswith(".wav"):
291
+ media_type = "audio/wav"
292
+
293
+ return FileResponse(
294
+ path=file_path,
295
+ media_type=media_type,
296
+ headers=headers,
297
+ filename=file_name
298
+ )
299
+
300
+ @app.post("/api/example_format")
301
+ async def get_example_format(request: CompanyRequest):
302
+ """
303
+ Get analysis results in the example format specified.
304
+ This endpoint provides results that exactly match the requested output format.
305
+ """
306
+ try:
307
+ # Get the base analysis
308
+ company_name = request.company_name
309
+ result = await complete_analysis(request)
310
+
311
+ # Format it to match the example output
312
+ formatted_output = {
313
+ "Company": result["Company"],
314
+ "Articles": result["Articles"],
315
+ "Comparative Sentiment Score": {
316
+ "Sentiment Distribution": result["Comparative Sentiment Score"]["Sentiment Distribution"],
317
+ "Coverage Differences": result["Comparative Sentiment Score"]["Coverage Differences"],
318
+ "Topic Overlap": result["Comparative Sentiment Score"]["Topic Overlap"]
319
+ },
320
+ "Final Sentiment Analysis": result["Final Sentiment Analysis"],
321
+ "Audio": "[Play Hindi Speech]" if result.get("Audio") else "No audio available"
322
+ }
323
+
324
+ return formatted_output
325
+
326
+ except HTTPException:
327
+ raise
328
+ except Exception as e:
329
+ raise HTTPException(status_code=500, detail=f"Error generating example format: {str(e)}")
330
+
331
+ if __name__ == "__main__":
332
+ uvicorn.run("api:app", host="0.0.0.0", port=8000, reload=True)
app.py ADDED
@@ -0,0 +1,496 @@
1
+ import streamlit as st
2
+ import requests
3
+ import pandas as pd
4
+ import json
5
+ import os
6
+ import matplotlib.pyplot as plt
7
+ import seaborn as sns
8
+ import base64
9
+ from io import BytesIO
10
+ from PIL import Image, ImageEnhance
11
+ import time
12
+ from typing import Dict, Any, List
13
+
14
+ # API Base URL - Change this to match your deployment
15
+ API_BASE_URL = "http://localhost:8000"
16
+
17
+ # New function to generate the example output format
18
+ def generate_example_output(company_name: str) -> str:
19
+ """
20
+ Generate output in the example format for the given company.
21
+ Returns the formatted JSON as a string.
22
+ """
23
+ try:
24
+ # Make API request to get the analysis data
25
+ url = f"{API_BASE_URL}/api/complete_analysis"
26
+ response = requests.post(url, json={"company_name": company_name})
27
+ response.raise_for_status()
28
+ data = response.json()
29
+
30
+ # Format the data to match the example output format exactly
31
+ formatted_output = {
32
+ "Company": data["Company"],
33
+ "Articles": data["Articles"],
34
+ "Comparative Sentiment Score": {
35
+ "Sentiment Distribution": data["Comparative Sentiment Score"]["Sentiment Distribution"],
36
+ "Coverage Differences": data["Comparative Sentiment Score"]["Coverage Differences"],
37
+ "Topic Overlap": data["Comparative Sentiment Score"]["Topic Overlap"]
38
+ },
39
+ "Final Sentiment Analysis": data["Final Sentiment Analysis"],
40
+ "Audio": "[Play Hindi Speech]" if data.get("Audio") else "No audio available"
41
+ }
42
+
43
+ # Convert to JSON string with proper formatting
44
+ return json.dumps(formatted_output, indent=2)
45
+
46
+ except Exception as e:
47
+ return json.dumps({
48
+ "error": str(e),
49
+ "message": "Failed to generate example output"
50
+ }, indent=2)
51
+
52
+ # Function to run in terminal mode
53
+ def run_terminal_mode():
54
+ """Run the app in terminal mode to output JSON"""
55
+ print("News Analysis Terminal Mode")
56
+ company_name = input("Enter company name: ")
57
+ print(f"Analyzing {company_name}...")
58
+ output = generate_example_output(company_name)
59
+ print(output)
60
+
61
+ # Check if run directly or imported
62
+ if __name__ == "__main__":
63
+ # Check if terminal mode is requested via command line args
64
+ import sys
65
+ if len(sys.argv) > 1 and sys.argv[1] == "--terminal":
66
+ run_terminal_mode()
67
+ else:
68
+ # Continue with the Streamlit app
69
+
70
+ # App title and description
71
+ st.set_page_config(
72
+ page_title="News Summarization & TTS",
73
+ page_icon="📰",
74
+ layout="wide",
75
+ initial_sidebar_state="expanded"
76
+ )
77
+
78
+ # Custom CSS for better UI
79
+ st.markdown("""
80
+ <style>
81
+ .main-header {
82
+ font-size: 2.5rem;
83
+ font-weight: 700;
84
+ color: #1E3A8A;
85
+ margin-bottom: 1rem;
86
+ }
87
+ .sub-header {
88
+ font-size: 1.5rem;
89
+ font-weight: 600;
90
+ color: #2563EB;
91
+ margin-top: 1rem;
92
+ margin-bottom: 0.5rem;
93
+ }
94
+ .card {
95
+ padding: 1.5rem;
96
+ border-radius: 0.5rem;
97
+ background-color: #F8FAFC;
98
+ border: 1px solid #E2E8F0;
99
+ margin-bottom: 1rem;
100
+ }
101
+ .positive {
102
+ color: #059669;
103
+ font-weight: 600;
104
+ }
105
+ .negative {
106
+ color: #DC2626;
107
+ font-weight: 600;
108
+ }
109
+ .neutral {
110
+ color: #6B7280;
111
+ font-weight: 600;
112
+ }
113
+ .topic-tag {
114
+ display: inline-block;
115
+ padding: 0.25rem 0.5rem;
116
+ border-radius: 2rem;
117
+ background-color: #E5E7EB;
118
+ color: #1F2937;
119
+ font-size: 0.75rem;
120
+ margin-right: 0.5rem;
121
+ margin-bottom: 0.5rem;
122
+ }
123
+ .audio-container {
124
+ width: 100%;
125
+ padding: 1rem;
126
+ background-color: #F3F4F6;
127
+ border-radius: 0.5rem;
128
+ margin-top: 1rem;
129
+ }
130
+ .info-text {
131
+ font-size: 0.9rem;
132
+ color: #4B5563;
133
+ }
134
+ .article-title {
135
+ font-size: 1.2rem;
136
+ font-weight: 600;
137
+ color: #111827;
138
+ margin-bottom: 0.5rem;
139
+ margin-top: 0.5rem;
140
+ }
141
+ .article-summary {
142
+ font-size: 0.9rem;
143
+ color: #374151;
144
+ margin-bottom: 0.5rem;
145
+ }
146
+ .article-meta {
147
+ font-size: 0.8rem;
148
+ color: #6B7280;
149
+ margin-bottom: 0.5rem;
150
+ }
151
+ .section-divider {
152
+ height: 1px;
153
+ background-color: #E5E7EB;
154
+ margin: 1.5rem 0;
155
+ }
156
+ .chart-container {
157
+ background-color: white;
158
+ padding: 1rem;
159
+ border-radius: 0.5rem;
160
+ border: 1px solid #E2E8F0;
161
+ }
162
+ </style>
163
+ """, unsafe_allow_html=True)
164
+
165
+ # Function to make API requests
166
+ def make_api_request(endpoint: str, data: Dict[str, Any] = None, method: str = "POST") -> Dict[str, Any]:
167
+ """Make API request to the backend."""
168
+ url = f"{API_BASE_URL}{endpoint}"
169
+
170
+ try:
171
+ if method == "GET":
172
+ response = requests.get(url)
173
+ else:
174
+ response = requests.post(url, json=data)
175
+
176
+ response.raise_for_status()
177
+ return response.json()
178
+ except requests.exceptions.ConnectionError:
179
+ st.error("⚠️ Connection Error: Cannot connect to the API server. Please ensure the API server is running at " + API_BASE_URL)
180
+ return {}
181
+ except requests.exceptions.Timeout:
182
+ st.error("⚠️ Timeout Error: The request took too long to complete. Please try again with a different company name.")
183
+ return {}
184
+ except requests.exceptions.HTTPError as e:
185
+ if e.response.status_code == 404:
186
+ st.error("⚠️ No articles found for this company. Please try another company name.")
187
+ elif e.response.status_code == 500:
188
+ # Try to get detailed error message
189
+ try:
190
+ error_detail = e.response.json().get("detail", "Unknown server error")
191
+ st.error(f"⚠️ Server Error: {error_detail}")
192
+ except:
193
+ st.error("⚠️ Internal Server Error: Something went wrong on the server. Please try again later.")
194
+ else:
195
+ st.error(f"⚠️ HTTP Error: {str(e)}")
196
+ return {}
197
+ except Exception as e:
198
+ st.error(f"⚠️ Error: {str(e)}")
199
+ return {}
200
+
201
+ # Function to create sentiment color
202
+ def get_sentiment_color(sentiment: str) -> str:
203
+ """Return CSS class for sentiment."""
204
+ if sentiment == "Positive":
205
+ return "positive"
206
+ elif sentiment == "Negative":
207
+ return "negative"
208
+ else:
209
+ return "neutral"
210
+
211
+ # Function to create visualization for sentiment distribution
212
+ def plot_sentiment_distribution(sentiment_data: Dict[str, int]):
213
+ """Create and display a bar chart for sentiment distribution."""
214
+ labels = ["Positive", "Neutral", "Negative"]
215
+ values = [sentiment_data[label] for label in labels]
216
+ colors = ["#059669", "#6B7280", "#DC2626"]
217
+
218
+ fig, ax = plt.subplots(figsize=(10, 6))
219
+ ax.bar(labels, values, color=colors)
220
+ ax.set_title("Sentiment Distribution", fontsize=16, fontweight='bold')
221
+ ax.set_ylabel("Number of Articles", fontsize=12)
222
+ ax.grid(axis='y', linestyle='--', alpha=0.7)
223
+
224
+ # Add value labels on top of bars
225
+ for i, v in enumerate(values):
226
+ ax.text(i, v + 0.1, str(v), ha='center', fontweight='bold')
227
+
228
+ return fig
229
+
230
+ # Function to display article information
231
+ def display_article(article: Dict[str, Any], index: int):
232
+ """Display article information in a card layout."""
233
+ st.markdown(f"<div class='card'>", unsafe_allow_html=True)
234
+
235
+ # Article title and sentiment
236
+ sentiment = article.get("Sentiment", "Neutral")
237
+ sentiment_class = get_sentiment_color(sentiment)
238
+
239
+ st.markdown(f"<h3 class='article-title'>{index+1}. {article['Title']}</h3>", unsafe_allow_html=True)
240
+ st.markdown(f"<span class='{sentiment_class}'>{sentiment}</span>", unsafe_allow_html=True)
241
+
242
+ # Article summary
243
+ st.markdown("<div class='article-summary'>", unsafe_allow_html=True)
244
+ st.markdown(f"{article.get('Summary', 'No summary available.')}", unsafe_allow_html=True)
245
+ st.markdown("</div>", unsafe_allow_html=True)
246
+
247
+ # Topics
248
+ if "Topics" in article and article["Topics"]:
249
+ st.markdown("<div>", unsafe_allow_html=True)
250
+ for topic in article["Topics"]:
251
+ st.markdown(f"<span class='topic-tag'>{topic}</span>", unsafe_allow_html=True)
252
+ st.markdown("</div>", unsafe_allow_html=True)
253
+
254
+ st.markdown("</div>", unsafe_allow_html=True)
255
+
256
+ # App layout
257
+ st.markdown("<h1 class='main-header'>📰 News Summarization & Text-to-Speech</h1>", unsafe_allow_html=True)
258
+ st.markdown("""
259
+ <p class='info-text'>
260
+ This application extracts news articles about a company, performs sentiment analysis, conducts comparative analysis,
261
+ and generates a text-to-speech output in Hindi. Enter a company name to get started.
262
+ </p>
263
+ """, unsafe_allow_html=True)
264
+
265
+ # Sidebar
266
+ st.sidebar.image("https://cdn-icons-png.flaticon.com/512/2593/2593073.png", width=100)
267
+ st.sidebar.title("News Analysis Settings")
268
+
269
+ # Company selection
270
+ company_input_method = st.sidebar.radio(
271
+ "Select company input method:",
272
+ options=["Text Input", "Choose from List"]
273
+ )
274
+
275
+ if company_input_method == "Text Input":
276
+ company_name = st.sidebar.text_input("Enter Company Name:", placeholder="e.g., Tesla")
277
+ else:
278
+ companies = ["Apple", "Google", "Microsoft", "Amazon", "Tesla", "Meta", "Netflix", "Uber", "Airbnb", "Twitter"]
279
+ company_name = st.sidebar.selectbox("Select Company:", companies)
280
+
281
+ # Analysis settings
282
+ max_articles = st.sidebar.slider("Maximum Articles to Analyze:", min_value=5, max_value=20, value=10)
283
+ st.sidebar.markdown("---")
284
+
285
+ # Analysis button
286
+ analyze_button = st.sidebar.button("Analyze Company News", type="primary")
287
+
288
+ # Audio playback settings
289
+ st.sidebar.markdown("## Audio Settings")
290
+ audio_speed = st.sidebar.select_slider("TTS Speech Speed:", options=["Slow", "Normal", "Fast"], value="Normal")
291
+ st.sidebar.markdown("---")
292
+
293
+ # Add option to see JSON in example format
294
+ st.sidebar.markdown("## Developer Options")
295
+ show_json = st.sidebar.checkbox("Show JSON output in example format")
296
+ st.sidebar.markdown("---")
297
+
298
+ # About section
299
+ with st.sidebar.expander("About This App"):
300
+ st.markdown("""
301
+ This application performs:
302
+ - News extraction from multiple sources
303
+ - Sentiment analysis of the content
304
+ - Topic identification and comparative analysis
305
+ - Text-to-speech conversion to Hindi
306
+
307
+ Built with Streamlit, FastAPI, and various NLP tools.
308
+ """)
309
+
310
+ # Main content area
311
+ if analyze_button and company_name:
312
+ with st.spinner(f"Analyzing news for {company_name}... This may take a minute"):
313
+ # Perform complete analysis
314
+ response = make_api_request(
315
+ "/api/complete_analysis",
316
+ {"company_name": company_name}
317
+ )
318
+
319
+ if not response:
320
+ st.error("Failed to retrieve data. Please try again.")
321
+ elif "detail" in response:
322
+ st.error(response["detail"])
323
+ else:
324
+ # Display company header
325
+ st.markdown(f"<h2 class='sub-header'>Analysis Results for {response['Company']}</h2>", unsafe_allow_html=True)
326
+
327
+ # Display sentiment summary
328
+ col1, col2 = st.columns([2, 1])
329
+
330
+ with col1:
331
+ st.markdown("<div class='card'>", unsafe_allow_html=True)
332
+ st.markdown("<h3 class='sub-header'>Sentiment Overview</h3>", unsafe_allow_html=True)
333
+ st.markdown(f"{response['Final Sentiment Analysis']}")
334
+ st.markdown("</div>", unsafe_allow_html=True)
335
+
336
+ with col2:
337
+ sentiment_data = response["Comparative Sentiment Score"]["Sentiment Distribution"]
338
+ fig = plot_sentiment_distribution(sentiment_data)
339
+ st.pyplot(fig)
340
+
341
+ st.markdown("<div class='section-divider'></div>", unsafe_allow_html=True)
342
+
343
+ # Display Hindi TTS audio
344
+ if "Audio" in response and response["Audio"]:
345
+ st.markdown("<h3 class='sub-header'>Hindi Audio Summary</h3>", unsafe_allow_html=True)
346
+
347
+ audio_message = response["Audio"]
348
+
349
+ if audio_message == "Failed to generate audio":
350
+ st.warning("Hindi audio could not be generated. However, you can still read the Hindi text below.")
351
+ else:
352
+ try:
353
+ # Check if the response contains the actual audio file path
354
+ audio_file_path = response.get("_audio_file_path")
355
+
356
+ if audio_file_path:
357
+ # Extract the filename
358
+ audio_filename = os.path.basename(audio_file_path)
359
+ audio_url = f"{API_BASE_URL}/api/audio/{audio_filename}"
360
+ else:
361
+ # If no path is provided, just display a message
362
+ st.info("Audio is available but the path was not provided.")
363
+ audio_url = None
364
+
365
+ if audio_url:
366
+ # Attempt to download the audio file
367
+ audio_response = requests.get(audio_url)
368
+ if audio_response.status_code == 200:
369
+ # Save temporarily
370
+ temp_audio_path = f"temp_audio_{os.path.basename(audio_url)}"
371
+ with open(temp_audio_path, "wb") as f:
372
+ f.write(audio_response.content)
373
+
374
+ # Play from local file
375
+ st.markdown("<div class='audio-container'>", unsafe_allow_html=True)
376
+ st.audio(temp_audio_path, format="audio/mp3")
377
+
378
+ # Display audio download link
379
+ st.markdown(f"<a href='{audio_url}' download='hindi_summary.mp3'>Download Hindi Audio</a>", unsafe_allow_html=True)
380
+
381
+ # Clean up temp file (optional)
382
+ # os.remove(temp_audio_path) # Uncomment to delete after use
383
+ else:
384
+ st.warning(f"Unable to load audio file (HTTP {audio_response.status_code}). You can still read the Hindi text below.")
385
+ else:
386
+ st.info("Hindi audio summary would be available here.")
387
+ except Exception as e:
388
+ st.warning(f"Error playing audio: {str(e)}. You can still read the Hindi text below.")
389
+
390
+ # Display the Hindi text with better formatting
391
+ with st.expander("Show Hindi Text"):
392
+ hindi_text = response.get("Hindi Summary", "Hindi text not available.")
393
+
394
+ # Format the text for better readability
395
+ paragraphs = hindi_text.split("। ")
396
+
397
+ for paragraph in paragraphs:
398
+ if paragraph.strip():
399
+ # Add a period if it doesn't end with one
400
+ if not paragraph.strip().endswith("।"):
401
+ paragraph += "।"
402
+ st.markdown(f"<p style='font-size: 16px; margin-bottom: 10px;'>{paragraph}</p>", unsafe_allow_html=True)
403
+
404
+ st.markdown("</div>", unsafe_allow_html=True)
405
+
406
+ st.markdown("<div class='section-divider'></div>", unsafe_allow_html=True)
407
+
408
+ # Display articles
409
+ st.markdown("<h3 class='sub-header'>News Articles</h3>", unsafe_allow_html=True)
410
+ articles = response.get("Articles", [])
411
+
412
+ if not articles:
413
+ st.info("No articles found for this company.")
414
+ else:
415
+ for i, article in enumerate(articles):
416
+ display_article(article, i)
417
+
418
+ st.markdown("<div class='section-divider'></div>", unsafe_allow_html=True)
419
+
420
+ # Display comparative analysis
421
+ st.markdown("<h3 class='sub-header'>Comparative Analysis</h3>", unsafe_allow_html=True)
422
+
423
+ # Display topic overlap
424
+ topic_data = response["Comparative Sentiment Score"]["Topic Overlap"]
425
+
426
+ col1, col2 = st.columns(2)
427
+
428
+ with col1:
429
+ st.markdown("<div class='card'>", unsafe_allow_html=True)
430
+ st.markdown("<h4>Common Topics</h4>", unsafe_allow_html=True)
431
+
432
+ common_topics = topic_data.get("Common Topics Across All", [])
433
+ if common_topics:
434
+ for topic in common_topics:
435
+ st.markdown(f"<span class='topic-tag'>{topic}</span>", unsafe_allow_html=True)
436
+ else:
437
+ st.info("No common topics found across articles.")
438
+
439
+ st.markdown("</div>", unsafe_allow_html=True)
440
+
441
+ with col2:
442
+ st.markdown("<div class='card'>", unsafe_allow_html=True)
443
+ st.markdown("<h4>Coverage Comparison</h4>", unsafe_allow_html=True)
444
+
445
+ comparisons = response["Comparative Sentiment Score"].get("Coverage Differences", [])
446
+ if comparisons:
447
+ for i, comparison in enumerate(comparisons[:3]): # Show only top 3 comparisons
448
+ st.markdown(f"<p><strong>{i+1}.</strong> {comparison.get('Comparison', '')}</p>", unsafe_allow_html=True)
449
+ st.markdown(f"<p class='info-text'>{comparison.get('Impact', '')}</p>", unsafe_allow_html=True)
450
+ else:
451
+ st.info("No comparative insights available.")
452
+
453
+ st.markdown("</div>", unsafe_allow_html=True)
454
+
455
+ # Display full comparison in expander
456
+ with st.expander("View All Comparisons"):
457
+ comparisons = response["Comparative Sentiment Score"].get("Coverage Differences", [])
458
+ for i, comparison in enumerate(comparisons):
459
+ st.markdown(f"<p><strong>{i+1}.</strong> {comparison.get('Comparison', '')}</p>", unsafe_allow_html=True)
460
+ st.markdown(f"<p class='info-text'>{comparison.get('Impact', '')}</p>", unsafe_allow_html=True)
461
+ st.markdown("<hr>", unsafe_allow_html=True)
462
+
463
+ # Show JSON in example format if requested
464
+ if show_json:
465
+ st.markdown("<div class='section-divider'></div>", unsafe_allow_html=True)
466
+ st.markdown("<h3 class='sub-header'>Example JSON Format</h3>", unsafe_allow_html=True)
467
+
468
+ # Get the formatted JSON
469
+ json_output = generate_example_output(company_name)
470
+
471
+ # Display the JSON in a code block
472
+ st.code(json_output, language="json")
473
+ else:
474
+ # Display placeholder
475
+ st.markdown("<div class='card'>", unsafe_allow_html=True)
476
+ st.markdown("<h3 class='sub-header'>Enter a Company Name to Begin Analysis</h3>", unsafe_allow_html=True)
477
+ st.markdown("""
478
+ <p class='info-text'>
479
+ This application will:
480
+ </p>
481
+ <ul class='info-text'>
482
+ <li>Extract news articles from multiple sources</li>
483
+ <li>Analyze sentiment (positive, negative, neutral)</li>
484
+ <li>Identify key topics in each article</li>
485
+ <li>Perform comparative analysis across articles</li>
486
+ <li>Generate Hindi speech output summarizing the findings</li>
487
+ </ul>
488
+ """, unsafe_allow_html=True)
489
+ st.markdown("</div>", unsafe_allow_html=True)
490
+
491
+ # Sample output image
492
+ st.image("https://miro.medium.com/max/1400/1*Ger-949PgQnaje2oa9XMdw.png", caption="Sample sentiment analysis visualization")
493
+
494
+ # Footer
495
+ st.markdown("<div class='section-divider'></div>", unsafe_allow_html=True)
496
+ st.markdown("<p class='info-text' style='text-align: center;'>News Summarization & Text-to-Speech Application | Developed with Streamlit and FastAPI</p>", unsafe_allow_html=True)
generate_json_output.py ADDED
@@ -0,0 +1,55 @@
1
+ #!/usr/bin/env python
2
+
3
+ import requests
4
+ import json
5
+ import sys
6
+
7
+ def generate_json_output(company_name, api_url="http://localhost:8000"):
8
+ """
9
+ Generate output in the example format for the given company.
10
+
11
+ Args:
12
+ company_name (str): Name of the company to analyze
13
+ api_url (str): Base URL of the API
14
+
15
+ Returns:
16
+ str: Formatted JSON string
17
+ """
18
+ try:
19
+ # Make API request to get the analysis data
20
+ url = f"{api_url}/api/complete_analysis"
21
+ response = requests.post(url, json={"company_name": company_name})
22
+ response.raise_for_status()
23
+ data = response.json()
24
+
25
+ # Format the data to match the example output format exactly
26
+ formatted_output = {
27
+ "Company": data["Company"],
28
+ "Articles": data["Articles"],
29
+ "Comparative Sentiment Score": {
30
+ "Sentiment Distribution": data["Comparative Sentiment Score"]["Sentiment Distribution"],
31
+ "Coverage Differences": data["Comparative Sentiment Score"]["Coverage Differences"],
32
+ "Topic Overlap": data["Comparative Sentiment Score"]["Topic Overlap"]
33
+ },
34
+ "Final Sentiment Analysis": data["Final Sentiment Analysis"],
35
+ "Audio": "[Play Hindi Speech]" if data.get("Audio") else "No audio available"
36
+ }
37
+
38
+ # Convert to JSON string with proper formatting
39
+ return json.dumps(formatted_output, indent=2)
40
+
41
+ except Exception as e:
42
+ return json.dumps({
43
+ "error": str(e),
44
+ "message": "Failed to generate example output"
45
+ }, indent=2)
46
+
47
+ if __name__ == "__main__":
48
+ # Get company name from command line arguments or prompt for it
49
+ if len(sys.argv) > 1:
50
+ company_name = sys.argv[1]
51
+ else:
52
+ company_name = input("Enter company name: ")
53
+
54
+ print(f"Input:\nCompany Name: {company_name}")
55
+ print("Output:", generate_json_output(company_name))
healthcheck.py ADDED
@@ -0,0 +1,171 @@
1
+ """
2
+ Healthcheck script to verify the functionality of all components of the application.
3
+ Run this script to check if all dependencies are correctly installed and working.
4
+ """
5
+
6
+ import sys
7
+ import os
8
+ import time
9
+ import traceback
10
+
11
+ def run_checks():
12
+ print("Starting health check for News Summarization and TTS Application...")
13
+ checks_passed = 0
14
+ checks_failed = 0
15
+
16
+ # Check 1: Verify imports
17
+ print("\n1. Checking imports...")
18
+ try:
19
+ # Standard libraries
20
+ import json
21
+ import re
22
+
23
+ # Web and API dependencies
24
+ import requests
25
+ import fastapi
26
+ import uvicorn
27
+ import streamlit
28
+
29
+ # Data processing
30
+ import pandas
31
+ import numpy
32
+ import bs4
33
+
34
+ # NLP
35
+ import nltk
36
+ import networkx
37
+ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
38
+
39
+ # ML and Transformers
40
+ import torch
41
+ import transformers
42
+ from transformers import pipeline
43
+
44
+ # TTS and Translation
45
+ import deep_translator
46
+ from deep_translator import GoogleTranslator
47
+ import gtts
48
+ import pyttsx3
49
+
50
+ print("✅ All imports successful.")
51
+ checks_passed += 1
52
+ except ImportError as e:
53
+ print(f"❌ Import error: {str(e)}")
54
+ print(f"Traceback: {traceback.format_exc()}")
55
+ checks_failed += 1
56
+
57
+ # Check 2: Verify NLTK data
58
+ print("\n2. Checking NLTK data...")
59
+ try:
60
+ import nltk
61
+ nltk.data.find('tokenizers/punkt')
62
+ nltk.data.find('corpora/stopwords')
63
+ print("✅ NLTK data verified.")
64
+ checks_passed += 1
65
+ except LookupError as e:
66
+ print(f"❌ NLTK data error: {str(e)}")
67
+ print("Trying to download necessary NLTK data...")
68
+ try:
69
+ nltk.download('punkt')
70
+ nltk.download('stopwords')
71
+ print("✅ NLTK data downloaded successfully.")
72
+ checks_passed += 1
73
+ except Exception as e:
74
+ print(f"❌ Failed to download NLTK data: {str(e)}")
75
+ checks_failed += 1
76
+
77
+ # Check 3: Test translation
78
+ print("\n3. Testing translation service...")
79
+ try:
80
+ from deep_translator import GoogleTranslator
81
+ translator = GoogleTranslator(source='en', target='hi')
82
+ text = "Hello, this is a test."
83
+ translated = translator.translate(text)
84
+ print(f"Original text: {text}")
85
+ print(f"Translated text: {translated}")
86
+ if translated and len(translated) > 0:
87
+ print("✅ Translation service working.")
88
+ checks_passed += 1
89
+ else:
90
+ print("❌ Translation returned empty result.")
91
+ checks_failed += 1
92
+ except Exception as e:
93
+ print(f"❌ Translation error: {str(e)}")
94
+ print(f"Traceback: {traceback.format_exc()}")
95
+ checks_failed += 1
96
+
97
+ # Check 4: Test TTS
98
+ print("\n4. Testing Text-to-Speech service...")
99
+ try:
100
+ from gtts import gTTS
101
+ test_text = "परीक्षण पाठ" # "Test text" in Hindi
102
+ test_file = 'test_audio.mp3'
103
+
104
+ # Try gTTS
105
+ tts = gTTS(text=test_text, lang='hi', slow=False)
106
+ tts.save(test_file)
107
+
108
+ if os.path.exists(test_file) and os.path.getsize(test_file) > 0:
109
+ print("✅ gTTS service working.")
110
+ # Clean up test file
111
+ try:
112
+ os.remove(test_file)
113
+ except:
114
+ pass
115
+ checks_passed += 1
116
+ else:
117
+ print("❌ gTTS failed to generate a valid audio file.")
118
+ checks_failed += 1
119
+ except Exception as e:
120
+ print(f"❌ Text-to-Speech error: {str(e)}")
121
+ print(f"Traceback: {traceback.format_exc()}")
122
+ checks_failed += 1
123
+
124
+ # Check 5: Test sentiment analysis
125
+ print("\n5. Testing sentiment analysis...")
126
+ try:
127
+ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
128
+ analyzer = SentimentIntensityAnalyzer()
129
+ test_text = "This product is excellent and I love it!"
130
+ scores = analyzer.polarity_scores(test_text)
131
+ print(f"Sentiment scores for '{test_text}': {scores}")
132
+ if 'compound' in scores:
133
+ print("✅ Sentiment analysis working.")
134
+ checks_passed += 1
135
+ else:
136
+ print("❌ Sentiment analysis returned unexpected result.")
137
+ checks_failed += 1
138
+ except Exception as e:
139
+ print(f"❌ Sentiment analysis error: {str(e)}")
140
+ print(f"Traceback: {traceback.format_exc()}")
141
+ checks_failed += 1
142
+
143
+ # Check 6: Test Transformers
144
+ print("\n6. Testing Transformer models...")
145
+ try:
146
+ from transformers import pipeline
147
+ sentiment_task = pipeline("sentiment-analysis", return_all_scores=False)
148
+ result = sentiment_task("I love using this application!")
149
+ print(f"Transformer sentiment analysis result: {result}")
150
+ print("✅ Transformer models working.")
151
+ checks_passed += 1
152
+ except Exception as e:
153
+ print(f"❌ Transformer models error: {str(e)}")
154
+ print(f"Traceback: {traceback.format_exc()}")
155
+ checks_failed += 1
156
+
157
+ # Summary
158
+ print("\n" + "="*50)
159
+ print(f"Health Check Summary: {checks_passed} checks passed, {checks_failed} checks failed")
160
+
161
+ if checks_failed == 0:
162
+ print("\n✅ All systems operational! The application should run correctly.")
163
+ return True
164
+ else:
165
+ print("\n❌ Some checks failed. Please review the errors above.")
166
+ return False
167
+
168
+ if __name__ == "__main__":
169
+ success = run_checks()
170
+ if not success:
171
+ sys.exit(1)
requirements.txt ADDED
@@ -0,0 +1,39 @@
1
+ # Core dependencies
2
+ streamlit==1.27.0
3
+ fastapi==0.103.1
4
+ uvicorn==0.23.2
5
+ requests==2.31.0
6
+ beautifulsoup4==4.12.2
7
+ pandas==2.1.0
8
+ numpy==1.25.2
9
+ scipy==1.10.1
10
+
11
+ # NLP and Sentiment Analysis
12
+ transformers==4.33.1
13
+ torch==2.0.1
14
+ nltk==3.8.1
15
+ vaderSentiment==3.3.2
16
+
17
+ # Text-to-Speech
18
+ gTTS==2.3.2
19
+ pyttsx3==2.90
20
+ deep-translator==1.11.4
21
+
22
+ # Data Processing and Visualization
23
+ matplotlib==3.7.3
24
+ seaborn==0.12.2
25
+ scikit-learn==1.3.0
26
+ networkx==3.1
27
+
28
+ # API and Web
29
+ aiohttp==3.8.5
30
+ httpx==0.24.1
31
+ pydantic==2.3.0
32
+ python-dotenv==1.0.0
33
+ python-multipart==0.0.6
34
+
35
+ # HuggingFace Spaces
36
+ huggingface-hub==0.16.4
37
+
38
+ # Added from the code block
39
+ pydub==0.25.1
utils.py ADDED
@@ -0,0 +1,1132 @@
1
+ import requests
2
+ import re
3
+ import os
4
+ import json
5
+ import time
6
+ from typing import List, Dict, Any, Tuple, Optional
7
+ from bs4 import BeautifulSoup
8
+ import pandas as pd
9
+ import numpy as np
10
+ from nltk.corpus import stopwords
11
+ from nltk.tokenize import sent_tokenize, word_tokenize
12
+ from nltk.cluster.util import cosine_distance
13
+ import networkx as nx
14
+ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
15
+ from collections import Counter
16
+ from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
17
+ from deep_translator import GoogleTranslator
18
+ from gtts import gTTS
19
+ import pyttsx3
20
+
21
+ # Download necessary NLTK data
22
+ import nltk
23
+ try:
24
+ nltk.data.find('tokenizers/punkt')
25
+ nltk.data.find('corpora/stopwords')
26
+ except LookupError:
27
+ nltk.download('punkt')
28
+ nltk.download('stopwords')
29
+
30
+ # Initialize sentiment analyzer
31
+ vader_analyzer = SentimentIntensityAnalyzer()
32
+
33
+ # Initialize advanced sentiment model
34
+ sentiment_model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
35
+ sentiment_tokenizer = AutoTokenizer.from_pretrained(sentiment_model_name)
36
+ sentiment_model = AutoModelForSequenceClassification.from_pretrained(sentiment_model_name)
37
+ advanced_sentiment = pipeline("sentiment-analysis", model=sentiment_model, tokenizer=sentiment_tokenizer)
38
+
39
+ # Initialize translator
40
+ translator = GoogleTranslator(source='en', target='hi')
41
+
42
+ class NewsArticle:
43
+ def __init__(self, title: str, url: str, content: str, summary: str = "", source: str = "",
44
+ date: str = "", sentiment: str = "", topics: List[str] = None):
45
+ self.title = title
46
+ self.url = url
47
+ self.content = content
48
+ self.summary = summary if summary else self.generate_summary(content)
49
+ self.source = source
50
+ self.date = date
51
+ self.sentiment = sentiment if sentiment else self.analyze_sentiment(content, title)
52
+ self.topics = topics if topics else self.extract_topics(content)
53
+
54
+ def to_dict(self) -> Dict[str, Any]:
55
+ return {
56
+ "title": self.title,
57
+ "url": self.url,
58
+ "content": self.content,
59
+ "summary": self.summary,
60
+ "source": self.source,
61
+ "date": self.date,
62
+ "sentiment": self.sentiment,
63
+ "topics": self.topics
64
+ }
65
+
66
+ @staticmethod
67
+ def analyze_sentiment(text: str, title: str = "") -> str:
68
+ """
69
+ Analyze sentiment using a combination of methods for more accurate results.
70
+ We give more weight to the title sentiment and use the advanced model when possible.
71
+ """
72
+ # Set thresholds for VADER sentiment
73
+ threshold_positive = 0.05 # Default 0.05
74
+ threshold_negative = -0.05 # Default -0.05
75
+
76
+ # Use VADER for basic sentiment analysis on both title and content
77
+ try:
78
+ title_scores = vader_analyzer.polarity_scores(title) if title else {'compound': 0}
79
+ content_scores = vader_analyzer.polarity_scores(text)
80
+
81
+ # Weight the title more heavily (title sentiment is often more reliable)
82
+ title_weight = 0.6 if title else 0
83
+ content_weight = 1.0 - title_weight
84
+
85
+ compound_score = (title_weight * title_scores['compound']) + (content_weight * content_scores['compound'])
86
+
87
+ # Try to use the advanced model for additional insight (for short texts)
88
+ advanced_result = None
89
+ advanced_score = 0
90
+
91
+ try:
92
+ # Use title + first part of content for advanced model
93
+ sample_text = title + ". " + text[:300] if title else text[:300]
94
+ advanced_result = advanced_sentiment(sample_text)[0]
95
+
96
+ # Map advanced model results to a -1 to 1 scale similar to VADER
97
+ label = advanced_result['label']
98
+ confidence = advanced_result['score']
99
+
100
+ # Map the 1-5 star rating to a -1 to 1 scale
101
+ if label == '1 star' or label == '2 stars':
102
+ advanced_score = -confidence
103
+ elif label == '4 stars' or label == '5 stars':
104
+ advanced_score = confidence
105
+ else: # 3 stars is neutral
106
+ advanced_score = 0
107
+
108
+ # Combine VADER and advanced model scores
109
+ # Give more weight to advanced model when confidence is high
110
+ if confidence > 0.8:
111
+ compound_score = (0.4 * compound_score) + (0.6 * advanced_score)
112
+ else:
113
+ compound_score = (0.7 * compound_score) + (0.3 * advanced_score)
114
+
115
+ except Exception as e:
116
+ print(f"Advanced sentiment analysis failed: {str(e)}")
117
+ # Continue with just VADER if advanced model fails
118
+ pass
119
+
120
+ # Fine-grained sentiment mapping
121
+ if compound_score >= 0.3:
122
+ return "Positive"
123
+ elif compound_score >= threshold_positive:
124
+ return "Slightly Positive"
125
+ elif compound_score <= -0.3:
126
+ return "Negative"
127
+ elif compound_score <= threshold_negative:
128
+ return "Slightly Negative"
129
+ else:
130
+ return "Neutral"
131
+
132
+ except Exception as e:
133
+ print(f"Sentiment analysis error: {str(e)}")
134
+ return "Neutral" # Default fallback
135
+
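# Worked sketch of the weighting above, using made-up scores (not taken from any
# real article): title compound = 0.60, content compound = 0.10.
#   blended = 0.6 * 0.60 + 0.4 * 0.10 = 0.40
# If the advanced model returns "4 stars" with confidence 0.90 (> 0.8), that maps
# to +0.90, so:
#   final = 0.4 * 0.40 + 0.6 * 0.90 = 0.70  ->  "Positive" (>= 0.3)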
136
+ @staticmethod
137
+ def generate_summary(text: str, num_sentences: int = 5) -> str:
138
+ # Generate summary using extractive summarization
139
+ if not text or len(text) < 100:
140
+ return text
141
+
142
+ # Tokenize sentences
143
+ sentences = sent_tokenize(text)
144
+ if len(sentences) <= num_sentences:
145
+ return text
146
+
147
+ # Calculate sentence similarity and rank them
148
+ similarity_matrix = build_similarity_matrix(sentences)
149
+ scores = nx.pagerank(nx.from_numpy_array(similarity_matrix))
150
+
151
+ # Select top sentences
152
+ ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
153
+ summary_sentences = [ranked_sentences[i][1] for i in range(min(num_sentences, len(ranked_sentences)))]
154
+
155
+ # Maintain original order
156
+ original_order = []
157
+ for sentence in sentences:
158
+ if sentence in summary_sentences and sentence not in original_order:
159
+ original_order.append(sentence)
160
+ if len(original_order) >= num_sentences:
161
+ break
162
+
163
+ return " ".join(original_order)
164
+
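# Usage sketch for the extractive summariser above (assumes the NLTK punkt data
# fetched at import time is available):
#   summary = NewsArticle.generate_summary(article_text, num_sentences=3)
# Sentences are scored with PageRank over the similarity graph built by
# build_similarity_matrix below, and the top ones are returned in original order.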
165
+ @staticmethod
166
+ def extract_topics(text: str, num_topics: int = 5) -> List[str]:
167
+ # Extract key topics from text based on term frequency
168
+ stop_words = set(stopwords.words('english'))
169
+ words = word_tokenize(text.lower())
170
+
171
+ # Filter out stopwords and short words
172
+ filtered_words = [word for word in words if word.isalpha() and word not in stop_words and len(word) > 3]
173
+
174
+ # Count word frequencies
175
+ word_counts = Counter(filtered_words)
176
+
177
+ # Return most common words as topics
178
+ topics = [word for word, _ in word_counts.most_common(num_topics)]
179
+ return topics
180
+
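# Small, self-contained sketch of the frequency-based topic extraction above.
# The sentence is invented; with real articles the topics are simply the most
# frequent non-stopword tokens longer than three characters.
def _example_extract_topics() -> List[str]:
    text = "Tesla deliveries rose while Tesla margins fell, and deliveries beat estimates."
    return NewsArticle.extract_topics(text, num_topics=3)  # e.g. ['tesla', 'deliveries', ...]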
181
+ def build_similarity_matrix(sentences: List[str]) -> np.ndarray:
182
+ """Build similarity matrix for sentences based on cosine similarity."""
183
+ # Number of sentences
184
+ n = len(sentences)
185
+
186
+ # Initialize similarity matrix
187
+ similarity_matrix = np.zeros((n, n))
188
+
189
+ # Calculate similarity between each pair of sentences
190
+ for i in range(n):
191
+ for j in range(n):
192
+ if i != j:
193
+ similarity_matrix[i][j] = sentence_similarity(sentences[i], sentences[j])
194
+
195
+ return similarity_matrix
196
+
197
+ def sentence_similarity(sent1: str, sent2: str) -> float:
198
+ """Calculate similarity between two sentences using cosine similarity."""
199
+ # Tokenize sentences
200
+ words1 = [word.lower() for word in word_tokenize(sent1) if word.isalpha()]
201
+ words2 = [word.lower() for word in word_tokenize(sent2) if word.isalpha()]
202
+
203
+ # Get all unique words
204
+ all_words = list(set(words1 + words2))
205
+
206
+ # Create word vectors
207
+ vector1 = [1 if word in words1 else 0 for word in all_words]
208
+ vector2 = [1 if word in words2 else 0 for word in all_words]
209
+
210
+ # Calculate cosine similarity
211
+ if not any(vector1) or not any(vector2):
212
+ return 0.0
213
+
214
+ return 1 - cosine_distance(vector1, vector2)
215
+
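# Worked sketch of the binary bag-of-words cosine used above. For
#   "Tesla stock rose"  vs  "Tesla stock fell"
# the shared vocabulary is {tesla, stock, rose, fell}, the vectors are
# [1, 1, 1, 0] and [1, 1, 0, 1], and cosine similarity = 2 / (sqrt(3) * sqrt(3)) = 2/3.
def _example_sentence_similarity() -> float:
    return sentence_similarity("Tesla stock rose", "Tesla stock fell")  # ~0.67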
216
+ def search_news(company_name: str, num_articles: int = 10) -> List[NewsArticle]:
217
+ """Search for news articles about a given company."""
218
+ # List to store articles
219
+ articles = []
220
+
221
+ # Define search queries and news sources
222
+ search_queries = [
223
+ f"{company_name} news",
224
+ f"{company_name} financial news",
225
+ f"{company_name} business news",
226
+ f"{company_name} recent news",
227
+ f"{company_name} company news",
228
+ f"{company_name} stock",
229
+ f"{company_name} market"
230
+ ]
231
+
232
+ # Updated news sources with more reliable sources
233
+ news_sources = [
234
+ {
235
+ "base_url": "https://finance.yahoo.com/quote/",
236
+ "article_patterns": ["news", "finance", "articles"],
237
+ "direct_access": True
238
+ },
239
+ {
240
+ "base_url": "https://www.reuters.com/search/news?blob=",
241
+ "article_patterns": ["article", "business", "companies", "markets"],
242
+ "direct_access": False
243
+ },
244
+ {
245
+ "base_url": "https://www.marketwatch.com/search?q=",
246
+ "article_patterns": ["story", "articles", "news"],
247
+ "direct_access": False
248
+ },
249
+ {
250
+ "base_url": "https://www.fool.com/search?q=",
251
+ "article_patterns": ["article", "investing", "stock"],
252
+ "direct_access": False
253
+ },
254
+ {
255
+ "base_url": "https://seekingalpha.com/search?q=",
256
+ "article_patterns": ["article", "news", "stock", "analysis"],
257
+ "direct_access": False
258
+ },
259
+ {
260
+ "base_url": "https://www.zacks.com/search.php?q=",
261
+ "article_patterns": ["stock", "research", "analyst"],
262
+ "direct_access": False
263
+ },
264
+ {
265
+ "base_url": "https://economictimes.indiatimes.com/search?q=",
266
+ "article_patterns": ["articleshow", "news", "industry"],
267
+ "direct_access": False
268
+ },
269
+ {
270
+ "base_url": "https://www.bloomberg.com/search?query=",
271
+ "article_patterns": ["news", "articles"],
272
+ "direct_access": False
273
+ }
274
+ ]
275
+
276
+ print(f"Starting search for news about {company_name}...")
277
+
278
+ # Search each source with each query until we have enough articles
279
+ for query in search_queries:
280
+ if len(articles) >= num_articles:
281
+ break
282
+
283
+ for source in news_sources:
284
+ if len(articles) >= num_articles:
285
+ break
286
+
287
+ try:
288
+ source_base = source["base_url"]
289
+ article_patterns = source["article_patterns"]
290
+ direct_access = source["direct_access"]
291
+
292
+ # Construct search URL
293
+ if direct_access:
294
+ # Try to fetch the stock symbol for Yahoo Finance
295
+ if "yahoo" in source_base:
296
+ try:
297
+ # First try the company name directly (for known tickers)
298
+ search_url = f"{source_base}{company_name}/news"
299
+ print(f"Trying direct ticker access: {search_url}")
300
+
301
+ # Fetch to check if valid
302
+ headers = {
303
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
304
+ }
305
+ test_response = requests.get(search_url, headers=headers, timeout=10)
306
+
307
+ # If we got a 404, try searching for the symbol first
308
+ if test_response.status_code == 404:
309
+ print("Company name not a valid ticker, searching for symbol...")
310
+ symbol_url = f"https://finance.yahoo.com/lookup?s={company_name}"
311
+ symbol_response = requests.get(symbol_url, headers=headers, timeout=10)
312
+
313
+ if symbol_response.status_code == 200:
314
+ symbol_soup = BeautifulSoup(symbol_response.text, 'html.parser')
315
+ # Try to find the first stock symbol result
316
+ symbol_row = symbol_soup.select_one("tr.data-row0")
317
+ if symbol_row:
318
+ symbol_cell = symbol_row.select_one("td:first-child a")
319
+ if symbol_cell:
320
+ symbol = symbol_cell.text.strip()
321
+ search_url = f"{source_base}{symbol}/news"
322
+ print(f"Found symbol {symbol}, using URL: {search_url}")
323
+ except Exception as e:
324
+ print(f"Error getting stock symbol: {str(e)}")
325
+ search_url = f"{source_base}{company_name}/news"
326
+ else:
327
+ search_url = f"{source_base}{company_name}/news"
328
+ else:
329
+ search_url = f"{source_base}{query.replace(' ', '+')}"
330
+
331
+ print(f"Searching {search_url}")
332
+
333
+ # Fetch search results with retry mechanism
334
+ max_retries = 3
335
+ retry_count = 0
336
+ response = None
337
+
338
+ while retry_count < max_retries:
339
+ try:
340
+ headers = {
341
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
342
+ "Accept": "text/html,application/xhtml+xml,application/xml",
343
+ "Accept-Language": "en-US,en;q=0.9",
344
+ "Referer": "https://www.google.com/"
345
+ }
346
+ response = requests.get(search_url, headers=headers, timeout=15)
347
+ if response.status_code == 200:
348
+ break
349
+ retry_count += 1
350
+ print(f"Retry {retry_count}/{max_retries} for {search_url} (status: {response.status_code})")
351
+ time.sleep(1) # Short delay before retry
352
+ except Exception as e:
353
+ retry_count += 1
354
+ print(f"Request error (attempt {retry_count}/{max_retries}): {str(e)}")
355
+ time.sleep(1)
356
+
357
+ if not response or response.status_code != 200:
358
+ print(f"Failed to fetch results from {search_url} after {max_retries} attempts")
359
+ continue
360
+
361
+ soup = BeautifulSoup(response.text, 'html.parser')
362
+
363
+ # Extract article links - using more flexible patterns
364
+ links = soup.find_all('a', href=True)
365
+ article_links = []
366
+
367
+ # Domain for resolving relative URLs
368
+ domain = response.url.split('/')[0] + '//' + response.url.split('/')[2]
369
+ print(f"Domain for resolving URLs: {domain}")
370
+
371
+ for link in links:
372
+ href = link['href']
373
+ link_text = link.text.strip()
374
+
375
+ # Skip empty links or navigation elements
376
+ if not link_text or len(link_text) < 10 or href.startswith('#'):
377
+ continue
378
+
379
+ # Check if the link matches any of our article patterns
380
+ is_article_link = False
381
+ for pattern in article_patterns:
382
+ if pattern in href.lower():
383
+ is_article_link = True
384
+ break
385
+
386
+ # Check for the company name in link text or URL (less restrictive now)
387
+ contains_company = (
388
+ company_name.lower() in link_text.lower() or
389
+ company_name.lower() in href.lower()
390
+ )
391
+
392
+ if is_article_link or contains_company:
393
+ # Convert relative URLs to absolute
394
+ if href.startswith('/'):
395
+ href = f"{domain}{href}"
396
+ elif not href.startswith(('http://', 'https://')):
397
+ href = f"{domain}/{href}"
398
+
399
+ # Avoid duplicates
400
+ if href not in article_links:
401
+ article_links.append(href)
402
+ print(f"Found potential article: {link_text[:50]}... at {href}")
403
+
404
+ print(f"Found {len(article_links)} potential article links from {search_url}")
405
+
406
+ # Process each article link
407
+ for link in article_links[:5]: # Increased from 3 to 5
408
+ if len(articles) >= num_articles:
409
+ break
410
+
411
+ try:
412
+ print(f"Fetching article: {link}")
413
+ article_response = requests.get(link, headers=headers, timeout=15)
414
+
415
+ if article_response.status_code != 200:
416
+ print(f"Failed to fetch article: {article_response.status_code}")
417
+ continue
418
+
419
+ article_soup = BeautifulSoup(article_response.text, 'html.parser')
420
+
421
+ # Extract article title - more robust method
422
+ title = None
423
+
424
+ # Try different elements that could contain the title
425
+ for title_tag in ['h1', 'h2', '.headline', '.title', 'title']:
426
+ if title:
427
+ break
428
+
429
+ if title_tag.startswith('.'):
430
+ elements = article_soup.select(title_tag)
431
+ else:
432
+ elements = article_soup.find_all(title_tag)
433
+
434
+ for element in elements:
435
+ candidate = element.text.strip()
436
+ if len(candidate) > 5 and len(candidate) < 200: # Reasonable title length
437
+ title = candidate
438
+ break
439
+
440
+ if not title:
441
+ print("Could not find a suitable title")
442
+ continue
443
+
444
+ # Check if title contains company name (case insensitive)
445
+ if company_name.lower() not in title.lower():
446
+ # Try alternative check - sometimes the title doesn't explicitly mention the company
447
+ meta_description = article_soup.find('meta', attrs={'name': 'description'}) or \
448
+ article_soup.find('meta', attrs={'property': 'og:description'})
449
+
450
+ if meta_description and 'content' in meta_description.attrs:
451
+ meta_text = meta_description['content']
452
+ if company_name.lower() not in meta_text.lower():
453
+ # One more check in the page content
454
+ page_text = article_soup.get_text().lower()
455
+ company_mentions = page_text.count(company_name.lower())
456
+ if company_mentions < 2: # Require at least 2 mentions
457
+ print(f"Article doesn't seem to be about {company_name}: {title}")
458
+ continue
459
+
460
+ # Extract article content - improved method
461
+ content = ""
462
+
463
+ # Try multiple content extraction strategies
464
+ content_containers = []
465
+
466
+ # 1. Look for article/main content containers
467
+ for container in ['article', 'main', '.article-body', '.story-body', '.story-content',
468
+ '.article-content', '.content-body', '.entry-content']:
469
+ if container.startswith('.'):
470
+ elements = article_soup.select(container)
471
+ else:
472
+ elements = article_soup.find_all(container)
473
+
474
+ content_containers.extend(elements)
475
+
476
+ # 2. If no specific containers, fallback to div with article-like classes
477
+ if not content_containers:
478
+ for div in article_soup.find_all('div', class_=True):
479
+ classes = div.get('class', [])
480
+ for cls in classes:
481
+ if any(term in cls.lower() for term in ['article', 'story', 'content', 'body', 'text']):
482
+ content_containers.append(div)
483
+ break
484
+
485
+ # 3. Extract paragraphs from containers
486
+ processed_paragraphs = set() # To avoid duplicates
487
+
488
+ for container in content_containers:
489
+ for p in container.find_all('p'):
490
+ p_text = p.text.strip()
491
+ # Avoid very short or duplicate paragraphs
492
+ if len(p_text) > 30 and p_text not in processed_paragraphs:
493
+ content += p_text + " "
494
+ processed_paragraphs.add(p_text)
495
+
496
+ # 4. If still no content, try all paragraphs
497
+ if not content:
498
+ for p in article_soup.find_all('p'):
499
+ p_text = p.text.strip()
500
+ if len(p_text) > 30 and p_text not in processed_paragraphs:
501
+ content += p_text + " "
502
+ processed_paragraphs.add(p_text)
503
+
504
+ content = content.strip()
505
+
506
+ # Skip if content is too short
507
+ if len(content) < 300: # Reduced from 500 to be less restrictive
508
+ print(f"Article content too short: {len(content)} characters")
509
+ continue
510
+
511
+ # Extract source name - more robust method
512
+ source = None
513
+
514
+ # Try to get from meta tags
515
+ meta_site_name = article_soup.find('meta', attrs={'property': 'og:site_name'})
516
+ if meta_site_name and 'content' in meta_site_name.attrs:
517
+ source = meta_site_name['content']
518
+ else:
519
+ # Extract from URL
520
+ try:
521
+ from urllib.parse import urlparse
522
+ parsed_url = urlparse(link)
523
+ source = parsed_url.netloc
524
+ except Exception:
525
+ source = response.url.split('/')[2]
526
+
527
+ # Extract date - improved method
528
+ date = ""
529
+
530
+ # Try multiple date extraction strategies
531
+ # 1. Look for time element
532
+ date_tag = article_soup.find('time')
+ if date_tag and date_tag.get('datetime'): date = date_tag['datetime']
533
+
534
+ # 2. Look for meta tags with date
535
+ if not date and (not date_tag or not date_tag.get('datetime')):
536
+ for meta_name in ['article:published_time', 'date', 'publish-date', 'article:modified_time']:
537
+ meta_date = article_soup.find('meta', attrs={'property': meta_name}) or \
538
+ article_soup.find('meta', attrs={'name': meta_name})
539
+
540
+ if meta_date and 'content' in meta_date.attrs:
541
+ date = meta_date['content']
542
+ break
543
+
544
+ # 3. Look for spans/divs with date-related classes
545
+ if not date:
546
+ date_classes = ['date', 'time', 'published', 'posted', 'datetime']
547
+ for cls in date_classes:
548
+ elements = article_soup.find_all(['span', 'div', 'p'], class_=lambda x: x and cls.lower() in x.lower())
549
+ if elements:
550
+ date = elements[0].text.strip()
551
+ break
552
+
553
+ # If we got this far, we have a valid article
554
+ print(f"Successfully extracted article: {title}")
555
+
556
+ # Create article object and add to list
557
+ article = NewsArticle(
558
+ title=title,
559
+ url=link,
560
+ content=content,
561
+ source=source,
562
+ date=date
563
+ )
564
+
565
+ # Check if similar article already exists to avoid duplicates
566
+ is_duplicate = False
567
+ for existing_article in articles:
568
+ if sentence_similarity(existing_article.title, title) > 0.7: # Lowered threshold
569
+ is_duplicate = True
570
+ print(f"Found duplicate article: {title}")
571
+ break
572
+
573
+ if not is_duplicate:
574
+ articles.append(article)
575
+ print(f"Added article: {title}")
576
+
577
+ except Exception as e:
578
+ print(f"Error processing article {link}: {str(e)}")
579
+ continue
580
+
581
+ except Exception as e:
582
+ print(f"Error searching {source_base} with query {query}: {str(e)}")
583
+ continue
584
+
585
+ # If we couldn't find any articles, create a placeholder article to prevent downstream errors
586
+ if not articles and num_articles > 0:
587
+ print(f"No articles found for {company_name}. Creating a dummy article to prevent errors.")
588
+
589
+ dummy_article = NewsArticle(
590
+ title=f"{company_name} Information",
591
+ url="#",
592
+ content=f"Information about {company_name} was not found or could not be retrieved. This is a placeholder.",
593
+ source="System",
594
+ date="",
595
+ sentiment="Neutral",
596
+ topics=["information", "company", "placeholder"]
597
+ )
598
+
599
+ articles.append(dummy_article)
600
+
601
+ # Return collected articles
602
+ print(f"Returning {len(articles)} articles for {company_name}")
603
+ return articles[:num_articles]
604
+
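# Usage sketch for the scraper above. Needs live network access, and the results
# depend entirely on what the listed sources return at query time; "Tesla" is just
# a placeholder company name.
def _example_search_news() -> None:
    for art in search_news("Tesla", num_articles=3):
        print(f"{art.source}: {art.title} [{art.sentiment}]")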
605
+ def analyze_article_sentiment(article: NewsArticle) -> Dict[str, Any]:
606
+ """Perform detailed sentiment analysis on an article."""
607
+ # Use VADER for paragraph-level sentiment
608
+ paragraphs = article.content.split('\n')
609
+ paragraph_sentiments = []
610
+
611
+ overall_scores = {
612
+ 'pos': 0,
613
+ 'neg': 0,
614
+ 'neu': 0,
615
+ 'compound': 0
616
+ }
617
+
618
+ for paragraph in paragraphs:
619
+ if len(paragraph.strip()) < 20: # Skip short paragraphs
620
+ continue
621
+
622
+ scores = vader_analyzer.polarity_scores(paragraph)
623
+ paragraph_sentiments.append({
624
+ 'text': paragraph[:100] + '...' if len(paragraph) > 100 else paragraph,
625
+ 'scores': scores
626
+ })
627
+
628
+ overall_scores['pos'] += scores['pos']
629
+ overall_scores['neg'] += scores['neg']
630
+ overall_scores['neu'] += scores['neu']
631
+ overall_scores['compound'] += scores['compound']
632
+
633
+ num_paragraphs = len(paragraph_sentiments)
634
+ if num_paragraphs > 0:
635
+ overall_scores['pos'] /= num_paragraphs
636
+ overall_scores['neg'] /= num_paragraphs
637
+ overall_scores['neu'] /= num_paragraphs
638
+ overall_scores['compound'] /= num_paragraphs
639
+
640
+ # Use advanced model for overall sentiment
641
+ try:
642
+ # Truncate content if too long
643
+ truncated_content = article.content[:512] if len(article.content) > 512 else article.content
644
+ advanced_result = advanced_sentiment(truncated_content)[0]
645
+ advanced_sentiment_label = advanced_result['label']
646
+ advanced_confidence = advanced_result['score']
647
+ except Exception as e:
648
+ print(f"Error with advanced sentiment analysis: {str(e)}")
649
+ advanced_sentiment_label = "Error"
650
+ advanced_confidence = 0.0
651
+
652
+ # Determine final sentiment
653
+ if overall_scores['compound'] >= 0.05:
654
+ final_sentiment = "Positive"
655
+ elif overall_scores['compound'] <= -0.05:
656
+ final_sentiment = "Negative"
657
+ else:
658
+ final_sentiment = "Neutral"
659
+
660
+ return {
661
+ 'article_title': article.title,
662
+ 'overall_sentiment': final_sentiment,
663
+ 'vader_scores': overall_scores,
664
+ 'advanced_sentiment': {
665
+ 'label': advanced_sentiment_label,
666
+ 'confidence': advanced_confidence
667
+ },
668
+ 'paragraph_analysis': paragraph_sentiments,
669
+ 'positive_ratio': overall_scores['pos'],
670
+ 'negative_ratio': overall_scores['neg'],
671
+ 'neutral_ratio': overall_scores['neu']
672
+ }
673
+
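# Sketch of the per-article breakdown above, on a hand-written two-paragraph
# article (no scraping involved). Building a NewsArticle also runs the
# transformer sentiment pipeline loaded at import, so the first call is slow.
def _example_analyze_article_sentiment() -> Dict[str, Any]:
    art = NewsArticle(
        title="Tesla posts strong quarterly results",
        url="#",
        content="Tesla beat expectations this quarter.\nInvestors welcomed the strong numbers.",
    )
    return analyze_article_sentiment(art)  # keys include 'overall_sentiment' and 'vader_scores'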
674
+ def perform_comparative_analysis(articles: List[NewsArticle]) -> Dict[str, Any]:
675
+ """Perform comparative analysis across multiple articles."""
676
+ # Sentiment distribution with expanded categories
677
+ sentiment_counts = {
678
+ "Positive": 0,
679
+ "Slightly Positive": 0,
680
+ "Neutral": 0,
681
+ "Slightly Negative": 0,
682
+ "Negative": 0
683
+ }
684
+
685
+ for article in articles:
686
+ if article.sentiment in sentiment_counts:
687
+ sentiment_counts[article.sentiment] += 1
688
+ else:
689
+ # Fallback for any unexpected sentiment values
690
+ sentiment_counts["Neutral"] += 1
691
+
692
+ # Topic analysis
693
+ all_topics = []
694
+ for article in articles:
695
+ all_topics.extend(article.topics)
696
+
697
+ topic_counts = Counter(all_topics)
698
+ common_topics = [topic for topic, count in topic_counts.most_common(10)]
699
+
700
+ # Identify unique topics per article
701
+ unique_topics_by_article = {}
702
+ for i, article in enumerate(articles):
703
+ other_articles_topics = []
704
+ for j, other_article in enumerate(articles):
705
+ if i != j:
706
+ other_articles_topics.extend(other_article.topics)
707
+
708
+ unique_topics = [topic for topic in article.topics if topic not in other_articles_topics]
709
+ unique_topics_by_article[i] = unique_topics
710
+
711
+ # Generate comparisons
712
+ comparisons = []
713
+
714
+ # If we have more than one article, generate meaningful comparisons
715
+ if len(articles) > 1:
716
+ for i in range(len(articles) - 1):
717
+ for j in range(i + 1, len(articles)):
718
+ article1 = articles[i]
719
+ article2 = articles[j]
720
+
721
+ # Compare sentiments - more nuanced now with new categories
722
+ if article1.sentiment != article2.sentiment:
723
+ # Group sentiments for better comparison
724
+ sent1_group = get_sentiment_group(article1.sentiment)
725
+ sent2_group = get_sentiment_group(article2.sentiment)
726
+
727
+ if sent1_group != sent2_group:
728
+ comparison = {
729
+ "Articles": [article1.title, article2.title],
730
+ "Comparison": f"'{article1.title}' presents a {sent1_group.lower()} view ({article1.sentiment}), while '{article2.title}' has a {sent2_group.lower()} view ({article2.sentiment}).",
731
+ "Impact": "This difference in sentiment highlights varying perspectives on the company's situation."
732
+ }
733
+ comparisons.append(comparison)
734
+ else:
735
+ # Even if in same group, note the difference if one is stronger
736
+ if ("Slightly" in article1.sentiment and "Slightly" not in article2.sentiment) or \
737
+ ("Slightly" in article2.sentiment and "Slightly" not in article1.sentiment):
738
+ stronger = article1 if "Slightly" not in article1.sentiment else article2
739
+ weaker = article2 if stronger == article1 else article1
740
+
741
+ comparison = {
742
+ "Articles": [stronger.title, weaker.title],
743
+ "Comparison": f"'{stronger.title}' expresses a stronger {sent1_group.lower()} sentiment ({stronger.sentiment}) than '{weaker.title}' ({weaker.sentiment}).",
744
+ "Impact": "The difference in intensity suggests varying degrees of confidence about the company."
745
+ }
746
+ comparisons.append(comparison)
747
+
748
+ # Compare topics
749
+ common_topics_between_two = set(article1.topics).intersection(set(article2.topics))
750
+ if common_topics_between_two:
751
+ comparison = {
752
+ "Articles": [article1.title, article2.title],
753
+ "Comparison": f"Both articles discuss {', '.join(common_topics_between_two)}.",
754
+ "Impact": "The common topics indicate key areas of focus around the company."
755
+ }
756
+ comparisons.append(comparison)
757
+
758
+ # Compare unique topics
759
+ unique_to_article1 = set(article1.topics) - set(article2.topics)
760
+ unique_to_article2 = set(article2.topics) - set(article1.topics)
761
+
762
+ if unique_to_article1 and unique_to_article2:
763
+ comparison = {
764
+ "Articles": [article1.title, article2.title],
765
+ "Comparison": f"'{article1.title}' uniquely covers {', '.join(unique_to_article1)}, while '{article2.title}' focuses on {', '.join(unique_to_article2)}.",
766
+ "Impact": "Different sources emphasize varying aspects of the company, offering a broader perspective."
767
+ }
768
+ comparisons.append(comparison)
769
+ else:
770
+ # If we only have one article, create a dummy comparison
771
+ if articles:
772
+ article = articles[0]
773
+ topics_str = ", ".join(article.topics[:3]) if article.topics else "no specific topics"
774
+ sentiment_group = get_sentiment_group(article.sentiment)
775
+
776
+ comparisons = [
777
+ {
778
+ "Comparison": f"Only found one article: '{article.title}' with a {article.sentiment.lower()} sentiment ({sentiment_group} overall).",
779
+ "Impact": f"Limited coverage focused on {topics_str}. More articles would provide a more balanced view."
780
+ },
781
+ {
782
+ "Comparison": f"The article discusses {topics_str} in relation to {article.source}.",
783
+ "Impact": "Single source reporting limits perspective. Consider searching for additional sources."
784
+ }
785
+ ]
786
+
787
+ # Generate overall sentiment analysis
788
+ # Combine slightly positive with positive and slightly negative with negative for summary
789
+ pos_count = sentiment_counts["Positive"] + sentiment_counts["Slightly Positive"]
790
+ neg_count = sentiment_counts["Negative"] + sentiment_counts["Slightly Negative"]
791
+ neu_count = sentiment_counts["Neutral"]
792
+ total = pos_count + neg_count + neu_count
793
+
794
+ # For display, we'll keep detailed counts but summarize the analysis text
795
+ if total == 0:
796
+ final_analysis = "No sentiment data available."
797
+ else:
798
+ pos_ratio = pos_count / total
799
+ neg_ratio = neg_count / total
800
+
801
+ # Show more details on the sentiment breakdown
802
+ sentiment_detail = []
803
+ if sentiment_counts["Positive"] > 0:
804
+ sentiment_detail.append(f"{sentiment_counts['Positive']} strongly positive")
805
+ if sentiment_counts["Slightly Positive"] > 0:
806
+ sentiment_detail.append(f"{sentiment_counts['Slightly Positive']} slightly positive")
807
+ if sentiment_counts["Neutral"] > 0:
808
+ sentiment_detail.append(f"{sentiment_counts['Neutral']} neutral")
809
+ if sentiment_counts["Slightly Negative"] > 0:
810
+ sentiment_detail.append(f"{sentiment_counts['Slightly Negative']} slightly negative")
811
+ if sentiment_counts["Negative"] > 0:
812
+ sentiment_detail.append(f"{sentiment_counts['Negative']} strongly negative")
813
+
814
+ sentiment_breakdown = ", ".join(sentiment_detail)
815
+
816
+ if pos_ratio > 0.6:
817
+ final_analysis = f"The company has primarily positive coverage ({pos_count}/{total} articles positive: {sentiment_breakdown}). This suggests a favorable market perception."
818
+ elif neg_ratio > 0.6:
819
+ final_analysis = f"The company has primarily negative coverage ({neg_count}/{total} articles negative: {sentiment_breakdown}). This could indicate challenges or controversies."
820
+ elif pos_ratio > neg_ratio:
821
+ final_analysis = f"The company has mixed coverage with a positive lean ({sentiment_breakdown})."
822
+ elif neg_ratio > pos_ratio:
823
+ final_analysis = f"The company has mixed coverage with a negative lean ({sentiment_breakdown})."
824
+ else:
825
+ final_analysis = f"The company has balanced coverage ({sentiment_breakdown})."
826
+
827
+ # If we only have the dummy article, customize the final analysis
828
+ if len(articles) == 1 and articles[0].url == "#":
829
+ final_analysis = "Limited news data available. The analysis is based on a placeholder article."
830
+
831
+ return {
832
+ "Sentiment Distribution": sentiment_counts,
833
+ "Common Topics": common_topics,
834
+ "Topic Overlap": {
835
+ "Common Topics Across All": common_topics[:5],
836
+ "Unique Topics By Article": unique_topics_by_article
837
+ },
838
+ "Coverage Differences": comparisons[:10], # Limit to top 10 comparisons
839
+ "Final Sentiment Analysis": final_analysis
840
+ }
841
+
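# Sketch of how the comparative report above is consumed; `articles` is any list
# of NewsArticle objects already in memory (for example from search_news).
def _example_comparative_analysis(articles: List[NewsArticle]) -> None:
    report = perform_comparative_analysis(articles)
    print(report["Sentiment Distribution"])
    print(report["Final Sentiment Analysis"])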
842
+ def get_sentiment_group(sentiment: str) -> str:
843
+ """Group sentiments into broader categories for comparison."""
844
+ if sentiment in ["Positive", "Slightly Positive"]:
845
+ return "Positive"
846
+ elif sentiment in ["Negative", "Slightly Negative"]:
847
+ return "Negative"
848
+ else:
849
+ return "Neutral"
850
+
851
+ def translate_to_hindi(text: str) -> str:
852
+ """Translate text to Hindi using deep_translator."""
853
+ try:
854
+ # Split text into chunks if too long (Google Translator has a limit)
855
+ max_chunk_size = 4500 # deep_translator's GoogleTranslator has a limit of 5000 chars
856
+ chunks = [text[i:i+max_chunk_size] for i in range(0, len(text), max_chunk_size)]
857
+
858
+ translated_chunks = []
859
+ for chunk in chunks:
860
+ # Translate the chunk
861
+ translated = translator.translate(chunk)
862
+ translated_chunks.append(translated)
863
+ time.sleep(0.5) # Short delay to avoid rate limiting
864
+
865
+ return ''.join(translated_chunks)
866
+ except Exception as e:
867
+ print(f"Translation error: {str(e)}")
868
+ # Fallback to simple placeholder for Hindi text if translation fails
869
+ return "अनुवाद त्रुटि हुई।" # "Translation error occurred" in Hindi
870
+
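# Usage sketch for the chunked translation above. Requires network access to the
# Google endpoint used by deep_translator; the 0.5 s sleep between chunks only
# matters for texts longer than 4500 characters.
def _example_translate() -> str:
    return translate_to_hindi("The company reported strong quarterly results.")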
871
+ def text_to_speech(text: str, output_file: str = 'output.mp3') -> str:
872
+ """Convert text to speech in Hindi."""
873
+ try:
874
+ # Ensure output directory exists
875
+ output_dir = os.path.dirname(output_file)
876
+ if output_dir:
877
+ os.makedirs(output_dir, exist_ok=True)
878
+ print(f"Ensuring output directory exists: {output_dir}")
879
+
880
+ # If text is too short, add some padding to avoid TTS errors
881
+ if len(text.strip()) < 5:
882
+ text = text + " " + "नमस्कार" * 3 # Add some padding text
883
+ print("Text was too short, adding padding")
884
+
885
+ print(f"Attempting to generate TTS for text of length {len(text)} characters")
886
+
887
+ # For long texts, split into chunks for better TTS quality
888
+ if len(text) > 3000:
889
+ print("Text is long, splitting into chunks for better TTS quality")
890
+
891
+ # Split at sentence boundaries
892
+ sentences = re.split(r'(।|\.|\?|\!)', text)
893
+ chunks = []
894
+ current_chunk = ""
895
+
896
+ # Combine sentences into chunks of appropriate size
897
+ for i in range(0, len(sentences), 2):
898
+ if i+1 < len(sentences): # Make sure we have the punctuation part
899
+ sentence = sentences[i] + sentences[i+1]
900
+ else:
901
+ sentence = sentences[i]
902
+
903
+ if len(current_chunk) + len(sentence) < 3000:
904
+ current_chunk += sentence
905
+ else:
906
+ if current_chunk:
907
+ chunks.append(current_chunk)
908
+ current_chunk = sentence
909
+
910
+ if current_chunk: # Add the last chunk
911
+ chunks.append(current_chunk)
912
+
913
+ print(f"Split text into {len(chunks)} chunks for TTS processing")
914
+
915
+ # Process each chunk and combine into one audio file
916
+ temp_files = []
917
+ for i, chunk in enumerate(chunks):
918
+ temp_output = f"{output_file}.part{i}.mp3"
919
+ try:
920
+ # Try gTTS for each chunk
921
+ tts = gTTS(text=chunk, lang='hi', slow=False)
922
+ tts.save(temp_output)
923
+ if os.path.exists(temp_output) and os.path.getsize(temp_output) > 0:
924
+ temp_files.append(temp_output)
925
+ else:
926
+ print(f"Failed to create chunk {i} with gTTS")
927
+ raise Exception(f"gTTS failed for chunk {i}")
928
+ except Exception as e:
929
+ print(f"Error with gTTS for chunk {i}: {str(e)}")
930
+ break
931
+
932
+ # If we have temp files, combine them
933
+ if temp_files:
934
+ try:
935
+ # Use pydub to concatenate audio files
936
+ from pydub import AudioSegment
937
+ combined = AudioSegment.empty()
938
+ for temp_file in temp_files:
939
+ audio = AudioSegment.from_mp3(temp_file)
940
+ combined += audio
941
+
942
+ combined.export(output_file, format="mp3")
943
+
944
+ # Clean up temp files
945
+ for temp_file in temp_files:
946
+ try:
947
+ os.remove(temp_file)
948
+ except OSError:
949
+ pass
950
+
951
+ print(f"Successfully combined {len(temp_files)} audio chunks into {output_file}")
952
+ return output_file
953
+ except Exception as e:
954
+ print(f"Error combining audio files: {str(e)}")
955
+ # Try to return the first chunk at least
956
+ if os.path.exists(temp_files[0]):
957
+ import shutil
958
+ shutil.copy(temp_files[0], output_file)
959
+ print(f"Returning first chunk as fallback: {output_file}")
960
+ return output_file
961
+
962
+ # Method 1: Use gTTS for Hindi text-to-speech (for shorter texts or if chunking failed)
963
+ try:
964
+ print("Trying to use gTTS...")
965
+ tts = gTTS(text=text, lang='hi', slow=False)
966
+ tts.save(output_file)
967
+
968
+ # Verify the file was created and is not empty
969
+ if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
970
+ print(f"Successfully created audio file with gTTS: {output_file} (size: {os.path.getsize(output_file)} bytes)")
971
+ return output_file
972
+ else:
973
+ print(f"gTTS created a file but it may be empty or invalid: {output_file}")
974
+ raise Exception("Generated audio file is empty or invalid")
975
+
976
+ except Exception as e:
977
+ print(f"gTTS error: {str(e)}")
978
+
979
+ # Method 2: Fallback to pyttsx3
980
+ try:
981
+ print("Falling back to pyttsx3...")
982
+ engine = pyttsx3.init()
983
+ # Try to find a Hindi voice, or use default
984
+ voices = engine.getProperty('voices')
985
+ found_hindi_voice = False
986
+
987
+ for voice in voices:
988
+ print(f"Checking voice: {voice.name}")
989
+ if 'hindi' in voice.name.lower():
990
+ print(f"Found Hindi voice: {voice.name}")
991
+ engine.setProperty('voice', voice.id)
992
+ found_hindi_voice = True
993
+ break
994
+
995
+ if not found_hindi_voice:
996
+ print("No Hindi voice found, using default voice")
997
+
998
+ engine.save_to_file(text, output_file)
999
+ engine.runAndWait()
1000
+
1001
+ # Verify the file was created and is not empty
1002
+ if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
1003
+ print(f"Successfully created audio file with pyttsx3: {output_file} (size: {os.path.getsize(output_file)} bytes)")
1004
+ return output_file
1005
+ else:
1006
+ print(f"pyttsx3 created a file but it may be empty or invalid: {output_file}")
1007
+ raise Exception("Generated audio file is empty or invalid")
1008
+
1009
+ except Exception as e2:
1010
+ print(f"pyttsx3 error: {str(e2)}")
1011
+
1012
+ # If all TTS methods fail, create a simple notification sound as fallback
1013
+ try:
1014
+ print("Both TTS methods failed. Creating a simple audio notification instead.")
1015
+ # Generate a simple beep sound as a fallback (1 second, 440Hz)
1016
+ import numpy as np
1017
+ from scipy.io import wavfile
1018
+
1019
+ sample_rate = 44100
1020
+ duration = 1 # seconds
1021
+ t = np.linspace(0, duration, int(sample_rate * duration))
1022
+
1023
+ # Generate a simple tone
1024
+ frequency = 440 # Hz (A4 note)
1025
+ data = np.sin(2 * np.pi * frequency * t) * 32767
1026
+ data = data.astype(np.int16)
1027
+
1028
+ # Convert output_file from mp3 to wav
1029
+ wav_output_file = output_file.replace('.mp3', '.wav')
1030
+ wavfile.write(wav_output_file, sample_rate, data)
1031
+
1032
+ print(f"Created simple audio notification: {wav_output_file}")
1033
+ return wav_output_file
1034
+
1035
+ except Exception as e3:
1036
+ print(f"Failed to create fallback audio: {str(e3)}")
1037
+ return ""
1038
+
1039
+ return ""
1040
+ except Exception as e:
1041
+ print(f"TTS error: {str(e)}")
1042
+ return ""
1043
+
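# Usage sketch for the TTS helper above. gTTS needs network access; pyttsx3 is the
# offline fallback tried next, and a plain tone is the last resort. The output
# path below is illustrative; the directory is created if it does not exist.
def _example_tts() -> str:
    hindi_text = "कंपनी के बारे में समाचार"  # "news about the company" in Hindi
    return text_to_speech(hindi_text, output_file="audio_files/example.mp3")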
1044
+ def prepare_final_report(company_name: str, articles: List[NewsArticle],
1045
+ comparative_analysis: Dict[str, Any]) -> Dict[str, Any]:
1046
+ """Prepare final report in the required format."""
1047
+ article_data = []
1048
+
1049
+ for article in articles:
1050
+ article_data.append({
1051
+ "Title": article.title,
1052
+ "Summary": article.summary,
1053
+ "Sentiment": article.sentiment,
1054
+ "Topics": article.topics
1055
+ })
1056
+
1057
+ # Prepare a more detailed summary for TTS with actual content from articles
1058
+ summary_text = f"{company_name} के बारे में समाचार विश्लेषण। "
1059
+
1060
+ # Add information about the number of articles found
1061
+ summary_text += f"कुल {len(articles)} लेख मिले। "
1062
+
1063
+ # Add sentiment distribution
1064
+ sentiment_counts = comparative_analysis["Sentiment Distribution"]
1065
+ pos_count = sentiment_counts["Positive"] + sentiment_counts["Slightly Positive"]
1066
+ neg_count = sentiment_counts["Negative"] + sentiment_counts["Slightly Negative"]
1067
+ neu_count = sentiment_counts["Neutral"]
1068
+
1069
+ if pos_count > 0 or neg_count > 0 or neu_count > 0:
1070
+ sentiment_detail = []
1071
+ if sentiment_counts["Positive"] > 0:
1072
+ sentiment_detail.append(f"{sentiment_counts['Positive']} पूर्ण सकारात्मक")
1073
+ if sentiment_counts["Slightly Positive"] > 0:
1074
+ sentiment_detail.append(f"{sentiment_counts['Slightly Positive']} हल्का सकारात्मक")
1075
+ if sentiment_counts["Neutral"] > 0:
1076
+ sentiment_detail.append(f"{sentiment_counts['Neutral']} तटस्थ")
1077
+ if sentiment_counts["Slightly Negative"] > 0:
1078
+ sentiment_detail.append(f"{sentiment_counts['Slightly Negative']} हल्का नकारात्मक")
1079
+ if sentiment_counts["Negative"] > 0:
1080
+ sentiment_detail.append(f"{sentiment_counts['Negative']} पूर्ण नकारात्मक")
1081
+
1082
+ summary_text += f"भावना विश्लेषण: {', '.join(sentiment_detail)}। "
1083
+
1084
+ # Add common topics with more detail
1085
+ common_topics = comparative_analysis["Common Topics"][:5]
1086
+ if common_topics:
1087
+ summary_text += f"मुख्य विषय हैं: {', '.join(common_topics)}। "
1088
+
1089
+ # Add more context about the common topics
1090
+ summary_text += "इन विषयों के बारे में लेखों में यह कहा गया है: "
1091
+
1092
+ # Find sentences related to common topics in the articles
1093
+ topic_sentences = []
1094
+ for topic in common_topics[:3]: # Focus on top 3 topics
1095
+ found = False
1096
+ for article in articles:
1097
+ if topic in article.content.lower():
1098
+ # Find sentences containing this topic
1099
+ sentences = sent_tokenize(article.content)
1100
+ for sentence in sentences:
1101
+ if topic in sentence.lower() and len(sentence) < 150:
1102
+ topic_sentences.append(f"{topic} के बारे में: {sentence}")
1103
+ found = True
1104
+ break
1105
+ if found:
1106
+ break
1107
+
1108
+ if topic_sentences:
1109
+ summary_text += " ".join(topic_sentences[:3]) + " "
1110
+
1111
+ # Add article summaries
1112
+ summary_text += "लेखों का सारांश: "
1113
+ for i, article in enumerate(articles[:3]): # Include up to 3 articles
1114
+ summary_text += f"लेख {i+1}: {article.title}. {article.summary[:200]}... "
1115
+
1116
+ # Add sentiment for this specific article
1117
+ summary_text += f"इस लेख की भावना: {article.sentiment}. "
1118
+
1119
+ # Add final sentiment analysis
1120
+ summary_text += comparative_analysis["Final Sentiment Analysis"]
1121
+
1122
+ # Translate the detailed summary to Hindi
1123
+ hindi_summary = translate_to_hindi(summary_text)
1124
+
1125
+ # Format the response according to the required format
1126
+ return {
1127
+ "Company": company_name,
1128
+ "Articles": article_data,
1129
+ "Comparative Sentiment Score": comparative_analysis,
1130
+ "Final Sentiment Analysis": comparative_analysis["Final Sentiment Analysis"],
1131
+ "Hindi Summary": hindi_summary
1132
+ }
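# End-to-end sketch showing one plausible way the helpers in this module combine:
# scrape, compare, build the report, then voice the Hindi summary. Everything here
# needs network access, and the company name is only an example.
def _example_full_pipeline(company_name: str = "Tesla") -> Tuple[Dict[str, Any], str]:
    articles = search_news(company_name, num_articles=5)
    analysis = perform_comparative_analysis(articles)
    report = prepare_final_report(company_name, articles, analysis)
    audio_path = text_to_speech(report["Hindi Summary"], output_file="audio_files/summary.mp3")
    return report, audio_path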