giuseppericcio committed
Commit 3b07d0f · 1 Parent(s): 0f34d5f

Publish app

app.py ADDED
@@ -0,0 +1,1294 @@
1
+ import re
2
+ import os
3
+ import faiss
4
+ import whisper
5
+ import ffmpeg
6
+ import tempfile
7
+ import requests
8
+ import numpy as np
9
+ import pandas as pd
10
+ import streamlit as st
11
+
12
+ from openai import OpenAI
13
+ from transformers import pipeline
14
+ from sentence_transformers import SentenceTransformer
15
+ from newsplease import NewsPlease
16
+ from streamlit_echarts import st_echarts
17
+ from streamlit_option_menu import option_menu
18
+
19
+ # NEWS to check
20
+ # https://fbe.unimelb.edu.au/newsroom/fake-news-in-the-age-of-covid-19 True Claim
21
+ # https://newssalutebenessere.altervista.org/covid-19-just-a-simple-flue-or-something-else/ False Claim
22
+
23
+ ###### CONFIGURATIONS ######
24
+ # Debug mode
25
+ debug = False
26
+
27
+ # File paths
28
+ embeddings_file = r"./data/abstract_embeddings.npy"
29
+ pmid_file = r"./data/pmids.npy"
30
+ faiss_index_file = r"./data/faiss_index.index"
31
+ file_path = r'./data/parte_205.csv'
32
+
33
+ # Initialize OpenAI API client
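+ # (The client targets NVIDIA's OpenAI-compatible endpoint; the API key is read from Streamlit secrets as st.secrets.nvidia.)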
34
+ client = OpenAI(
35
+ base_url="https://integrate.api.nvidia.com/v1",
36
+ api_key=st.secrets.nvidia
37
+ )
38
+
39
+ # Load data
40
+ data = pd.read_csv(file_path)
41
+
42
+ # Load the model
43
+ model = SentenceTransformer('all-MiniLM-L6-v2')
44
+
45
+
46
+ def get_article_data(url):
47
+ """
48
+ Extracts article data from a specified URL.
49
+
50
+ Args:
51
+ url (str): URL of the article to analyze.
52
+
53
+ Returns:
54
+ dict: Structured article data, including title, authors, publication date, and content.
55
+ """
56
+ try:
57
+ # Make an HTTP request to the specified URL
58
+ response = requests.get(url)
59
+ # Check if the request was successful (i.e., status code 200)
60
+ response.raise_for_status()
61
+
62
+ # Extract the HTML content from the response
63
+ html_content = response.text
64
+
65
+ # Use NewsPlease to extract structured data from the HTML content
66
+ article = NewsPlease.from_html(html_content, url=url)
67
+
68
+ # Return the structured article data
69
+ return {
70
+ "title": article.title,
71
+ "authors": article.authors,
72
+ "date_publish": article.date_publish,
73
+ "content": article.maintext,
74
+ }
75
+
76
+ except requests.exceptions.RequestException as e:
77
+ return {"error": f"Error during URL retrieval: {e}"}
78
+
79
+ except Exception as e:
80
+ return {"error": f"Error processing the article: {e}"}
81
+
82
+
83
+ def extract_and_split_claims(claims):
84
+ """
85
+ Extracts and splits claims from a given string.
86
+
87
+ Args:
88
+ claims (str): String containing claims.
89
+
90
+ Returns:
91
+ dict: Dictionary containing the extracted claims.
92
+ """
93
+ start_index = claims.find("Claim 1:")
94
+ if start_index != -1:
95
+ claims = claims[start_index:]
96
+
97
+ claim_lines = claims.strip().split("\n\n")
98
+
99
+ claims_dict = {}
100
+ for i, claim in enumerate(claim_lines, start=1):
101
+ claims_dict[f"Claim_{i}"] = claim
102
+
103
+ for var_name, claim_text in claims_dict.items():
104
+ globals()[var_name] = claim_text
105
+
106
+ return claims_dict
107
+
108
+
109
+ def extract_label_and_score(result):
110
+ """
111
+ Extracts the predicted label and score from the result string.
112
+
113
+ Args:
114
+ result (str): String containing the prediction result.
115
+
116
+ Returns:
117
+ tuple: Predicted label and score.
118
+ """
119
+ # Extract the predicted label
120
+ label_match = re.search(r"'labels': \['(.*?)'", result)
121
+ predicted_label = label_match.group(1) if label_match else None
122
+
123
+ # Extract the score
124
+ score_match = re.search(r"'scores': \[(\d+\.\d+)", result)
125
+ score_label = float(score_match.group(1)) if score_match else None
126
+
127
+ return predicted_label, score_label
128
+
129
+
130
+ def clean_phrases(phrases, pattern):
131
+ """
132
+ Clean and extract phrases from a list of strings using a specified pattern.
133
+
134
+ Args:
135
+ phrases (list): List of strings containing phrases.
136
+ pattern (str): Regular expression pattern to extract phrases.
137
+
138
+ Returns:
139
+ list: List of cleaned phrases as dictionaries with text and abstract keys
140
+ """
141
+ cleaned_phrases = []
142
+
143
+ for phrase in phrases:
144
+ matches = re.findall(pattern, phrase)
145
+ cleaned_phrases.extend([{"text": match[0], "abstract": f"abstract_{match[1]}"} for match in matches])
146
+
147
+ return cleaned_phrases
148
+
149
+
150
+ def highlight_phrases(abstract_text, phrases, color, label):
151
+ """
152
+ Highlight phrases in the abstract text with the specified background color.
153
+
154
+ Args:
155
+ abstract_text (str): Text of the abstract to highlight.
156
+ phrases (list): List of phrases to highlight.
157
+ color (str): Background color to use for highlighting.
158
+ label (str): Predicted label for the claim.
159
+
160
+ Returns:
161
+ str: Abstract text with highlighted phrases.
162
+ """
163
+ # Switch colors if the label is "False"
164
+ if label.lower() == "false":
165
+ color = "lightgreen" if color == "red" else color
166
+
167
+ # Highlight each phrase in the abstract text
168
+ for phrase in phrases:
169
+ abstract_text = re.sub(
170
+ re.escape(phrase["text"]),
171
+ f'<span style="background-color: {color}; font-weight: bold; border: 1px solid black; border-radius: 5px;">{phrase["text"]}</span>',
172
+ abstract_text,
173
+ flags=re.IGNORECASE
174
+ )
175
+
176
+ return abstract_text
177
+
178
+
179
+ def parse_response(response):
180
+ """
181
+ Parse the response from the model and extract the fields.
182
+
183
+ Args:
184
+ response (str): Response string from the model.
185
+
186
+ Returns:
187
+ tuple: Extracted fields from the response.
188
+ """
189
+ # Initial values for the fields
190
+ first_label = "Not found"
191
+ justification = "Not found"
192
+ supporting = "Not found"
193
+ refusing = "Not found"
194
+ notes = "Not found"
195
+
196
+ # Regular expression patterns for extracting fields
197
+ patterns = {
198
+ "first_label": r"Label:\s*(.*?)\n",
199
+ "justification": r"Justification:\s*(.*?)(?=\nSupporting sentences)",
200
+ "supporting": r"Supporting sentences from abstracts:\n(.*?)(?=\nRefusing sentences)",
201
+ "refusing": r"Refusing sentences from abstracts:\n(.*?)(?=\nNote:)",
202
+ "notes": r"Note:\s*(.*)"
203
+ }
204
+
205
+ # Extract the fields using regular expressions
206
+ if match := re.search(patterns["first_label"], response, re.DOTALL):
207
+ first_label = match.group(1).strip()
208
+ if match := re.search(patterns["justification"], response, re.DOTALL):
209
+ justification = match.group(1).strip()
210
+ if match := re.search(patterns["supporting"], response, re.DOTALL):
211
+ supporting = [{"text": sentence.strip(), "abstract": f"abstract_{i+1}"} for i, sentence in enumerate(match.group(1).strip().split('\n'))]
212
+ if match := re.search(patterns["refusing"], response, re.DOTALL):
213
+ refusing = [{"text": sentence.strip(), "abstract": f"abstract_{i+1}"} for i, sentence in enumerate(match.group(1).strip().split('\n'))]
214
+ if match := re.search(patterns["notes"], response, re.DOTALL):
215
+ notes = match.group(1).strip()
216
+
217
+ # Return the extracted fields
218
+ return first_label, justification, supporting, refusing, notes
219
+
220
+
221
+ def load_embeddings(embeddings_file, pmid_file, faiss_index_file, debug=False):
222
+ """
223
+ Load embeddings, PMIDs, and FAISS index from the specified files.
224
+
225
+ Args:
226
+ embeddings_file (str): File path for the embeddings.
227
+ pmid_file (str): File path for the PMIDs.
228
+ faiss_index_file (str): File path for the FAISS index.
229
+
230
+ Returns:
231
+ tuple: Tuple containing the embeddings, PMIDs, and FAISS index.
232
+ """
233
+ # Check if the files exist
234
+ if not (os.path.exists(embeddings_file) and os.path.exists(pmid_file) and os.path.exists(faiss_index_file)):
235
+ raise FileNotFoundError("One or more files not found. Please check the file paths.")
236
+
237
+ # Load the embeddings and PMIDs
238
+ embeddings = np.load(embeddings_file)
239
+ pmids = np.load(pmid_file, allow_pickle=True)
240
+
241
+ # Load the FAISS index
242
+ index = faiss.read_index(faiss_index_file)
243
+
244
+ if debug:
245
+ print("Embeddings, PMIDs, and FAISS index loaded successfully.")
246
+
247
+ return embeddings, pmids, index
248
+
249
+
250
+ def retrieve_top_abstracts(claim, model, index, pmids, data, top_k=5):
251
+ """
252
+ Retrieve the top abstracts from the FAISS index for a given claim.
253
+
254
+ Args:
255
+ claim (str): Claim to fact-check.
256
+ model (SentenceTransformer): Sentence transformer model for encoding text.
257
+ index (faiss.IndexFlatIP): FAISS index for similarity search.
258
+ pmids (np.ndarray): Array of PMIDs for the abstracts.
259
+ data (pd.DataFrame): DataFrame containing the abstract data.
260
+ top_k (int): Number of top abstracts to retrieve.
261
+
262
+ Returns:
263
+ list: List of tuples containing the abstract text, PMID, and distance.
264
+ """
265
+ # Encode the claim using the SentenceTransformer model
266
+ claim_embedding = model.encode([claim])
267
+ faiss.normalize_L2(claim_embedding) # Normalize the claim embedding (with L2 norm)
268
+ distances, indices = index.search(claim_embedding, top_k)
269
+
270
+ # Retrieve the top abstracts based on the indices
271
+ results = []
272
+ for j, i in enumerate(indices[0]):
273
+ pmid = pmids[i]
274
+ abstract_text = data[data['PMID'] == pmid]['AbstractText'].values[0]
275
+ distance = distances[0][j]
276
+ results.append((abstract_text, pmid, distance))
277
+
278
+ return results
279
+
280
+
281
+ def generate_justification(query, justification):
282
+ """
283
+ Generate a justification for the claim using the Zero-Shot Classification model.
284
+
285
+ Args:
286
+ query (str): Claim to fact-check.
287
+ justification (str): Justification for the claim.
288
+
289
+ Returns:
290
+ str: Final justification for the claim.
291
+ """
292
+ # Define the classes for the Zero-Shot Classification model
293
+ Class = ["True", "False", "NEI"]
294
+
295
+ # Generate the justification text
296
+ justification_text = (
297
+ f'Justification: "{justification}"'
298
+ )
299
+
300
+ # Limit the justification text to a maximum length
301
+ max_length = 512
302
+ if len(justification_text) > max_length:
303
+ justification_text = justification_text[:max_length]
304
+
305
+ # Generate the final justification using the Zero-Shot Classification model
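+ # (zeroshot_classifier is the zero-shot pipeline created in the page code further below, before this function is first called)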
306
+ output = zeroshot_classifier(
307
+ query,
308
+ Class,
309
+ hypothesis_template=f"The claim is '{{}}' for: {justification_text}",
310
+ multi_label=False
311
+ )
312
+
313
+ # Prepare the final justification text
314
+ final_justification = f'{output}.'
315
+
316
+ return final_justification
317
+
318
+
319
+ def llm_reasoning_template(query):
320
+ """
321
+ Generate a template for the prompt used for justification generation by the LLM model.
322
+
323
+ Args:
324
+ query (str): Claim to fact-check.
325
+
326
+ Returns:
327
+ str: Reasoning template for the claim.
328
+ """
329
+ llm_reasoning_prompt = f"""<<SYS>> [INST]
330
+
331
+ You are a helpful, respectful and honest Doctor. Always answer as helpfully as possible using the context text provided.
332
+
333
+ Use the information in Context.
334
+
335
+ Elaborate on the Context to generate new information.
336
+
337
+ Use only the knowledge in Context to answer.
338
+
339
+ Answer in a scientific, descriptive way. Be formal in the answer. Use the third person.
340
+
341
+ Answer without mentioning the Context. Use it but don't refer to it in the text.
342
+
343
+ To answer, use a maximum of 300 words.
344
+
345
+ Create a Justification from the sentences given.
346
+
347
+ Use the structure: Justification: The claim is (label) because... (don't use the word "context")
348
+
349
+ Write as an online doctor to create the Justification.
350
+
351
+ Afterwards, give some sentences from the Context (from scientific papers) that support the label and some that reject it.
352
+
353
+ Supporting sentences from abstracts:
354
+ information sentence from abstract_1:
355
+ information sentence from abstract_2:
356
+ ..
357
+ Refusing sentences from abstracts:
358
+ information sentence from abstract_1:
359
+ information sentence from abstract_2:
360
+ ..
361
+ Add where it comes from (abstract_1, abstract_2, abstract_3, abstract_4, abstract_5)
362
+
363
+ With the answer, give a line like: "Label:". Always put the Label first. After the Label, give the Justification.
364
+ The justification will always be given as Justification:
365
+ Label can be yes, no, NEI, where yes: claim is true. no: claim is false. NEI: not enough information.
366
+ The Label will be chosen with a voting system based on the supporting/refusing sentences above.
367
+
368
+ [/INST] <</SYS>>
369
+
370
+ [INST] Question: {query} [/INST]
371
+ [INST] Context from scientific papers:
372
+ """
373
+
374
+ return llm_reasoning_prompt
375
+
376
+
377
+ def claim_detection_template(full_text):
378
+ """
379
+ Generate a template for the prompt used for claim detection by the LLM model.
380
+
381
+ Args:
382
+ full_text (str): Full text to analyze.
383
+
384
+ Returns:
385
+ str: Template for claim detection.
386
+ """
387
+ claim_detection_prompt = f"""<<SYS>> [INST]
388
+
389
+ Your task is to extract from the text potential health-related claims in order to verify their veracity.
390
+
391
+ The online context from which to extract the claims is: {full_text}
392
+
393
+ Create simple single-sentence claims from the context.
394
+
395
+ Don't use *
396
+
397
+ Give just the claims. Don't write anything else.
398
+
399
+ Extract only health-related claims.
400
+
401
+ Rank any claims like:
402
+
403
+ Claim 1:
404
+ Claim 2:
405
+ Claim 3:
406
+
407
+ Always use this structure.
408
+ Start every claim with "Claim " followed by the number.
409
+
410
+ The number of claims may go from 1 to a max of 5.
411
+
412
+ The claims always have to be health-related. [/INST] <</SYS>>
413
+ """
414
+
415
+ return claim_detection_prompt
416
+
417
+
418
+ # Page and Title Configuration
419
+ st.set_page_config(page_title="CER - Combining Evidence and Reasoning Demo", layout="wide", initial_sidebar_state="collapsed")
420
+ st.markdown("<h1 style='text-align: center; color: inherit;'>βœ”οΈβœ¨ CER - Biomedical Fact Checker</h1>", unsafe_allow_html=True)
421
+
422
+ # Horizontal option menu for selecting the page
423
+ page = option_menu(None, ["Single claim check", "Page check", "Video check"],
424
+ icons=['check', 'ui-checks'],
425
+ menu_icon="cast", default_index=0, orientation="horizontal")
426
+
427
+ # Sidebar Configuration
428
+ st.sidebar.title("🔬 Combining Evidence and Reasoning Demo")
429
+ st.sidebar.caption("🔍 Fact-check biomedical claims using scientific evidence and reasoning.")
430
+ st.sidebar.markdown("---")
431
+ st.sidebar.caption("#### ℹ️ About")
432
+ st.sidebar.caption("This is a demo application for fact-checking biomedical claims using scientific evidence and reasoning. It uses a combination of language models, scientific literature, and reasoning to provide explanations for the predictions.")
433
+
434
+ # Load embeddings, PMIDs, and FAISS index
435
+ if 'embeddings_loaded' not in st.session_state:
436
+ embeddings, pmids, index = load_embeddings(embeddings_file, pmid_file, faiss_index_file, debug)
437
+ st.session_state.embeddings = embeddings
438
+ st.session_state.pmids = pmids
439
+ st.session_state.index = index
440
+ st.session_state.embeddings_loaded = True
441
+ else:
442
+ embeddings = st.session_state.embeddings
443
+ pmids = st.session_state.pmids
444
+ index = st.session_state.index
445
+
446
+ # Check if the claim and top_abstracts are in the session state
447
+ if 'claim' not in st.session_state:
448
+ st.session_state.claim = ""
449
+
450
+ if 'top_abstracts' not in st.session_state:
451
+ st.session_state.top_abstracts = []
452
+
453
+
454
+ #### Single claim check PAGE ####
455
+ if page == "Single claim check":
456
+ st.subheader("Single claim check")
457
+ st.caption("✨ Enter a single claim to fact-check and hit the button to see the results! 🔍")
458
+
459
+ st.session_state.claim = st.text_input("Claim to fact-check:")
460
+
461
+ if st.button("✨ Fact Check"):
462
+
463
+ if st.session_state.claim:
464
+ # Retrieve the top abstracts for the claim
465
+ top_abstracts = retrieve_top_abstracts(st.session_state.claim, model, index, pmids, data, top_k=5)
466
+ st.session_state.top_abstracts = top_abstracts
467
+
468
+ st.markdown("### **Results**")
469
+
470
+ with st.container():
471
+ for i, (abstract, pmid, distance) in enumerate(st.session_state.top_abstracts, 1):
472
+ pubmed_url = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
473
+ globals()[f"abstract_{i}"] = abstract
474
+ globals()[f"reference_{i}"] = pubmed_url
475
+ globals()[f"distance_{i}"] = distance
476
+
477
+ with st.spinner('🔍 We are checking...'):
478
+ try:
479
+ # Retrieve the question from the DataFrame
480
+ query = st.session_state.claim
481
+
482
+ # Generate the reasoning template
483
+ prompt_template = llm_reasoning_template(query)
484
+
485
+ # Add the abstracts to the prompt
486
+ for i in range(1, len(st.session_state.top_abstracts)):
487
+ prompt_template += f"{globals()[f'abstract_{i}']} ; "
488
+ prompt_template += f"{globals()[f'abstract_{i+1}']} [/INST]"
489
+
490
+ # Call the API
491
+ completion = client.chat.completions.create(
492
+ model="meta/llama-3.1-405b-instruct",
493
+ messages=[{"role": "user", "content": prompt_template}],
494
+ temperature=0.1,
495
+ top_p=0.7,
496
+ max_tokens=1024,
497
+ stream=True
498
+ )
499
+
500
+ # Collect the response
501
+ answer = ""
502
+ for chunk in completion:
503
+ if chunk.choices[0].delta.content:
504
+ answer += chunk.choices[0].delta.content
505
+
506
+ # Debug: Check the answer
507
+ if debug:
508
+ print(f"{answer}")
509
+
510
+ except Exception as e:
511
+ st.write(f"Error processing index: {e}")
512
+
513
+ with st.spinner('🤔💬 Justifying the check...'):
514
+ # Perform parsing and separate variables
515
+ zeroshot_classifier = pipeline(
516
+ "zero-shot-classification", model="MoritzLaurer/deberta-v3-large-zeroshot-v1.1-all-33"
517
+ )
518
+ first_label, justification, supporting, refusing, notes = parse_response(answer)
519
+
520
+ with st.spinner('🕵️‍♂️📜 We are finding evidence...'):
521
+ # Generate the justification for the claim
522
+ result = generate_justification(st.session_state.claim, justification)
523
+ predicted_label, score_label = extract_label_and_score(result)
524
+
525
+ if predicted_label == "True":
526
+ color = f"rgba(0, 204, 0, {score_label})" # Green
527
+ elif predicted_label == "False":
528
+ color = f"rgba(204, 0, 0, {score_label})" # Red
529
+ elif predicted_label == "NEI":
530
+ color = f"rgba(255, 255, 0, {score_label})" # Yellow
531
+ else:
532
+ color = "black" # Default color
533
+
534
+ # Calculate the confidence score
535
+ confidence = f"{score_label * 100:.2f}%"
536
+ st.caption(f"📝 The Claim: {st.session_state.claim}")
537
+ st.markdown(
538
+ f"**Prediction of claim:** Most likely <span style='color: {color}; font-weight: bold;'>{predicted_label}</span> with a confidence of <span style='color: {color}; font-weight: bold;'>{confidence}</span>",
539
+ unsafe_allow_html=True
540
+ )
541
+ st.markdown("### **Justification**")
542
+ st.markdown(f'<p> {justification}</p>', unsafe_allow_html=True)
543
+
544
+ # Extract the abstracts and references
545
+ abstracts = {}
546
+ for i in range(1, len(st.session_state.top_abstracts) + 1):
547
+ abstracts[f"abstract_{i}"] = globals()[f"abstract_{i}"]
548
+
549
+ pattern = r'"\s*(.*?)\s*"\s*\(abstract_(\d+)\)'
550
+
551
+ supporting_texts = []
552
+ for item in supporting:
553
+ try:
554
+ supporting_texts.append(item["text"])
555
+ except (TypeError, KeyError):
556
+ continue
557
+ supporting = clean_phrases(supporting_texts, pattern)
558
+
559
+ refusing_text = []
560
+ for item in refusing:
561
+ try:
562
+ refusing_text.append(item["text"])
563
+ except (TypeError, KeyError):
564
+ continue
565
+ refusing = clean_phrases(refusing_text, pattern)
566
+
567
+ if debug:
568
+ print(supporting)
569
+ print(refusing)
570
+
571
+ processed_abstracts = {}
572
+ for abstract_name, abstract_text in abstracts.items():
573
+ # Highlight supporting phrases in green
574
+ supporting_matches = [phrase for phrase in supporting if phrase["abstract"] == abstract_name]
575
+ abstract_text = highlight_phrases(abstract_text, supporting_matches, "lightgreen", predicted_label)
576
+
577
+ # Highlight refusing phrases in red
578
+ refusing_matches = [phrase for phrase in refusing if phrase["abstract"] == abstract_name]
579
+ abstract_text = highlight_phrases(abstract_text, refusing_matches, "red", predicted_label)
580
+
581
+ # Add only if supporting matches are found
582
+ if supporting_matches:
583
+ # Add the reference if a corresponding variable exists
584
+ reference_variable = f"reference_{abstract_name.split('_')[1]}"
585
+ if reference_variable in globals():
586
+ reference_value = globals()[reference_variable]
587
+ abstract_text += f"<br><br><strong>🔗 Reference:</strong> {reference_value}"
588
+
589
+ # Add the processed abstract
590
+ processed_abstracts[abstract_name] = abstract_text
591
+
592
+ # Iterate over the processed abstracts and remove duplicates
593
+ seen_contents = set() # Set to track already seen contents
594
+ evidence_counter = 1
595
+
596
+ # Display the results of the processed abstracts with numbered expanders
597
+ st.markdown("### **Scientific Evidence**")
598
+
599
+ # Add a legend for the colors
600
+ legend_html = """
601
+ <div style="display: flex; flex-direction: column; align-items: flex-start;">
602
+ <div style="display: flex; align-items: center; margin-bottom: 5px;">
603
+ <div style="width: 20px; height: 20px; background-color: lightgreen; margin-right: 10px; border-radius: 5px;"></div>
604
+ <div>Positive Evidence</div>
605
+ </div>
606
+ <div style="display: flex; align-items: center; margin-bottom: 5px;">
607
+ <div style="width: 20px; height: 20px; background-color: red; margin-right: 10px; border-radius: 5px;"></div>
608
+ <div>Negative Evidence</div>
609
+ </div>
610
+ <div style="display: flex; align-items: center; margin-bottom: 5px;">
611
+ <div style="width: 20px; height: 20px; background-color: yellow; margin-right: 10px; border-radius: 5px;"></div>
612
+ <div>Dubious Evidence</div>
613
+ </div>
614
+ </div>
615
+ """
616
+ col1, col2 = st.columns([0.8, 0.2])
617
+
618
+ with col1:
619
+ if processed_abstracts:
620
+ tabs = st.tabs([f"Scientific Evidence {i}" for i in range(1, len(processed_abstracts) + 1)])
621
+ for tab, (name, content) in zip(tabs, processed_abstracts.items()):
622
+ if content not in seen_contents: # Check for duplicates
623
+ seen_contents.add(content)
624
+ with tab:
625
+ # Switch colors if the label is "False"
626
+ if predicted_label.lower() == "false":
627
+ content = content.replace("background-color: lightgreen", "background-color: tempcolor")
628
+ content = content.replace("background-color: red", "background-color: lightgreen")
629
+ content = content.replace("background-color: tempcolor", "background-color: red")
630
+
631
+ # Use `st.write` to display HTML directly
632
+ st.write(content, unsafe_allow_html=True)
633
+ else:
634
+ st.markdown("No relevant Scientific Evidence found")
635
+
636
+ with col2:
637
+ st.caption("Legend")
638
+ st.markdown(legend_html, unsafe_allow_html=True)
639
+
640
+
641
+ #### Web page check PAGE ####
642
+ elif page == "Page check":
643
+ st.subheader("Page check")
644
+ st.caption("✨ Enter a URL to fact-check the health-related claims on the page and hit the button to see the results! 🔍")
645
+
646
+ url = st.text_input("URL to fact-check:")
647
+
648
+ if st.button("✨ Fact Check") and url:
649
+ st.session_state.true_count = 0
650
+ st.session_state.false_count = 0
651
+ st.session_state.nei_count = 0
652
+
653
+ with st.spinner('🌐🔍 Extracting claims...'):
654
+ article_data = get_article_data(url)
655
+
656
+ try:
657
+ # Retrieve the claims from the article data
658
+ prompt_template = claim_detection_template(article_data)
659
+
660
+ # Call the API
661
+ completion = client.chat.completions.create(
662
+ model="meta/llama-3.1-405b-instruct",
663
+ messages=[{"role": "user", "content": prompt_template}],
664
+ temperature=0.1,
665
+ top_p=0.7,
666
+ max_tokens=1024,
667
+ stream=True
668
+ )
669
+
670
+ # Collect the response
671
+ answer = ""
672
+ for chunk in completion:
673
+ if chunk.choices[0].delta.content:
674
+ answer += chunk.choices[0].delta.content
675
+
676
+ # Debug: Check the answer
677
+ print(f"{answer}")
678
+
679
+ except Exception as e:
680
+ print(f"Error {e}")
681
+
682
+ claims_dict = extract_and_split_claims(answer)
683
+
684
+ # Display the extracted claims
685
+ st.markdown("### **Claims Extracted**")
686
+ st.caption("🔍 Here are the health-related claims extracted from the page:")
687
+ cols = st.columns(3)
688
+ for i, (claim_key, claim_text) in enumerate(claims_dict.items(), 1):
689
+ col = cols[(i - 1) % 3]
690
+ with col.expander(f"Claim {i} 📝", expanded=True):
691
+ st.write(claim_text)
692
+
693
+ # Display the results for the extracted claims
694
+ st.markdown("### **Results**")
695
+ st.caption("🔍 Here are the results for the extracted claims:")
696
+ for claim_key, claim_text in claims_dict.items():
697
+ st.session_state.claim = claim_text
698
+ if st.session_state.claim:
699
+ top_abstracts = retrieve_top_abstracts(st.session_state.claim, model, index, pmids, data, top_k=5)
700
+ st.session_state.top_abstracts = top_abstracts # Save the results
701
+
702
+ with st.expander(f"✔️ **Results for {claim_key}**", expanded=True):
703
+ for i, (abstract, pmid, distance) in enumerate(st.session_state.top_abstracts, 1):
704
+ pubmed_url = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
705
+ globals()[f"abstract_{i}"] = abstract
706
+ globals()[f"reference_{i}"] = pubmed_url
707
+ globals()[f"distance_{i}"] = distance
708
+
709
+ with st.spinner('🔍 We are checking...'):
710
+ try:
711
+ # Retrieve the question from the DataFrame
712
+ query = st.session_state.claim
713
+
714
+ # Generate the reasoning template
715
+ prompt_template = llm_reasoning_template(query)
716
+
717
+ # Add the abstracts to the prompt
718
+ for i in range(1, len(st.session_state.top_abstracts)):
719
+ prompt_template += f"{globals()[f'abstract_{i}']} ; "
720
+ prompt_template += f"{globals()[f'abstract_{i+1}']} [/INST]"
721
+
722
+ # Call the API
723
+ completion = client.chat.completions.create(
724
+ model="meta/llama-3.1-405b-instruct",
725
+ messages=[{"role": "user", "content": prompt_template}],
726
+ temperature=0.1,
727
+ top_p=0.7,
728
+ max_tokens=1024,
729
+ stream=True
730
+ )
731
+
732
+ # Collect the response
733
+ answer = ""
734
+ for chunk in completion:
735
+ if chunk.choices[0].delta.content:
736
+ answer += chunk.choices[0].delta.content
737
+
738
+ # Debug: Check the answer
739
+ if debug:
740
+ print(f"{answer}")
741
+
742
+ except Exception as e:
743
+ st.write(f"Error processing index: {e}")
744
+
745
+ with st.spinner('🤔💬 Justifying the check...'):
746
+ # Perform parsing and separate variables
747
+ zeroshot_classifier = pipeline(
748
+ "zero-shot-classification", model="MoritzLaurer/deberta-v3-large-zeroshot-v1.1-all-33"
749
+ )
750
+ first_label, justification, supporting, refusing, notes = parse_response(answer)
751
+
752
+ with st.spinner('🕵️‍♂️📜 We are finding evidence...'):
753
+ # Generate the justification for the claim
754
+ result = generate_justification(st.session_state.claim, justification)
755
+ predicted_label, score_label = extract_label_and_score(result)
756
+
757
+ # Update the counts based on the predicted label
758
+ if predicted_label == "True":
759
+ color = f"rgba(0, 204, 0, {score_label})" # Green
760
+ st.session_state.true_count += 1
761
+ elif predicted_label == "False":
762
+ color = f"rgba(204, 0, 0, {score_label})" # Red
763
+ st.session_state.false_count += 1
764
+ elif predicted_label == "NEI":
765
+ color = f"rgba(255, 255, 0, {score_label})" # Yellow
766
+ st.session_state.nei_count += 1
767
+ else:
768
+ color = "black" # Default color
769
+
770
+ confidence = f"{score_label * 100:.2f}%"
771
+ st.caption(f"📝 The Claim: {st.session_state.claim}")
772
+ st.markdown(
773
+ f"**Prediction of claim:** Most likely <span style='color: {color}; font-weight: bold;'>{predicted_label}</span> with a confidence of <span style='color: {color}; font-weight: bold;'>{confidence}</span>",
774
+ unsafe_allow_html=True
775
+ )
776
+
777
+ st.markdown("### **Justification**")
778
+ st.markdown(f'<p> {justification}</p>', unsafe_allow_html=True)
779
+
780
+ abstracts = {}
781
+ for i in range(1, len(st.session_state.top_abstracts) + 1):
782
+ abstracts[f"abstract_{i}"] = globals()[f"abstract_{i}"]
783
+
784
+ pattern = r'"\s*(.*?)\s*"\s*\(abstract_(\d+)\)'
785
+
786
+ supporting_texts = []
787
+ for item in supporting:
788
+ try:
789
+ supporting_texts.append(item["text"])
790
+ except (TypeError, KeyError):
791
+ continue
792
+ supporting = clean_phrases(supporting_texts, pattern)
793
+
794
+ refusing_text = []
795
+ for item in refusing:
796
+ try:
797
+ refusing_text.append(item["text"])
798
+ except (TypeError, KeyError):
799
+ continue
800
+ refusing = clean_phrases(refusing_text, pattern)
801
+
802
+ if debug:
803
+ print(supporting)
804
+ print(refusing)
805
+
806
+ processed_abstracts = {}
807
+ for abstract_name, abstract_text in abstracts.items():
808
+ # Highlight supporting phrases in green
809
+ supporting_matches = [phrase for phrase in supporting if phrase["abstract"] == abstract_name]
810
+ abstract_text = highlight_phrases(abstract_text, supporting_matches, "lightgreen", predicted_label)
811
+
812
+ # Highlight refusing phrases in red
813
+ refusing_matches = [phrase for phrase in refusing if phrase["abstract"] == abstract_name]
814
+ abstract_text = highlight_phrases(abstract_text, refusing_matches, "red", predicted_label)
815
+
816
+ # Add only if supporting matches are found
817
+ if supporting_matches:
818
+ # Add the reference if a corresponding variable exists
819
+ reference_variable = f"reference_{abstract_name.split('_')[1]}"
820
+ if reference_variable in globals():
821
+ reference_value = globals()[reference_variable]
822
+ abstract_text += f"<br><br><strong>🔗 Reference:</strong> {reference_value}"
823
+
824
+ # Add the processed abstract
825
+ processed_abstracts[abstract_name] = abstract_text
826
+
827
+ # Iterate over the processed abstracts and remove duplicates
828
+ seen_contents = set() # Set to track already seen contents
829
+ evidence_counter = 1
830
+
831
+ # Display the results of the processed abstracts with numbered expanders
832
+ st.markdown("### **Scientific Evidence**")
833
+
834
+ # Add a legend for the colors
835
+ legend_html = """
836
+ <div style="display: flex; flex-direction: column; align-items: flex-start;">
837
+ <div style="display: flex; align-items: center; margin-bottom: 5px;">
838
+ <div style="width: 20px; height: 20px; background-color: lightgreen; margin-right: 10px; border-radius: 5px;"></div>
839
+ <div>Positive Evidence</div>
840
+ </div>
841
+ <div style="display: flex; align-items: center; margin-bottom: 5px;">
842
+ <div style="width: 20px; height: 20px; background-color: red; margin-right: 10px; border-radius: 5px;"></div>
843
+ <div>Negative Evidence</div>
844
+ </div>
845
+ <div style="display: flex; align-items: center; margin-bottom: 5px;">
846
+ <div style="width: 20px; height: 20px; background-color: yellow; margin-right: 10px; border-radius: 5px;"></div>
847
+ <div>Dubious Evidence</div>
848
+ </div>
849
+ </div>
850
+ """
851
+ col1, col2 = st.columns([0.8, 0.2])
852
+
853
+ with col1:
854
+ if processed_abstracts:
855
+ tabs = st.tabs([f"Scientific Evidence {i}" for i in range(1, len(processed_abstracts) + 1)])
856
+ for tab, (name, content) in zip(tabs, processed_abstracts.items()):
857
+ if content not in seen_contents: # Check for duplicates
858
+ seen_contents.add(content)
859
+ with tab:
860
+ # Switch colors if the label is "False"
861
+ if predicted_label.lower() == "false":
862
+ content = content.replace("background-color: lightgreen", "background-color: tempcolor")
863
+ content = content.replace("background-color: red", "background-color: lightgreen")
864
+ content = content.replace("background-color: tempcolor", "background-color: red")
865
+
866
+ # Use `st.write` to display HTML directly
867
+ st.write(content, unsafe_allow_html=True)
868
+ else:
869
+ st.markdown("No relevant Scientific Evidence found")
870
+
871
+ with col2:
872
+ st.caption("Legend")
873
+ st.markdown(legend_html, unsafe_allow_html=True)
874
+
875
+ st.markdown("### **Page Summary**")
876
+ st.caption("📊 Here is a summary of the results for the extracted claims:")
877
+
878
+ # Labels and Colors
879
+ labels = ['True', 'False', 'NEI']
880
+ colors = ['green', 'red', 'yellow']
881
+
882
+ # Sizes of the pie chart
883
+ sizes = [
884
+ st.session_state.true_count,
885
+ st.session_state.false_count,
886
+ st.session_state.nei_count
887
+ ]
888
+
889
+ # Configure the Pie Chart Options
890
+ options = {
891
+ "tooltip": {"trigger": "item"},
892
+ "legend": {"top": "5%", "left": "center"},
893
+ "series": [
894
+ {
895
+ "name": "Document Status",
896
+ "type": "pie",
897
+ "radius": ["40%", "70%"],
898
+ "avoidLabelOverlap": False,
899
+ "itemStyle": {
900
+ "borderRadius": 10,
901
+ "borderColor": "#fff",
902
+ "borderWidth": 2,
903
+ },
904
+ "label": {"show": True, "position": "center"},
905
+ "emphasis": {
906
+ "label": {"show": True, "fontSize": "20", "fontWeight": "bold"}
907
+ },
908
+ "labelLine": {"show": False},
909
+ "data": [
910
+ {"value": sizes[0], "name": labels[0], "itemStyle": {"color": colors[0]}},
911
+ {"value": sizes[1], "name": labels[1], "itemStyle": {"color": colors[1]}},
912
+ {"value": sizes[2], "name": labels[2], "itemStyle": {"color": colors[2]}},
913
+ ],
914
+ }
915
+ ],
916
+ }
917
+
918
+ # Display the Pie Chart
919
+ st1, st2 = st.columns([0.6, 0.4])
920
+
921
+ with st1:
922
+ st.markdown("#### The page is :")
923
+ true_count = st.session_state.true_count
924
+ false_count = st.session_state.false_count
925
+ nei_count = st.session_state.nei_count
926
+
927
+ if true_count > 0 and false_count == 0:
928
+ reliability = '<span style="color: darkgreen; font-weight: bold;">Highly Reliable</span>'
929
+ elif true_count > false_count:
930
+ reliability = '<span style="color: lightgreen; font-weight: bold;">Fairly Reliable</span>'
931
+ elif true_count == 0:
932
+ reliability = '<span style="color: darkred; font-weight: bold;">Strongly Considered Unreliable</span>'
933
+ elif false_count > true_count:
934
+ reliability = '<span style="color: lightcoral; font-weight: bold;">Unlikely to be Reliable</span>'
935
+ elif (true_count == false_count) or (nei_count > true_count and nei_count > false_count and true_count != 0 and false_count != 0):
936
+ reliability = '<span style="color: yellow; font-weight: bold;">NEI</span>'
937
+ else:
938
+ reliability = '<span style="color: black; font-weight: bold;">Completely Reliable</span>'
939
+
940
+ st.markdown(f"The page is considered {reliability} because it contains {true_count} true claims, {false_count} false claims, and {nei_count} claims with not enough information.", unsafe_allow_html=True)
941
+
942
+ with st.popover("ℹ️ Understanding the Truthfulness Ratings"):
943
+ st.markdown("""
944
+ The reliability of the page is determined based on the number of true and false claims extracted from the page.
945
+ - If the page contains only true claims, it is considered **Highly Reliable**.
946
+ - If the page has more true claims than false claims, it is considered **Fairly Reliable**.
947
+ - If the page has more false claims than true claims, it is considered **Unlikely to be Reliable**.
948
+ - If the page contains only false claims, it is considered **Strongly Considered Unreliable**.
949
+ - If the page has an equal number of true and false claims, it is considered **NEI**.
950
+ """)
951
+
952
+ with st2:
953
+ st_echarts(
954
+ options=options, height="500px",
955
+ )
956
+
957
+
958
+ #### Video check PAGE ####
959
+ elif page == "Video check":
960
+ st.subheader("Video claim check")
961
+ st.caption("✨ Upload a video to fact-check and hit the button to see the results! 🔍")
962
+
963
+ video = st.file_uploader("Choose a video...", type=["mp4"])
964
+ video_box, text_box = st.columns([0.6, 0.4])
965
+ if video is not None:
966
+ with video_box:
967
+ with st.expander("▶️ See uploaded video", expanded=False):
968
+ st.video(video)
969
+
970
+ if st.button("✨ Fact Check") and video is not None:
971
+ with st.spinner('🎥🔄 Processing video...'):
972
+ # Save the video to a temporary file
973
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as temp_video:
974
+ temp_video.write(video.read())
975
+ temp_video_path = temp_video.name
976
+
977
+ # Extract the audio from the video
978
+ temp_audio_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
979
+ ffmpeg.input(temp_video_path).output(temp_audio_path, acodec="pcm_s16le", ar=16000, ac=1).run(overwrite_output=True)
980
+
981
+ # Transcribe the audio
982
+ model1 = whisper.load_model("small")
983
+ result = model1.transcribe(temp_audio_path)
984
+
985
+ # Extract the final text
986
+ transcribed_text = result["text"]
987
+ with text_box:
988
+ with st.expander("📝 Transcribed Text", expanded=False):
989
+ st.caption("🔍 Here is the transcribed text from the uploaded video:")
990
+ container = st.container(height=322)
991
+ container.write(transcribed_text)
992
+
993
+ st.session_state.true_count = 0
994
+ st.session_state.false_count = 0
995
+ st.session_state.nei_count = 0
996
+
997
+ with st.spinner('🌐🔍 Extracting claims from video...'):
998
+ try:
999
+ # Retrieve the claims from the video
1000
+ prompt_template = claim_detection_template(transcribed_text)
1001
+
1002
+ # Call the API
1003
+ completion = client.chat.completions.create(
1004
+ model="meta/llama-3.1-405b-instruct",
1005
+ messages=[{"role": "user", "content": prompt_template}],
1006
+ temperature=0.1,
1007
+ top_p=0.7,
1008
+ max_tokens=1024,
1009
+ stream=True
1010
+ )
1011
+
1012
+ # Collect the response
1013
+ answer = ""
1014
+ for chunk in completion:
1015
+ if chunk.choices[0].delta.content:
1016
+ answer += chunk.choices[0].delta.content
1017
+
1018
+ # Debug: Check the answer
1019
+ if debug:
1020
+ print(f"{answer}")
1021
+
1022
+ except Exception as e:
1023
+ print(f"Error {e}")
1024
+
1025
+ claims_dict = extract_and_split_claims(answer)
1026
+
1027
+ # Display the extracted claims
1028
+ st.markdown("### **Claims Extracted**")
1029
+ st.caption("🔍 Here are the health-related claims extracted from the video:")
1030
+ cols = st.columns(3)
1031
+ for i, (claim_key, claim_text) in enumerate(claims_dict.items(), 1):
1032
+ col = cols[(i - 1) % 3]
1033
+ with col.expander(f"Claim {i} 📝", expanded=True):
1034
+ st.write(claim_text)
1035
+
1036
+ # Display the results for the extracted claims
1037
+ st.markdown("### **Results**")
1038
+ st.caption("🔍 Here are the results for the extracted claims:")
1039
+ for claim_key, claim_text in claims_dict.items():
1040
+ st.session_state.claim = claim_text
1041
+ if st.session_state.claim:
1042
+ top_abstracts = retrieve_top_abstracts(st.session_state.claim, model, index, pmids, data, top_k=5)
1043
+ st.session_state.top_abstracts = top_abstracts # Save the results
1044
+
1045
+ with st.expander(f"✔️ **Results for {claim_key}**", expanded=True):
1046
+ for i, (abstract, pmid, distance) in enumerate(st.session_state.top_abstracts, 1):
1047
+ pubmed_url = f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/"
1048
+ globals()[f"abstract_{i}"] = abstract
1049
+ globals()[f"reference_{i}"] = pubmed_url
1050
+ globals()[f"distance_{i}"] = distance
1051
+
1052
+ with st.spinner('🔍 We are checking...'):
1053
+ try:
1054
+ # Retrieve the question from the DataFrame
1055
+ query = st.session_state.claim
1056
+
1057
+ # Generate the reasoning template
1058
+ prompt_template = llm_reasoning_template(query)
1059
+
1060
+ # Add the abstracts to the prompt
1061
+ for i in range(1, len(st.session_state.top_abstracts)):
1062
+ prompt_template += f"{globals()[f'abstract_{i}']} ; "
1063
+ prompt_template += f"{globals()[f'abstract_{i+1}']} [/INST]"
1064
+
1065
+ # Call the API
1066
+ completion = client.chat.completions.create(
1067
+ model="meta/llama-3.1-405b-instruct",
1068
+ messages=[{"role": "user", "content": prompt_template}],
1069
+ temperature=0.1,
1070
+ top_p=0.7,
1071
+ max_tokens=1024,
1072
+ stream=True
1073
+ )
1074
+
1075
+ # Collect the response
1076
+ answer = ""
1077
+ for chunk in completion:
1078
+ if chunk.choices[0].delta.content:
1079
+ answer += chunk.choices[0].delta.content
1080
+
1081
+ # Debug: Check the answer
1082
+ if debug:
1083
+ print(f"{answer}")
1084
+
1085
+ except Exception as e:
1086
+ st.write(f"Error processing index: {e}")
1087
+
1088
+ with st.spinner('🤔💬 Justifying the check...'):
1089
+ # Perform parsing and separate variables
1090
+ zeroshot_classifier = pipeline(
1091
+ "zero-shot-classification", model="MoritzLaurer/deberta-v3-large-zeroshot-v1.1-all-33"
1092
+ )
1093
+ first_label, justification, supporting, refusing, notes = parse_response(answer)
1094
+
1095
+ with st.spinner('🕵️‍♂️📜 We are finding evidence...'):
1096
+ # Generate the justification for the claim
1097
+ result = generate_justification(st.session_state.claim, justification)
1098
+ predicted_label, score_label = extract_label_and_score(result)
1099
+
1100
+ # Update the counts based on the predicted label
1101
+ if predicted_label == "True":
1102
+ color = f"rgba(0, 204, 0, {score_label})" # Green
1103
+ st.session_state.true_count += 1
1104
+ elif predicted_label == "False":
1105
+ color = f"rgba(204, 0, 0, {score_label})" # Red
1106
+ st.session_state.false_count += 1
1107
+ elif predicted_label == "NEI":
1108
+ color = f"rgba(255, 255, 0, {score_label})" # Yellow
1109
+ st.session_state.nei_count += 1
1110
+ else:
1111
+ color = "black" # Default color
1112
+
1113
+ confidence = f"{score_label * 100:.2f}%"
1114
+ st.caption(f"📝 The Claim: {st.session_state.claim}")
1115
+ st.markdown(
1116
+ f"**Prediction of claim:** Most likely <span style='color: {color}; font-weight: bold;'>{predicted_label}</span> with a confidence of <span style='color: {color}; font-weight: bold;'>{confidence}</span>",
1117
+ unsafe_allow_html=True
1118
+ )
1119
+
1120
+ st.markdown("### **Justification**")
1121
+ st.markdown(f'<p> {justification}</p>', unsafe_allow_html=True)
1122
+
1123
+ abstracts = {}
1124
+ for i in range(1, len(st.session_state.top_abstracts) + 1):
1125
+ abstracts[f"abstract_{i}"] = globals()[f"abstract_{i}"]
1126
+
1127
+ pattern = r'"\s*(.*?)\s*"\s*\(abstract_(\d+)\)'
1128
+
1129
+ supporting_texts = []
1130
+ for item in supporting:
1131
+ try:
1132
+ supporting_texts.append(item["text"])
1133
+ except (TypeError, KeyError):
1134
+ continue
1135
+ supporting = clean_phrases(supporting_texts, pattern)
1136
+
1137
+ refusing_text = []
1138
+ for item in refusing:
1139
+ try:
1140
+ refusing_text.append(item["text"])
1141
+ except (TypeError, KeyError):
1142
+ continue
1143
+ refusing = clean_phrases(refusing_text, pattern)
1144
+
1145
+ processed_abstracts = {}
1146
+ for abstract_name, abstract_text in abstracts.items():
1147
+ # Highlight supporting phrases in green
1148
+ supporting_matches = [phrase for phrase in supporting if phrase["abstract"] == abstract_name]
1149
+ abstract_text = highlight_phrases(abstract_text, supporting_matches, "lightgreen", predicted_label)
1150
+
1151
+ # Highlight refusing phrases in red
1152
+ refusing_matches = [phrase for phrase in refusing if phrase["abstract"] == abstract_name]
1153
+ abstract_text = highlight_phrases(abstract_text, refusing_matches, "red", predicted_label)
1154
+
1155
+ if supporting_matches:
1156
+ # Add the reference if a corresponding variable exists
1157
+ reference_variable = f"reference_{abstract_name.split('_')[1]}"
1158
+ if reference_variable in globals():
1159
+ reference_value = globals()[reference_variable]
1160
+ abstract_text += f"<br><br><strong>🔗 Reference:</strong> {reference_value}"
1161
+
1162
+ # Add the processed abstract
1163
+ processed_abstracts[abstract_name] = abstract_text
1164
+
1165
+ # Iterate over the processed abstracts and remove duplicates
1166
+ seen_contents = set() # Set to track already seen contents
1167
+ evidence_counter = 1
1168
+
1169
+ # Display the results of the processed abstracts with numbered expanders
1170
+ st.markdown("### **Scientific Evidence**")
1171
+
1172
+ # Add a legend for the colors
1173
+ legend_html = """
1174
+ <div style="display: flex; flex-direction: column; align-items: flex-start;">
1175
+ <div style="display: flex; align-items: center; margin-bottom: 5px;">
1176
+ <div style="width: 20px; height: 20px; background-color: lightgreen; margin-right: 10px; border-radius: 5px;"></div>
1177
+ <div>Positive Evidence</div>
1178
+ </div>
1179
+ <div style="display: flex; align-items: center; margin-bottom: 5px;">
1180
+ <div style="width: 20px; height: 20px; background-color: red; margin-right: 10px; border-radius: 5px;"></div>
1181
+ <div>Negative Evidence</div>
1182
+ </div>
1183
+ <div style="display: flex; align-items: center; margin-bottom: 5px;">
1184
+ <div style="width: 20px; height: 20px; background-color: yellow; margin-right: 10px; border-radius: 5px;"></div>
1185
+ <div>Dubious Evidence</div>
1186
+ </div>
1187
+ </div>
1188
+ """
1189
+ col1, col2 = st.columns([0.8, 0.2])
1190
+
1191
+ with col1:
1192
+ if processed_abstracts:
1193
+ tabs = st.tabs([f"Scientific Evidence {i}" for i in range(1, len(processed_abstracts) + 1)])
1194
+ for tab, (name, content) in zip(tabs, processed_abstracts.items()):
1195
+ if content not in seen_contents: # Check for duplicates
1196
+ seen_contents.add(content)
1197
+ with tab:
1198
+ # Switch colors if the label is "False"
1199
+ if predicted_label.lower() == "false":
1200
+ content = content.replace("background-color: lightgreen", "background-color: tempcolor")
1201
+ content = content.replace("background-color: red", "background-color: lightgreen")
1202
+ content = content.replace("background-color: tempcolor", "background-color: red")
1203
+
1204
+ # Use `st.write` to display HTML directly
1205
+ st.write(content, unsafe_allow_html=True)
1206
+ else:
1207
+ st.markdown("No relevant Scientific Evidence found")
1208
+
1209
+ with col2:
1210
+ st.caption("Legend")
1211
+ st.markdown(legend_html, unsafe_allow_html=True)
1212
+
1213
+ st.markdown("### **Video Summary**")
1214
+ st.caption("📊 Here is a summary of the results for the extracted claims:")
1215
+
1216
+ # Labels and Colors
1217
+ labels = ['True', 'False', 'NEI']
1218
+ colors = ['green', 'red', 'yellow']
1219
+
1220
+ # Sizes of the pie chart
1221
+ sizes = [
1222
+ st.session_state.true_count,
1223
+ st.session_state.false_count,
1224
+ st.session_state.nei_count
1225
+ ]
1226
+
1227
+ # Configure the Pie Chart Options
1228
+ options = {
1229
+ "tooltip": {"trigger": "item"},
1230
+ "legend": {"top": "5%", "left": "center"},
1231
+ "series": [
1232
+ {
1233
+ "name": "Document Status",
1234
+ "type": "pie",
1235
+ "radius": ["40%", "70%"],
1236
+ "avoidLabelOverlap": False,
1237
+ "itemStyle": {
1238
+ "borderRadius": 10,
1239
+ "borderColor": "#fff",
1240
+ "borderWidth": 2,
1241
+ },
1242
+ "label": {"show": True, "position": "center"},
1243
+ "emphasis": {
1244
+ "label": {"show": True, "fontSize": "20", "fontWeight": "bold"}
1245
+ },
1246
+ "labelLine": {"show": False},
1247
+ "data": [
1248
+ {"value": sizes[0], "name": labels[0], "itemStyle": {"color": colors[0]}},
1249
+ {"value": sizes[1], "name": labels[1], "itemStyle": {"color": colors[1]}},
1250
+ {"value": sizes[2], "name": labels[2], "itemStyle": {"color": colors[2]}},
1251
+ ],
1252
+ }
1253
+ ],
1254
+ }
1255
+
1256
+ # Display the Pie Chart
1257
+ st1, st2 = st.columns([0.6, 0.4])
1258
+
1259
+ with st1:
1260
+ st.markdown("#### The Video is :")
1261
+ true_count = st.session_state.true_count
1262
+ false_count = st.session_state.false_count
1263
+ nei_count = st.session_state.nei_count
1264
+
1265
+ if true_count > 0 and false_count == 0:
1266
+ reliability = '<span style="color: darkgreen; font-weight: bold;">Highly Reliable</span>'
1267
+ elif true_count > false_count:
1268
+ reliability = '<span style="color: lightgreen; font-weight: bold;">Fairly Reliable</span>'
1269
+ elif true_count == 0:
1270
+ reliability = '<span style="color: darkred; font-weight: bold;">Strongly Considered Unreliable</span>'
1271
+ elif false_count > true_count:
1272
+ reliability = '<span style="color: lightcoral; font-weight: bold;">Unlikely to be Reliable</span>'
1273
+ elif (true_count == false_count) or (nei_count > true_count and nei_count > false_count and true_count != 0 and false_count != 0):
1274
+ reliability = '<span style="color: yellow; font-weight: bold;">NEI</span>'
1275
+ else:
1276
+ reliability = '<span style="color: black; font-weight: bold;">Completely Reliable</span>'
1277
+
1278
+ st.markdown(f"The video is considered {reliability} because it contains {true_count} true claims, {false_count} false claims, and {nei_count} claims with not enough information.", unsafe_allow_html=True)
1279
+
1280
+ with st.popover("ℹ️ Understanding the Truthfulness Ratings"):
1281
+ st.markdown("""
1282
+ The reliability of the video is determined based on the number of true and false claims extracted from the video.
1283
+ - If the video contains only true claims, it is considered **Highly Reliable**.
1284
+ - If the video has more true claims than false claims, it is considered **Fairly Reliable**.
1285
+ - If the video has more false claims than true claims, it is considered **Unlikely to be Reliable**.
1286
+ - If the video contains only false claims, it is considered **Strongly Considered Unreliable**.
1287
+ - If the video has an equal number of true and false claims, it is considered **NEI**.
1288
+ """)
1289
+
1290
+ with st2:
1291
+ st_echarts(
1292
+ options=options, height="500px",
1293
+ )
1294
+
data/abstract_embeddings.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b06d5719866779f5ff4c1d6fa6bff15951d5601d06b4c535d71ff573f06ad39b
3
+ size 153600128
data/faiss_index.index ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b03e1883853c41ecb1885ec9ded14d16a6e1aa99d40437cdcd3d05fd6865a41
3
+ size 153600045
data/parte_205.csv ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27da6250b597f6409e28c2a32903446ba45f39f2c931e8973ab389aeb60f1837
3
+ size 149748082
data/pmids.npy ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:556fcf26d0e2d8204a28a9f0c06a43dc3410088ec92b10a79dadd38d6d728c5a
3
+ size 800128