Sarthak005 committed on
Commit
165166b
·
verified ·
1 Parent(s): 45f7a2f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +555 -554
app.py CHANGED
@@ -1,554 +1,555 @@
1
- from fastapi import FastAPI, HTTPException, Query
2
- from pydantic import BaseModel
3
- import os
4
- from langchain_community.embeddings import HuggingFaceEmbeddings
5
- from langchain_community.vectorstores import FAISS
6
- from langchain_community.document_loaders import CSVLoader
7
- from langchain_openai import ChatOpenAI
8
- from langchain_groq import ChatGroq
9
- from langchain_core.prompts import ChatPromptTemplate
10
- from langchain.chains.combine_documents import create_stuff_documents_chain
11
- from langchain.chains import create_retrieval_chain
12
- from langchain_google_genai import ChatGoogleGenerativeAI
13
- from dotenv import load_dotenv
14
- from fastapi.responses import PlainTextResponse
15
- from fastapi.middleware.cors import CORSMiddleware
16
- import asyncio
17
- import json
18
- import re
19
- # Load environment variables
20
- load_dotenv()
21
- os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY")
22
- os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")
23
- os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
24
- key = os.getenv("GOOGLE_API_KEY")
25
- # Define paths
26
- DB_FAISS_PATH = "bgi/db_faiss"
27
-
28
- # Initialize FastAPI app
29
- app = FastAPI()
30
- app.add_middleware(
31
- CORSMiddleware,
32
- allow_origins=["*"], # Add the React app's URL
33
- allow_credentials=True,
34
- allow_methods=["*"], # Allow all HTTP methods
35
- allow_headers=["*"], # Allow all headers
36
- )
37
- # Initialize variables
38
- embeddings = None
39
- db = None
40
-
41
- # Load or create FAISS vector store
42
@app.on_event("startup")
def load_vector_store():
    """Populate the module-level ``embeddings`` and ``db`` at application startup.

    When ``DB_FAISS_PATH`` exists the saved FAISS index is loaded from it;
    otherwise a new index is built from ``Final_Research_Dataset_2.csv`` and
    saved to ``DB_FAISS_PATH``.
    """
    global embeddings, db

    def _new_embeddings():
        # Both paths use the same embedding model on CPU.
        return HuggingFaceEmbeddings(model_name='BAAI/bge-small-en', model_kwargs={'device': 'cpu'})

    if not os.path.exists(DB_FAISS_PATH):
        print("Creating new FAISS vector store.")
        rows = CSVLoader(
            file_path="Final_Research_Dataset_2.csv",
            encoding="utf-8",
            csv_args={'delimiter': ','},
        ).load()
        embeddings = _new_embeddings()
        db = FAISS.from_documents(rows, embeddings)
        db.save_local(DB_FAISS_PATH)
        return

    print("Loading existing FAISS vector store.")
    embeddings = _new_embeddings()
    # NOTE(review): this flag opts in to deserializing the saved index; only
    # safe while the contents of DB_FAISS_PATH are trusted.
    db = FAISS.load_local(DB_FAISS_PATH, embeddings, allow_dangerous_deserialization=True)
    print("Vector store loaded.")
57
-
58
-
59
- # Define request and response models
60
- from typing import List, Optional
61
-
62
class FilterCriteria(BaseModel):
    # Filters and model selection sent by the client with each /query request.
    impactFactor: float  # JIF threshold applied to every returned journal
    firstDecisionTime: int  # accepted from the client; not read by query()
    # Explicit default: under Pydantic v2 an Optional field without a default
    # is still required, which would reject requests that omit the publisher.
    publisher: Optional[str] = None
    llmModel: str  # compared against the names listed by GET /models
67
-
68
class QueryRequest(BaseModel):
    # Request body of POST /query.
    abstract: str  # text handed to the keyword-extraction model
    criteria: FilterCriteria  # filters and model choice for this request
71
-
72
class Journal(BaseModel):
    # One recommended journal as returned to the client.
    id: int  # position number assigned by query(), starting at 1
    Name: str
    JIF: float  # Journal Impact Factor
    Category: str  # left empty by the LLM-backed branches of query()
    Keywords: str
    Publisher: str
    Decision_Time: int  # filled from the "Decsion Time" field of the data
80
-
81
- # Define the QueryResponse model with a list of journals
82
class QueryResponse(BaseModel):
    # Response body of POST /query.
    result: List[Journal]  # journals that passed the JIF filter; may be empty
84
-
85
-
86
@app.get("/", response_class=PlainTextResponse)
def read_root():
    # Plain-text landing message served at the API root.
    greeting = "Welcome to the Journal Recommender API!"
    return greeting
89
- # Define models
90
@app.get("/models")
def get_models():
    # Model names that query() accepts in criteria.llmModel.
    model_names = ["openai", "groq", "mixtral", "gemini-pro", "faiss"]
    return dict(available_models=model_names)
93
-
94
def fix_incomplete_json(raw_response):
    """Best-effort repair of truncated JSON text.

    Closes an unterminated string, drops a dangling trailing comma and
    appends every missing ``}`` / ``]`` in correct nesting order, then parses
    the result.

    Args:
        raw_response: JSON text that may have been cut off mid-document.

    Returns:
        The parsed JSON value (usually a list or dict), or ``None`` when the
        input is not a string or still cannot be parsed after the repair.
    """
    if not isinstance(raw_response, str):
        return None

    candidate = raw_response.strip()

    # Track the closers still owed, innermost last. Brackets inside string
    # literals are ignored so values such as "x}" do not skew the balance.
    pending_closers = []
    in_string = False
    escaped = False
    for ch in candidate:
        if in_string:
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
        elif ch == '"':
            in_string = True
        elif ch == "{":
            pending_closers.append("}")
        elif ch == "[":
            pending_closers.append("]")
        elif ch in "}]" and pending_closers:
            pending_closers.pop()

    if in_string:
        # The text was cut inside a string literal: terminate it.
        candidate += '"'
    else:
        # A comma left dangling by truncation would make the JSON invalid.
        candidate = candidate.rstrip(",").rstrip()

    candidate += "".join(reversed(pending_closers))

    try:
        return json.loads(candidate)
    except json.JSONDecodeError as e:
        print(f"Error fixing JSON: {e}")
        return None
114
-
115
-
116
- # Query endpoint
117
# LLM-backed model choices for /query: public name -> (zero-argument client
# factory, result-limit sentence appended to the system prompt). Factories are
# lazy so a client is only built for the model actually requested.
_LLM_BRANCHES = {
    "openai": (
        lambda: ChatOpenAI(model="gpt-4o"),
        " Give max 30 results",
    ),
    "groq": (
        lambda: ChatGroq(model="llama-3.2-3b-preview", temperature=0),
        " Dont give more than 10 results",
    ),
    "mixtral": (
        lambda: ChatGroq(model="mixtral-8x7b-32768", temperature=0),
        " Dont give more than 10 results",
    ),
    "gemini-pro": (
        lambda: ChatGoogleGenerativeAI(
            model="gemini-pro",
            google_api_key=key,
            convert_system_message_to_human=True,
        ),
        "",
    ),
}

# Matches a complete Markdown code fence, optionally tagged "json".
_CODE_FENCE_RE = re.compile(r"```(?:json)?\s*(.*?)\s*```", re.DOTALL | re.IGNORECASE)

# Field patterns used to read a journal record out of a stored document.
_FAISS_FIELDS = {
    "Name": re.compile(r"Name: (.+)"),
    "JIF": re.compile(r"JIF: (.+)"),
    "Category": re.compile(r"Category: (.+)"),
    "Keywords": re.compile(r"Keywords: (.+)"),
    "Publisher": re.compile(r"Publisher: (.+)"),
    "Decision_Time": re.compile(r"Decsion Time: (.+)"),
}


def _escape_braces(value):
    """Return str(value) with braces doubled so a prompt template treats them literally."""
    return str(value).replace("{", "{{").replace("}", "}}")


def _build_system_prompt(keywords, impact_factor, preferred_publisher, limit_sentence):
    """Build the recommender system prompt; ``{context}`` stays a template variable."""
    keywords = _escape_braces(keywords)
    preferred_publisher = _escape_braces(preferred_publisher)
    return (
        "You are a specialized Journal recommender that compares all journals in database to given research paper keywords and based on JIF and publisher gives result."
        f"From the provided context, recommend all journals that are suitable for research paper with {keywords} keywords."
        f"Ensure that you include **every** journal with a Journal Impact Factor (JIF) strictly greater than {impact_factor}, and the Journal must be only from any Publishers in list: {preferred_publisher}. And Pls show that jif as in Context database "
        "Make sure to include both exact matches and related journals, and prioritize including **all relevant high-JIF journals without repetition**. "
        "Present the results in a json format with the following information: Journal Name, Publisher, JIF, Decsion Time. "
        "Ensure no introductory or ending texts are included." + limit_sentence +
        "Context: {context}"
    )


def _strip_code_fence(text):
    """Return the payload of a Markdown code fence, or the stripped text if unfenced."""
    text = text.strip()
    fenced = _CODE_FENCE_RE.search(text)
    if fenced:
        return fenced.group(1).strip()
    # Unterminated fence (e.g. truncated output): drop only the opening marker.
    return re.sub(r"^```(?:json)?\s*", "", text, flags=re.IGNORECASE)


def _extract_journal_entries(payload):
    """Return the list of journal entries from a bare list or a dict wrapping one."""
    if isinstance(payload, list):
        return payload
    if isinstance(payload, dict):
        entries = payload.get("journals")
        if isinstance(entries, list):
            return entries
        for value in payload.values():
            if isinstance(value, list):
                return value
    return []


def _coerce_int(value, default=0):
    """Return *value* as an int, taking the first integer found in text, else *default*."""
    if isinstance(value, (int, float)):
        return int(value)
    found = re.search(r"-?\d+", str(value))
    return int(found.group()) if found else default


def _parse_llm_journals(raw_response, keywords, impact_factor):
    """Turn an LLM answer into Journal objects with JIF strictly above *impact_factor*."""
    cleaned = _strip_code_fence(raw_response)
    try:
        payload = json.loads(cleaned)
    except json.JSONDecodeError as e:
        print(f"Error parsing JSON response: {e}")
        # Truncated output is common with result limits; try to repair it.
        payload = fix_incomplete_json(cleaned)

    journals = []
    for i, entry in enumerate(_extract_journal_entries(payload)):
        if not isinstance(entry, dict):
            print(f"Skipping invalid journal entry: {entry}")
            continue
        try:
            jif = float(entry.get("JIF", 0))
            if jif <= impact_factor:
                continue
            # The prompt asks for the misspelled "Decsion Time"; accept the
            # correct spelling too in case the model fixes it.
            decision_time = entry.get("Decsion Time", entry.get("Decision Time", 0))
            journals.append(
                Journal(
                    id=i + 1,
                    Name=entry.get("Journal Name"),
                    Publisher=entry.get("Publisher"),
                    JIF=jif,
                    Category="",  # not requested from the model
                    Keywords=keywords,
                    Decision_Time=_coerce_int(decision_time),
                )
            )
        except (TypeError, ValueError) as e:
            # Covers non-numeric JIF and model validation failures.
            print(f"Error processing journal data: {e}")
    return journals


def _parse_faiss_journals(docs, min_jif):
    """Turn retrieved documents into Journal objects with JIF of at least *min_jif*."""
    journals = []
    for doc in docs:
        found = {
            field: pattern.search(doc.page_content)
            for field, pattern in _FAISS_FIELDS.items()
        }
        if not all(found.values()):
            print(f"Skipping invalid journal entry: {doc.page_content}")
            continue
        values = {field: match.group(1).strip() for field, match in found.items()}
        try:
            jif_value = float(values["JIF"])
            if jif_value < min_jif:
                continue
            journals.append(
                Journal(
                    id=len(journals) + 1,
                    Name=values["Name"],
                    JIF=jif_value,
                    Category=values["Category"],
                    Keywords=values["Keywords"],
                    Publisher=values["Publisher"],
                    Decision_Time=_coerce_int(values["Decision_Time"]),
                )
            )
        except ValueError as e:
            print(f"Error processing journal data: {e}")
    return journals


# Query endpoint
@app.post("/query", response_model=QueryResponse)
async def query(request: QueryRequest):
    # Recommend journals for an abstract.
    #
    # Keywords are first extracted from the abstract with a Groq-hosted model.
    # For "faiss" the vector store is searched directly with those keywords;
    # for the other model names a retrieval chain asks the chosen LLM for a
    # JSON list of journals. Results are filtered by the requested impact factor.
    #
    # Raises HTTPException 500 when the vector store is not loaded and 400 for
    # an unknown model name (checked before any model call is made).
    if db is None:
        raise HTTPException(status_code=500, detail="Vector store not loaded.")

    model_choice = request.criteria.llmModel
    if model_choice != "faiss" and model_choice not in _LLM_BRANCHES:
        raise HTTPException(status_code=400, detail="Invalid model choice.")

    query_text = request.abstract
    impact_factor = request.criteria.impactFactor
    preferred_publisher = request.criteria.publisher

    messages = [
        {
            "role": "system",
            "content": (
                "Give a strict comma-separated list of exactly 15 keywords from the following text. "
                "Give a strict comma-separated list of exactly 15 keywords from the following text. "
                "Do not include any bullet points, introductory text, or ending text. "
                "No introductory or ending text strictly"  # Added to ensure can be removed if results deteriorate
                "Do not say anything like 'Here are the keywords.' "
                "Only return the keywords, strictly comma-separated, without any additional words."
            ),
        },
        {"role": "user", "content": query_text},
    ]
    llm = ChatGroq(model="llama3-8b-8192", temperature=0)
    # Awaited so the model call does not block the event loop.
    ai_msg = await llm.ainvoke(messages)
    keywords = ai_msg.content.split("keywords extracted from the text:\n")[-1].strip()
    print("Keywords:", keywords)

    if model_choice == "faiss":
        # Reuse the store loaded at startup instead of reloading it per request.
        # NOTE(review): the publisher preference is not applied in this branch
        # (the original computed it but never used it) — confirm intended behaviour.
        matches = db.similarity_search(keywords, k=20)
        return QueryResponse(result=_parse_faiss_journals(matches, impact_factor))

    if model_choice == "gemini-pro":
        print("Using Gemini-Pro model")

    make_client, limit_sentence = _LLM_BRANCHES[model_choice]
    system_prompt = _build_system_prompt(
        keywords, impact_factor, preferred_publisher, limit_sentence
    )
    prompt = ChatPromptTemplate.from_messages(
        [("system", system_prompt), ("user", "{input}")]
    )
    question_answer_chain = create_stuff_documents_chain(make_client(), prompt)
    rag_chain = create_retrieval_chain(db.as_retriever(), question_answer_chain)

    answer = await rag_chain.ainvoke(
        {"input": f"Keywords: {keywords}, Minimum JIF: {impact_factor},Publisher list: {preferred_publisher}"}
    )
    return QueryResponse(
        result=_parse_llm_journals(answer["answer"], keywords, impact_factor)
    )
552
-
553
- # Run the app with Uvicorn
554
- # Command: uvicorn app:app --reload
 
 
1
+ from fastapi import FastAPI, HTTPException, Query
2
+ from pydantic import BaseModel
3
+ import os
4
+ from langchain_community.embeddings import HuggingFaceEmbeddings
5
+ from langchain_community.vectorstores import FAISS
6
+ from langchain_community.document_loaders import CSVLoader
7
+ from langchain_openai import ChatOpenAI
8
+ from langchain_groq import ChatGroq
9
+ from langchain_core.prompts import ChatPromptTemplate
10
+ from langchain.chains.combine_documents import create_stuff_documents_chain
11
+ from langchain.chains import create_retrieval_chain
12
+ from langchain_google_genai import ChatGoogleGenerativeAI
13
+ from dotenv import load_dotenv
14
+ from fastapi.responses import PlainTextResponse
15
+ from fastapi.middleware.cors import CORSMiddleware
16
+ import asyncio
17
+ import json
18
+ import re
19
+ # Load environment variables
20
+ load_dotenv()
21
+ os.environ["GOOGLE_API_KEY"] = os.getenv("GOOGLE_API_KEY")
22
+ os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")
23
+ os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
24
+ key = os.getenv("GOOGLE_API_KEY")
25
+ # Define paths
26
+ DB_FAISS_PATH = "bgi/db_faiss"
27
+
28
+ # Initialize FastAPI app
29
+ app = FastAPI()
30
+ app.add_middleware(
31
+ CORSMiddleware,
32
+ allow_origins=["*"], # Add the React app's URL
33
+ allow_credentials=True,
34
+ allow_methods=["*"], # Allow all HTTP methods
35
+ allow_headers=["*"], # Allow all headers
36
+ )
37
+ # Initialize variables
38
+ embeddings = None
39
+ db = None
40
+
41
+ # Load or create FAISS vector store
42
@app.on_event("startup")
def load_vector_store():
    """Populate the module-level ``embeddings`` and ``db`` at application startup.

    When ``DB_FAISS_PATH`` exists the saved FAISS index is loaded from it;
    otherwise a new index is built from ``Final_Research_Dataset_2.csv`` and
    saved to ``DB_FAISS_PATH``.
    """
    global embeddings, db

    def _new_embeddings():
        # Both paths use the same embedding model on CPU.
        return HuggingFaceEmbeddings(model_name='BAAI/bge-small-en', model_kwargs={'device': 'cpu'})

    if not os.path.exists(DB_FAISS_PATH):
        print("Creating new FAISS vector store.")
        rows = CSVLoader(
            file_path="Final_Research_Dataset_2.csv",
            encoding="utf-8",
            csv_args={'delimiter': ','},
        ).load()
        embeddings = _new_embeddings()
        db = FAISS.from_documents(rows, embeddings)
        db.save_local(DB_FAISS_PATH)
        return

    print("Loading existing FAISS vector store.")
    embeddings = _new_embeddings()
    # NOTE(review): this flag opts in to deserializing the saved index; only
    # safe while the contents of DB_FAISS_PATH are trusted.
    db = FAISS.load_local(DB_FAISS_PATH, embeddings, allow_dangerous_deserialization=True)
    print("Vector store loaded.")
57
+
58
+
59
+ # Define request and response models
60
+ from typing import List, Optional
61
+
62
class FilterCriteria(BaseModel):
    # Filters and model selection sent by the client with each /query request.
    impactFactor: float  # JIF threshold applied to returned journals
    firstDecisionTime: int  # accepted from the client; not read in the visible code
    # Explicit default: under Pydantic v2 an Optional field without a default
    # is still required, which would reject requests that omit the publisher.
    publisher: Optional[str] = None
    llmModel: str  # compared against the names listed by GET /models
67
+
68
class QueryRequest(BaseModel):
    # Request body of POST /query.
    abstract: str  # text handed to the keyword-extraction model
    criteria: FilterCriteria  # filters and model choice for this request
71
+
72
class Journal(BaseModel):
    # One recommended journal as returned to the client.
    id: int  # position number assigned by query(), starting at 1
    Name: str
    JIF: float  # Journal Impact Factor
    Category: str  # left empty by the LLM-backed branches of query()
    Keywords: str
    Publisher: str
    Decision_Time: int  # taken from the decision-time field of the LLM output
80
+
81
+ # Define the QueryResponse model with a list of journals
82
class QueryResponse(BaseModel):
    # Response body of POST /query.
    result: List[Journal]  # journals that passed the JIF filter; may be empty
84
+
85
+
86
@app.get("/", response_class=PlainTextResponse)
def read_root():
    # Plain-text landing message served at the API root.
    greeting = "Welcome to the Journal Recommender API!"
    return greeting
89
+ # Define models
90
@app.get("/models")
def get_models():
    # Model names that query() accepts in criteria.llmModel.
    model_names = ["openai", "groq", "mixtral", "gemini-pro", "faiss"]
    return dict(available_models=model_names)
93
+
94
def fix_incomplete_json(raw_response):
    """Best-effort repair of truncated JSON text.

    Closes an unterminated string, drops a dangling trailing comma and
    appends every missing ``}`` / ``]`` in correct nesting order, then parses
    the result.

    Args:
        raw_response: JSON text that may have been cut off mid-document.

    Returns:
        The parsed JSON value (usually a list or dict), or ``None`` when the
        input is not a string or still cannot be parsed after the repair.
    """
    if not isinstance(raw_response, str):
        return None

    candidate = raw_response.strip()

    # Track the closers still owed, innermost last. Brackets inside string
    # literals are ignored so values such as "x}" do not skew the balance.
    pending_closers = []
    in_string = False
    escaped = False
    for ch in candidate:
        if in_string:
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
        elif ch == '"':
            in_string = True
        elif ch == "{":
            pending_closers.append("}")
        elif ch == "[":
            pending_closers.append("]")
        elif ch in "}]" and pending_closers:
            pending_closers.pop()

    if in_string:
        # The text was cut inside a string literal: terminate it.
        candidate += '"'
    else:
        # A comma left dangling by truncation would make the JSON invalid.
        candidate = candidate.rstrip(",").rstrip()

    candidate += "".join(reversed(pending_closers))

    try:
        return json.loads(candidate)
    except json.JSONDecodeError as e:
        print(f"Error fixing JSON: {e}")
        return None
114
+
115
+
116
+ # Query endpoint
117
+ @app.post("/query", response_model=QueryResponse)
118
+ async def query(request: QueryRequest):
119
+ global db
120
+ if not db:
121
+ raise HTTPException(status_code=500, detail="Vector store not loaded.")
122
+
123
+ query_text = request.abstract
124
+ model_choice = request.criteria.llmModel
125
+ impact_factor = request.criteria.impactFactor
126
+ preferred_publisher = request.criteria.publisher
127
+ # Perform the query
128
+ docs = db.similarity_search(query_text, k=5)
129
+ context = "\n".join([doc.page_content for doc in docs])
130
+
131
+ messages = [
132
+ {
133
+ "role": "system",
134
+ "content": (
135
+ "Give a strict comma-separated list of exactly 15 keywords from the following text. "
136
+ "Give a strict comma-separated list of exactly 15 keywords from the following text. "
137
+ "Do not include any bullet points, introductory text, or ending text. "
138
+ "No introductory or ending text strictly" # Added to ensure can be removed if results deteriorate
139
+ "Do not say anything like 'Here are the keywords.' "
140
+ "Only return the keywords, strictly comma-separated, without any additional words."
141
+ ),
142
+ },
143
+ {"role": "user", "content": query_text},
144
+ ]
145
+ llm = ChatGroq(model="llama3-8b-8192", temperature=0)
146
+ ai_msg = llm.invoke(messages)
147
+ keywords = ai_msg.content.split("keywords extracted from the text:\n")[-1].strip()
148
+ print("Keywords:", keywords)
149
+ if model_choice == "openai":
150
+ retriever = db.as_retriever()
151
+
152
+ # Set up system prompt
153
+ system_prompt = (
154
+ f"You are a specialized Journal recommender that compares all journals in database to given research paper keywords and based on JIF and publisher gives result."
155
+ f"From the provided context, recommend all journals that are suitable for research paper with {keywords} keywords."
156
+ f"Ensure that you include **every** journal with a Journal Impact Factor (JIF) strictly greater than {impact_factor}, and the Journal must be only from any Publishers in list: {preferred_publisher}. And Pls show that jif as in Context database "
157
+ f"Make sure to include both exact matches and related journals, and prioritize including **all relevant high-JIF journals without repetition**. "
158
+ f"Present the results in a json format with the following information: Journal Name, Publisher, JIF, Decsion Time. "
159
+ f"Ensure no introductory or ending texts are included. Give max 30 results"
160
+ "Context: {context}"
161
+ )
162
+ print(os.environ["OPENAI_API_KEY"])
163
+
164
+ prompt = ChatPromptTemplate.from_messages(
165
+ [("system", system_prompt), ("user", "{input}")]
166
+ )
167
+
168
+
169
+ async def create_chain():
170
+ client = ChatOpenAI(model="gpt-4o")
171
+ return create_stuff_documents_chain(client, prompt)
172
+
173
+ # Create the question-answer chain using async function
174
+ question_answer_chain = await create_chain()
175
+ rag_chain = create_retrieval_chain(retriever, question_answer_chain)
176
+
177
+ # Ensure the vector dimensions match the FAISS index
178
+
179
+ # Invoke the RAG chain
180
+ answer = rag_chain.invoke(
181
+ {"input": f"Keywords: {keywords}, Minimum JIF: {impact_factor},Publisher list: {preferred_publisher}"}
182
+ )
183
+
184
+ # Inspect the result structure
185
+ result = []
186
+ raw_response = answer['answer']
187
+ cleaned_response = raw_response.strip('```json\n').strip('```').strip()
188
+
189
+ # Parse the cleaned JSON response
190
+ try:
191
+ json_response = json.loads(cleaned_response)
192
+
193
+ # Initialize an empty list to hold the journal objects
194
+ result = []
195
+
196
+ # Process the JSON data and create Journal objects
197
+ for i, journal in enumerate(json_response):
198
+ try:
199
+ journal_name = journal.get('Journal Name')
200
+ publisher = journal.get('Publisher')
201
+ jif = float(journal.get('JIF', 0)) # Ensure valid float
202
+ decision_time = journal.get('Decsion Time', 0) # Default to 0 if not available
203
+
204
+ # Only include if JIF is greater than the minimum threshold
205
+ if jif > impact_factor:
206
+ result.append(
207
+ Journal(
208
+ id=i + 1,
209
+ Name=journal_name,
210
+ Publisher=publisher,
211
+ JIF=jif,
212
+ Category="", # Set to empty if not available
213
+ Keywords=keywords, # Use provided keywords
214
+ Decision_Time=decision_time,
215
+ )
216
+ )
217
+ except Exception as e:
218
+ print(f"Error processing journal data: {e}")
219
+
220
+ except json.JSONDecodeError as e:
221
+ print(f"Error parsing JSON response: {e}")
222
+ result = []
223
+
224
+ # Return the result wrapped in a QueryResponse
225
+ return QueryResponse(result=result)
226
+ elif model_choice == "groq":
227
+ retriever = db.as_retriever()
228
+
229
+ # Set up system prompt
230
+ system_prompt = (
231
+ f"You are a specialized Journal recommender that compares all journals in database to given research paper keywords and based on JIF and publisher gives result."
232
+ f"From the provided context, recommend all journals that are suitable for research paper with {keywords} keywords."
233
+ f"Ensure that you include **every** journal with a Journal Impact Factor (JIF) strictly greater than {impact_factor}, and the Journal must be only from any Publishers in list: {preferred_publisher}. And Pls show that jif as in Context database "
234
+ f"Make sure to include both exact matches and related journals, and prioritize including **all relevant high-JIF journals without repetition**. "
235
+ f"Present the results in a json format with the following information: Journal Name, Publisher, JIF, Decsion Time. "
236
+ f"Ensure no introductory or ending texts are included. Dont give more than 10 results"
237
+ "Context: {context}"
238
+ )
239
+
240
+
241
+
242
+ prompt = ChatPromptTemplate.from_messages(
243
+ [("system", system_prompt), ("user", "{input}")]
244
+ )
245
+
246
+ # Create the question-answer chain
247
async def create_chain():
    """Build the document-combining chain for the llama-3.2-3b-preview branch.

    Takes no arguments; ``prompt`` is read from the enclosing scope.
    Nothing is awaited in the body, but the coroutine form is kept because
    the call site awaits the result.

    Returns:
        Whatever ``create_stuff_documents_chain`` returns for the Groq chat
        model and ``prompt``.
    """
    # temperature=0 is handed to the chat model unchanged.
    groq_llm = ChatGroq(temperature=0, model="llama-3.2-3b-preview")
    stuff_chain = create_stuff_documents_chain(groq_llm, prompt)
    return stuff_chain
250
+
251
+ # Create the question-answer chain using async function
252
+ question_answer_chain = await create_chain()
253
+ rag_chain = create_retrieval_chain(retriever, question_answer_chain)
254
+
255
+ # Ensure the vector dimensions match the FAISS index
256
+
257
+ # Invoke the RAG chain
258
+ answer = rag_chain.invoke(
259
+ {"input": f"Keywords: {keywords}, Minimum JIF: {impact_factor},Publisher list: {preferred_publisher}"}
260
+ )
261
+
262
+ # Inspect the result structure
263
+ result = []
264
+ raw_response = answer['answer']
265
+
266
+ cleaned_response = raw_response.strip('```json\n').strip('```').strip()
267
+
268
+ # Parse the cleaned JSON response
269
+ try:
270
+ # Parse the cleaned response
271
+ print("Cleaned Response:", cleaned_response) # For debugging
272
+ json_response = json.loads(cleaned_response)
273
+
274
+ # Initialize an empty list to hold the journal objects
275
+ result = []
276
+
277
+ # Process the JSON data and create Journal objects
278
+ for i, journal in enumerate(json_response["journals"]): # Accessing the 'journals' key
279
+ print("Journal entry:", journal) # For debugging
280
+
281
+ try:
282
+ if isinstance(journal, dict): # Ensure journal is a dictionary
283
+ journal_name = journal.get('Journal Name')
284
+ publisher = journal.get('Publisher')
285
+ jif = float(journal.get('JIF', 0)) # Ensure valid float
286
+ decision_time = journal.get('Decision Time', 0) # Default to 0 if not available
287
+
288
+ # Only include if JIF is greater than the minimum threshold
289
+ if jif > impact_factor:
290
+ result.append(
291
+ Journal(
292
+ id=i + 1,
293
+ Name=journal_name,
294
+ Publisher=publisher,
295
+ JIF=jif,
296
+ Category="", # Set to empty if not available
297
+ Keywords=keywords, # Use provided keywords
298
+ Decision_Time=decision_time,
299
+ )
300
+ )
301
+ else:
302
+ print(f"Skipping invalid journal entry: {journal}")
303
+ except Exception as e:
304
+ print(f"Error processing journal data: {e}")
305
+
306
+ except json.JSONDecodeError as e:
307
+ print(f"Error parsing JSON response: {e}")
308
+ result = []
309
+
310
+ # Return the result wrapped in a QueryResponse
311
+ return QueryResponse(result=result)
312
+
313
+
314
+ elif model_choice == "mixtral":
315
+ retriever = db.as_retriever()
316
+
317
+ # Set up system prompt
318
+ system_prompt = (
319
+ f"You are a specialized Journal recommender that compares all journals in database to given research paper keywords and based on JIF and publisher gives result."
320
+ f"From the provided context, recommend all journals that are suitable for research paper with {keywords} keywords."
321
+ f"Ensure that you include **every** journal with a Journal Impact Factor (JIF) strictly greater than {impact_factor}, and the Journal must be only from any Publishers in list: {preferred_publisher}. And Pls show that jif as in Context database "
322
+ f"Make sure to include both exact matches and related journals, and prioritize including **all relevant high-JIF journals without repetition**. "
323
+ f"Present the results in a json format with the following information: Journal Name, Publisher, JIF, Decsion Time. "
324
+ f"Ensure no introductory or ending texts are included. Dont give more than 10 results"
325
+ "Context: {context}"
326
+ )
327
+
328
+ prompt = ChatPromptTemplate.from_messages(
329
+ [("system", system_prompt), ("user", "{input}")]
330
+ )
331
+
332
+ # Create the question-answer chain
333
+
334
+
335
async def create_chain():
    """Build the document-combining chain for the mixtral-8x7b-32768 branch.

    Takes no arguments; ``prompt`` is read from the enclosing scope.
    The body awaits nothing; it stays a coroutine because the caller
    awaits it.

    Returns:
        Whatever ``create_stuff_documents_chain`` returns for the Groq chat
        model and ``prompt``.
    """
    return create_stuff_documents_chain(
        ChatGroq(model="mixtral-8x7b-32768", temperature=0),
        prompt,
    )
338
+
339
+ # Create the question-answer chain using async function
340
+ question_answer_chain = await create_chain()
341
+ rag_chain = create_retrieval_chain(retriever, question_answer_chain)
342
+
343
+ # Ensure the vector dimensions match the FAISS index
344
+
345
+ # Invoke the RAG chain
346
+ answer = rag_chain.invoke(
347
+ {"input": f"Keywords: {keywords}, Minimum JIF: {impact_factor},Publisher list: {preferred_publisher}"}
348
+ )
349
+
350
+ # Inspect the result structure
351
+ result = []
352
+ raw_response = answer['answer']
353
+
354
+ cleaned_response = raw_response.strip('```json\n').strip('```').strip()
355
+
356
+ # Parse the cleaned JSON response
357
+ try:
358
+ # Parse the cleaned response
359
+ print("Cleaned Response:", cleaned_response) # For debugging
360
+ json_response = json.loads(cleaned_response)
361
+
362
+ # Initialize an empty list to hold the journal objects
363
+ result = []
364
+
365
+ # Process the JSON data and create Journal objects
366
+ for i, journal in enumerate(json_response): # Iterate directly over the list
367
+ print("Journal entry:", journal) # For debugging
368
+
369
+ try:
370
+ if isinstance(journal, dict): # Ensure journal is a dictionary
371
+ journal_name = journal.get('Journal Name')
372
+ publisher = journal.get('Publisher')
373
+ jif = float(journal.get('JIF', 0)) # Ensure valid float
374
+ decision_time = journal.get('Decsion Time', 0) # Default to 0 if not available
375
+
376
+ # Only include if JIF is greater than the minimum threshold
377
+ if jif > impact_factor:
378
+ result.append(
379
+ Journal(
380
+ id=i + 1,
381
+ Name=journal_name,
382
+ Publisher=publisher,
383
+ JIF=jif,
384
+ Category="", # Set to empty if not available
385
+ Keywords=keywords, # Use provided keywords
386
+ Decision_Time=decision_time,
387
+ )
388
+ )
389
+ else:
390
+ print(f"Skipping invalid journal entry: {journal}")
391
+ except Exception as e:
392
+ print(f"Error processing journal data: {e}")
393
+
394
+ except json.JSONDecodeError as e:
395
+ print(f"Error parsing JSON response: {e}")
396
+ result = []
397
+
398
+ # Return the result wrapped in a QueryResponse
399
+ return QueryResponse(result=result)
400
+
401
+ elif model_choice == "gemini-pro":
402
+ print("Using Gemini-Pro model")
403
+ retriever = db.as_retriever()
404
+
405
+ # Set up system prompt
406
+ system_prompt = (
407
+ f"You are a specialized Journal recommender that compares all journals in database to given research paper keywords and based on JIF and publisher gives result."
408
+ f"From the provided context, recommend all journals that are suitable for research paper with {keywords} keywords."
409
+ f"Ensure that you include **every** journal with a Journal Impact Factor (JIF) strictly greater than {impact_factor}, and the Journal must be only from any Publishers in list: {preferred_publisher}. And Pls show that jif as in Context database "
410
+ f"Make sure to include both exact matches and related journals, and prioritize including **all relevant high-JIF journals without repetition**. "
411
+ f"Present the results in a json format with the following information: Journal Name, Publisher, JIF, Decsion Time. "
412
+ f"Ensure no introductory or ending texts are included."
413
+ "Context: {context}"
414
+ )
415
+
416
+ prompt = ChatPromptTemplate.from_messages(
417
+ [("system", system_prompt), ("user", "{input}")]
418
+ )
419
+
420
async def create_chain():
    """Build the document-combining chain for the gemini-pro branch.

    Takes no arguments; ``prompt`` comes from the enclosing scope and
    ``key`` from module level. The body awaits nothing; it stays a
    coroutine because the caller awaits it.

    Returns:
        Whatever ``create_stuff_documents_chain`` returns for the Gemini
        chat model and ``prompt``.
    """
    # Keyword arguments are collected first, then passed through verbatim.
    gemini_settings = {
        "model": "gemini-pro",
        "google_api_key": key,
        "convert_system_message_to_human": True,
    }
    gemini_llm = ChatGoogleGenerativeAI(**gemini_settings)
    return create_stuff_documents_chain(gemini_llm, prompt)
427
+
428
+ # Create the question-answer chain using async function
429
+ question_answer_chain = await create_chain()
430
+ rag_chain = create_retrieval_chain(retriever, question_answer_chain)
431
+
432
+
433
+ # Ensure the vector dimensions match the FAISS index
434
+
435
+ # Invoke the RAG chain
436
+ answer = rag_chain.invoke(
437
+ {"input": f"Keywords: {keywords}, Minimum JIF: {impact_factor},Publisher list: {preferred_publisher}"}
438
+ )
439
+
440
+ # Inspect the result structure
441
+ result = []
442
+ raw_response = answer['answer']
443
+ cleaned_response = raw_response.strip('```json\n').strip('```').strip()
444
+
445
+ # Parse the cleaned JSON response
446
+ try:
447
+ json_response = json.loads(cleaned_response)
448
+
449
+ # Initialize an empty list to hold the journal objects
450
+ result = []
451
+
452
+ # Process the JSON data and create Journal objects
453
+ for i, journal in enumerate(json_response):
454
+ try:
455
+ journal_name = journal.get('Journal Name')
456
+ publisher = journal.get('Publisher')
457
+ jif = float(journal.get('JIF', 0)) # Ensure valid float
458
+ decision_time = journal.get('Decsion Time', 0) # Default to 0 if not available
459
+
460
+ # Only include if JIF is greater than the minimum threshold
461
+ if jif > impact_factor:
462
+ result.append(
463
+ Journal(
464
+ id=i + 1,
465
+ Name=journal_name,
466
+ Publisher=publisher,
467
+ JIF=jif,
468
+ Category="", # Set to empty if not available
469
+ Keywords=keywords, # Use provided keywords
470
+ Decision_Time=decision_time,
471
+ )
472
+ )
473
+ except Exception as e:
474
+ print(f"Error processing journal data: {e}")
475
+
476
+ except json.JSONDecodeError as e:
477
+ print(f"Error parsing JSON response: {e}")
478
+ result = []
479
+
480
+ # Return the result wrapped in a QueryResponse
481
+ return QueryResponse(result=result)
482
+ elif model_choice == "faiss":
483
+ embeddings = HuggingFaceEmbeddings(
484
+ model_name="BAAI/bge-small-en", model_kwargs={"device": "cpu"}
485
+ )
486
+ jif = impact_factor # Minimum JIF value for filtering
487
+ publisher = preferred_publisher # Preferred publisher list or "no preference"
488
+
489
+ # Load the FAISS index from local storage
490
+ db1 = FAISS.load_local(DB_FAISS_PATH, embeddings, allow_dangerous_deserialization=True)
491
+
492
+ # Embed the query
493
+ query_embedding = embeddings.embed_query(keywords)
494
+
495
+ # Perform similarity search with FAISS (retrieve top 20 results)
496
+ results = db1.similarity_search_by_vector(query_embedding, k=20)
497
+
498
+ # Prepare the context for processing results
499
+ context = "\n\n".join(doc.page_content for doc in results)
500
+
501
+ # Apply filters for JIF and publisher
502
+ min_jif = jif
503
+ valid_publishers = publisher if publisher != ["no preference"] else None
504
+
505
+ # Split the output based on each entry starting with 'Name: '
506
+ entries = re.split(r"\n(?=Name:)", context.strip())
507
+
508
+ # Initialize an empty list to hold the Journal models
509
+ journal_list = []
510
+
511
+ # Process each entry
512
+ for entry in entries:
513
+ # Use regex to capture different fields
514
+ name = re.search(r"Name: (.+)", entry)
515
+ jif_match = re.search(r"JIF: (.+)", entry)
516
+ category = re.search(r"Category: (.+)", entry)
517
+ keywords_match = re.search(r"Keywords: (.+)", entry)
518
+ publisher_match = re.search(r"Publisher: (.+)", entry)
519
+ first_decision_match = re.search(r"Decsion Time: (.+)", entry)
520
+
521
+ if jif_match :
522
+ # Extract values from regex matches
523
+ name_value = name.group(1).strip()
524
+ jif_value = float(jif_match.group(1).strip())
525
+ category_value = category.group(1).strip()
526
+ keywords_value = keywords_match.group(1).strip()
527
+ publisher_value = publisher_match.group(1).strip()
528
+ decision_time = first_decision_match.group(1).strip()
529
+ # Filter based on JIF and publisher preferences
530
+ if jif_value >= min_jif :
531
+ # Create the Journal model instance
532
+ journal = Journal(
533
+ id=len(journal_list) + 1, # Incrementing ID for each journal
534
+ Name=name_value,
535
+ JIF=jif_value,
536
+ Category=category_value,
537
+ Keywords=keywords_value,
538
+ Publisher=publisher_value,
539
+ Decision_Time = decision_time
540
+ )
541
+
542
+ # Add the journal to the list
543
+ journal_list.append(journal)
544
+
545
+ # Return the list of journals as a response or process it further
546
+ return {"result": [journal.dict() for journal in journal_list]}
547
+ else:
548
+ raise HTTPException(status_code=400, detail="Invalid model choice.")
549
+
550
+ # Generate response using LLM
551
+ response = llm.predict(f"Context: {context}\n\nQuestion: {query_text}")
552
+ return QueryResponse(result=response)
553
+
554
+ # Run the app with Uvicorn
555
+ # Command: uvicorn app:app --reload