TabasumDev committed on
Commit
a493d1c
·
verified ·
1 Parent(s): 57038dd

Update app.py

Files changed (1)
  1. app.py +193 -54
app.py CHANGED
@@ -278,6 +278,175 @@
  # ###################################################################################


+ # import streamlit as st
+ # import os
+ # import re
+ # import torch
+ # from transformers import AutoModelForCausalLM, AutoTokenizer
+ # from PyPDF2 import PdfReader
+ # from peft import get_peft_model, LoraConfig, TaskType
+
+ # # ✅ Force CPU execution for Hugging Face Spaces
+ # device = torch.device("cpu")
+
+ # # 🔹 Load IBM Granite Model (CPU-Compatible)
+ # MODEL_NAME = "ibm-granite/granite-3.1-2b-instruct"
+
+ # model = AutoModelForCausalLM.from_pretrained(
+ #     MODEL_NAME,
+ #     device_map="cpu",  # Force CPU execution
+ #     torch_dtype=torch.float32  # Use float32 since Hugging Face runs on CPU
+ # )
+
+ # tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+
+ # # 🔹 Apply LoRA Fine-Tuning Configuration
+ # lora_config = LoraConfig(
+ #     r=8,
+ #     lora_alpha=32,
+ #     target_modules=["q_proj", "v_proj"],
+ #     lora_dropout=0.1,
+ #     bias="none",
+ #     task_type=TaskType.CAUSAL_LM
+ # )
+ # model = get_peft_model(model, lora_config)
+ # model.eval()
+
+ # # 🛠 Function to Read & Extract Text from PDFs
+ # # def read_files(file):
+ # #     file_context = ""
+ # #     try:
+ # #         reader = PdfReader(file)
+ # #         for page in reader.pages:
+ # #             text = page.extract_text()
+ # #             if text:
+ # #                 file_context += text + "\n"
+ # #     except Exception as e:
+ # #         st.error(f"⚠️ Error reading PDF file: {e}")
+ # #         return ""
+
+ # #     return file_context.strip()
+
+ # # 🛠 Function to Read & Extract Text from PDFs
+ # def read_files(file):
+ #     file_context = ""
+ #     reader = PdfReader(file)
+
+ #     for page in reader.pages:
+ #         text = page.extract_text()
+ #         if text:
+ #             file_context += text + "\n"
+
+ #     if not file_context.strip():
+ #         return "⚠️ No text found. The document might be scanned or encrypted."
+
+ #     return file_context.strip()
+
+
+ # # 🛠 Function to Format AI Prompts
+ # def format_prompt(system_msg, user_msg, file_context=""):
+ #     if file_context:
+ #         system_msg += f" The user has provided a contract document. Use its context to generate insights, but do not repeat or summarize the document itself."
+ #     return [
+ #         {"role": "system", "content": system_msg},
+ #         {"role": "user", "content": user_msg}
+ #     ]
+
+ # # 🛠 Function to Generate AI Responses
+ # def generate_response(input_text, max_tokens=1000, top_p=0.9, temperature=0.7):
+ #     st.write("🔍 Generating response...")  # Debugging message
+ #     model_inputs = tokenizer([input_text], return_tensors="pt").to(device)
+
+ #     with torch.no_grad():
+ #         output = model.generate(
+ #             **model_inputs,
+ #             max_new_tokens=max_tokens,
+ #             do_sample=True,
+ #             top_p=top_p,
+ #             temperature=temperature,
+ #             num_return_sequences=1,
+ #             pad_token_id=tokenizer.eos_token_id
+ #         )
+
+ #     response = tokenizer.decode(output[0], skip_special_tokens=True)
+ #     st.write("✅ Response Generated!")  # Debugging message
+ #     return response
+
+ # # 🛠 Function to Clean AI Output
+ # def post_process(text):
+ #     cleaned = re.sub(r'戥+', '', text)  # Remove unwanted symbols
+ #     lines = cleaned.splitlines()
+ #     unique_lines = list(dict.fromkeys([line.strip() for line in lines if line.strip()]))
+ #     return "\n".join(unique_lines)
+
+ # # 🛠 Function to Handle RAG with IBM Granite & Streamlit
+ # def granite_simple(prompt, file):
+ #     file_context = read_files(file) if file else ""
+
+ #     # Debugging: Show extracted file content preview
+ #     if not file_context:
+ #         st.error("⚠️ No content extracted from the PDF. It might be a scanned image or encrypted.")
+ #         return "Error: No content found in the document."
+
+ #     system_message = "You are IBM Granite, a legal AI assistant specializing in contract analysis."
+
+ #     messages = format_prompt(system_message, prompt, file_context)
+ #     input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+ #     response = generate_response(input_text)
+ #     return post_process(response)
+
+ # # 🔹 Streamlit UI
+ # def main():
+ #     st.set_page_config(page_title="Contract Analysis AI", page_icon="📜")
+
+ #     st.title("📜 AI-Powered Contract Analysis Tool")
+ #     st.write("Upload a contract document (PDF) for a detailed AI-driven legal and technical analysis.")
+
+ #     # 🔹 Sidebar Settings
+ #     with st.sidebar:
+ #         st.header("⚙️ Settings")
+ #         max_tokens = st.slider("Max Tokens", 50, 1000, 250, 50)
+ #         top_p = st.slider("Top P (sampling)", 0.1, 1.0, 0.9, 0.1)
+ #         temperature = st.slider("Temperature (creativity)", 0.1, 1.0, 0.7, 0.1)
+
+ #     # 🔹 File Upload Section
+ #     uploaded_file = st.file_uploader("📂 Upload a contract document (PDF)", type="pdf")
+
+ #     if uploaded_file:
+ #         st.success(f"✅ File uploaded successfully! File Name: {uploaded_file.name}")
+ #         st.write(f"**File Size:** {uploaded_file.size / 1024:.2f} KB")
+
+ #         # Debugging: Show extracted text preview
+ #         pdf_text = read_files(uploaded_file)
+ #         if pdf_text:
+ #             st.write("**Extracted Sample Text:**")
+ #             st.code(pdf_text[:500])  # Show first 500 characters
+ #         else:
+ #             st.error("⚠️ No readable text found in the document.")
+
+ #         st.write("Click the button below to analyze the contract.")
+
+ #         # Force button to always render
+ #         st.markdown('<style>div.stButton > button {display: block; width: 100%;}</style>', unsafe_allow_html=True)
+
+ #         if st.button("🔍 Analyze Document"):
+ #             with st.spinner("Analyzing contract document... ⏳"):
+ #                 final_answer = granite_simple(
+ #                     "Perform a detailed technical analysis of the attached contract document, highlighting potential risks, legal pitfalls, compliance issues, and areas where contractual terms may lead to future disputes or operational challenges.",
+ #                     uploaded_file
+ #                 )
+
+ #                 # 🔹 Display Analysis Result
+ #                 st.subheader("📑 Analysis Result")
+ #                 st.write(final_answer)
+
+ # 🔥 Run Streamlit App
+ # if __name__ == '__main__':
+ #     main()
+
+
+
  import streamlit as st
  import os
  import re
@@ -286,7 +455,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
  from PyPDF2 import PdfReader
  from peft import get_peft_model, LoraConfig, TaskType

- # ✅ Force CPU execution for Hugging Face Spaces
+ # ✅ Force CPU execution for Streamlit Cloud
  device = torch.device("cpu")

  # 🔹 Load IBM Granite Model (CPU-Compatible)
@@ -295,7 +464,7 @@ MODEL_NAME = "ibm-granite/granite-3.1-2b-instruct"
  model = AutoModelForCausalLM.from_pretrained(
      MODEL_NAME,
      device_map="cpu",  # Force CPU execution
-     torch_dtype=torch.float32  # Use float32 since Hugging Face runs on CPU
+     torch_dtype=torch.float32  # Use float32 since Streamlit runs on CPU
  )

  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
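
Both comment variants pin the model to CPU with float32, which is the safe choice for a CPU-only runtime; only the stated platform changes. Purely as a sketch (the GPU fallback below is an assumption, not part of this commit), the same load could be made device-aware:

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    MODEL_NAME = "ibm-granite/granite-3.1-2b-instruct"

    # Assumption: use bfloat16 on GPU when available; fall back to CPU/float32.
    if torch.cuda.is_available():
        device_map, dtype = "auto", torch.bfloat16
    else:
        device_map, dtype = "cpu", torch.float32

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map=device_map,
        torch_dtype=dtype,
    )
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)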
@@ -312,21 +481,6 @@ lora_config = LoraConfig(
  model = get_peft_model(model, lora_config)
  model.eval()

- # 🛠 Function to Read & Extract Text from PDFs
- # def read_files(file):
- #     file_context = ""
- #     try:
- #         reader = PdfReader(file)
- #         for page in reader.pages:
- #             text = page.extract_text()
- #             if text:
- #                 file_context += text + "\n"
- #     except Exception as e:
- #         st.error(f"⚠️ Error reading PDF file: {e}")
- #         return ""
-
- #     return file_context.strip()
-
  # 🛠 Function to Read & Extract Text from PDFs
  def read_files(file):
      file_context = ""
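
Note that get_peft_model wraps the base model in freshly initialized adapters: with PEFT's default initialization the LoRA update starts at zero, so this wrapper does not change inference output unless trained adapter weights are loaded. A minimal sketch of loading trained weights instead (the adapter path is hypothetical):

    from peft import PeftModel

    # Hypothetical path to LoRA weights previously saved with model.save_pretrained(...)
    model = PeftModel.from_pretrained(model, "path/to/trained-lora-adapter")
    model.eval()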
@@ -337,12 +491,8 @@ def read_files(file):
          if text:
              file_context += text + "\n"

-     if not file_context.strip():
-         return "⚠️ No text found. The document might be scanned or encrypted."
-
      return file_context.strip()

-
  # 🛠 Function to Format AI Prompts
  def format_prompt(system_msg, user_msg, file_context=""):
      if file_context:
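
This commit drops the scanned/encrypted-PDF warning, so read_files now returns an empty string in that case, and the matching guard in granite_simple is also deleted below, meaning the model can be prompted with no contract text at all. One caller-side sketch that would restore the old behavior (assuming it runs inside the Streamlit flow, not part of this commit):

    file_context = read_files(uploaded_file)
    if not file_context:
        st.error("⚠️ No readable text found. The document may be scanned or encrypted.")
        st.stop()  # end this Streamlit run instead of prompting the model with nothing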
@@ -354,9 +504,8 @@ def format_prompt(system_msg, user_msg, file_context=""):

  # 🛠 Function to Generate AI Responses
  def generate_response(input_text, max_tokens=1000, top_p=0.9, temperature=0.7):
-     st.write("🔍 Generating response...")  # Debugging message
      model_inputs = tokenizer([input_text], return_tensors="pt").to(device)
-
+
      with torch.no_grad():
          output = model.generate(
              **model_inputs,
@@ -367,10 +516,8 @@ def generate_response(input_text, max_tokens=1000, top_p=0.9, temperature=0.7):
              num_return_sequences=1,
              pad_token_id=tokenizer.eos_token_id
          )
-
-     response = tokenizer.decode(output[0], skip_special_tokens=True)
-     st.write("✅ Response Generated!")  # Debugging message
-     return response
+
+     return tokenizer.decode(output[0], skip_special_tokens=True)

  # 🛠 Function to Clean AI Output
  def post_process(text):
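
With the debug messages gone, generate_response is a plain helper, but its max_tokens, top_p, and temperature parameters still fall back to their defaults: the sidebar sliders defined in main() are never passed down the call chain. A sketch of threading them through (the signature change is an assumption, not in this commit):

    def granite_simple(prompt, file, max_tokens=1000, top_p=0.9, temperature=0.7):
        file_context = read_files(file) if file else ""
        system_message = "You are IBM Granite, a legal AI assistant specializing in contract analysis."
        messages = format_prompt(system_message, prompt, file_context)
        input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        # Forward the sidebar values instead of relying on the defaults
        response = generate_response(input_text, max_tokens=max_tokens, top_p=top_p, temperature=temperature)
        return post_process(response)

    # Call site in main() would become:
    # final_answer = granite_simple(user_prompt, temp_file_path,
    #                               max_tokens=max_tokens, top_p=top_p, temperature=temperature)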
@@ -382,23 +529,18 @@ def post_process(text):
  # 🛠 Function to Handle RAG with IBM Granite & Streamlit
  def granite_simple(prompt, file):
      file_context = read_files(file) if file else ""
-
-     # Debugging: Show extracted file content preview
-     if not file_context:
-         st.error("⚠️ No content extracted from the PDF. It might be a scanned image or encrypted.")
-         return "Error: No content found in the document."
-
+
      system_message = "You are IBM Granite, a legal AI assistant specializing in contract analysis."
-
+
      messages = format_prompt(system_message, prompt, file_context)
      input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
+
      response = generate_response(input_text)
      return post_process(response)

  # 🔹 Streamlit UI
  def main():
-     st.set_page_config(page_title="Contract Analysis AI", page_icon="📜")
+     st.set_page_config(page_title="Contract Analysis AI", page_icon="📜", layout="wide")

      st.title("📜 AI-Powered Contract Analysis Tool")
      st.write("Upload a contract document (PDF) for a detailed AI-driven legal and technical analysis.")
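
One thing the diff leaves untouched: format_prompt only uses file_context to append a sentence to the system message; the extracted contract text never enters the messages, so the model is asked to analyze a document it cannot see. A sketch of one way to include it (the 8,000-character cap is an arbitrary assumption to keep the CPU prompt small):

    def format_prompt(system_msg, user_msg, file_context=""):
        if file_context:
            system_msg += " The user has provided a contract document. Use its context to generate insights, but do not repeat or summarize the document itself."
            # Assumption: inline a truncated copy of the document so the model can see it
            user_msg = f"Contract text:\n{file_context[:8000]}\n\nTask: {user_msg}"
        return [
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_msg},
        ]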
@@ -413,38 +555,35 @@ def main():
      # 🔹 File Upload Section
      uploaded_file = st.file_uploader("📂 Upload a contract document (PDF)", type="pdf")

-     if uploaded_file:
-         st.success(f"✅ File uploaded successfully! File Name: {uploaded_file.name}")
-         st.write(f"**File Size:** {uploaded_file.size / 1024:.2f} KB")
-
-         # Debugging: Show extracted text preview
-         pdf_text = read_files(uploaded_file)
-         if pdf_text:
-             st.write("**Extracted Sample Text:**")
-             st.code(pdf_text[:500])  # Show first 500 characters
-         else:
-             st.error("⚠️ No readable text found in the document.")
+     if uploaded_file is not None:
+         temp_file_path = "temp_uploaded_contract.pdf"
+         with open(temp_file_path, "wb") as f:
+             f.write(uploaded_file.getbuffer())

-         st.write("Click the button below to analyze the contract.")
+         st.success("✅ File uploaded successfully!")

-         # Force button to always render
-         st.markdown('<style>div.stButton > button {display: block; width: 100%;}</style>', unsafe_allow_html=True)
+         # 🔹 User Input for Analysis
+         user_prompt = "Perform a detailed technical analysis of the attached contract document, highlighting potential risks, legal pitfalls, compliance issues, and areas where contractual terms may lead to future disputes or operational challenges."

          if st.button("🔍 Analyze Document"):
              with st.spinner("Analyzing contract document... ⏳"):
-                 final_answer = granite_simple(
-                     "Perform a detailed technical analysis of the attached contract document, highlighting potential risks, legal pitfalls, compliance issues, and areas where contractual terms may lead to future disputes or operational challenges.",
-                     uploaded_file
-                 )
+                 final_answer = granite_simple(user_prompt, temp_file_path)

                  # 🔹 Display Analysis Result
                  st.subheader("📑 Analysis Result")
                  st.write(final_answer)

+         # 🔹 Remove Temporary File
+         os.remove(temp_file_path)
+
  # 🔥 Run Streamlit App
  if __name__ == '__main__':
      main()

+
+
+
+
  # import streamlit as st
  # from PyPDF2 import PdfReader

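The new upload path writes every upload to the fixed name temp_uploaded_contract.pdf and removes it after the button block, but if granite_simple raises, os.remove never runs and the file is left behind; concurrent sessions also overwrite each other's copy. A more defensive sketch (not part of this commit) using tempfile and try/finally:

    import os
    import tempfile

    # Assumption: runs where `uploaded_file` and `user_prompt` are in scope, as above.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(uploaded_file.getbuffer())
        temp_file_path = tmp.name  # unique per session, no filename collisions
    try:
        final_answer = granite_simple(user_prompt, temp_file_path)
    finally:
        os.remove(temp_file_path)  # clean up even when analysis fails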