TabasumDev committed on
Commit
d1b0bff
·
verified ·
1 Parent(s): d627145

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -48
app.py CHANGED
@@ -444,15 +444,18 @@
444
  # 🔥 Run Streamlit App
445
  # if __name__ == '__main__':
446
  # main()
 
 
 
447
  import streamlit as st
448
  import os
449
  import re
450
  import torch
 
451
  from transformers import AutoModelForCausalLM, AutoTokenizer
452
- from PyPDF2 import PdfReader
453
  from peft import get_peft_model, LoraConfig, TaskType
454
 
455
- # ✅ Force CPU execution for Hugging Face Spaces
456
  device = torch.device("cpu")
457
 
458
  # 🔹 Load IBM Granite Model (CPU-Compatible)
@@ -460,8 +463,8 @@ MODEL_NAME = "ibm-granite/granite-3.1-2b-instruct"
460
 
461
  model = AutoModelForCausalLM.from_pretrained(
462
  MODEL_NAME,
463
- device_map="cpu",
464
- torch_dtype=torch.float32
465
  )
466
 
467
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
@@ -480,35 +483,18 @@ model.eval()
480
 
481
  # 🛠 Function to Read & Extract Text from PDFs
482
  def read_files(uploaded_file):
483
- try:
484
- # 🔥 Step 1: Save file to disk first
485
- temp_pdf_path = "temp_uploaded_file.pdf"
486
- with open(temp_pdf_path, "wb") as f:
487
- f.write(uploaded_file.getbuffer()) # Save the file
488
-
489
- # 🔥 Step 2: Open the saved file and extract text
490
- st.write("πŸ“‚ Processing saved PDF file...") # Debugging
491
- file_context = ""
492
- reader = PdfReader(temp_pdf_path)
493
-
494
- for page in reader.pages:
495
  text = page.extract_text()
496
  if text:
497
  file_context += text + "\n"
498
-
499
- # 🔥 Step 3: Delete the temp file after reading
500
- os.remove(temp_pdf_path)
501
-
502
- if not file_context.strip():
503
- st.error("⚠️ No text found. The document might be scanned or encrypted.")
504
- return ""
505
-
506
- st.write(f"βœ… Extracted {len(file_context)} characters.") # Debugging
507
- return file_context.strip()
508
-
509
- except Exception as e:
510
- st.error(f"⚠️ Error reading PDF: {e}")
511
- return ""
512
 
513
  # 🛠 Function to Format AI Prompts
514
  def format_prompt(system_msg, user_msg, file_context=""):
@@ -538,25 +524,18 @@ def generate_response(input_text, max_tokens=1000, top_p=0.9, temperature=0.7):
538
 
539
  # 🛠 Function to Clean AI Output
540
  def post_process(text):
541
- cleaned = re.sub(r'ζˆ₯+', '', text)
542
  lines = cleaned.splitlines()
543
  unique_lines = list(dict.fromkeys([line.strip() for line in lines if line.strip()]))
544
  return "\n".join(unique_lines)
545
 
546
  # 🛠 Function to Handle RAG with IBM Granite & Streamlit
547
  def granite_simple(prompt, file):
548
- if not file:
549
- st.error("⚠️ No file detected. Please upload a document.")
550
- return ""
551
-
552
- file_context = read_files(file)
553
- if not file_context:
554
- st.error("⚠️ No valid text extracted from the document.")
555
- return ""
556
-
557
  system_message = "You are IBM Granite, a legal AI assistant specializing in contract analysis."
558
- messages = format_prompt(system_message, prompt, file_context)
559
 
 
560
  input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
561
 
562
  response = generate_response(input_text)
@@ -580,8 +559,19 @@ def main():
580
  uploaded_file = st.file_uploader("πŸ“‚ Upload a contract document (PDF)", type="pdf")
581
 
582
  if uploaded_file:
583
- st.success(f"βœ… File uploaded: {uploaded_file.name}")
584
- st.write(f"πŸ“ File Size: {uploaded_file.size / 1024:.2f} KB")
 
 
 
 
 
 
 
 
 
 
 
585
 
586
  if st.button("πŸ” Analyze Document"):
587
  with st.spinner("Analyzing contract document... ⏳"):
@@ -590,11 +580,9 @@ def main():
590
  uploaded_file
591
  )
592
 
593
- if final_answer:
594
- st.subheader("πŸ“‘ Analysis Result")
595
- st.write(final_answer)
596
- else:
597
- st.error("⚠️ No response generated. Please check your input.")
598
 
599
  # 🔥 Run Streamlit App
600
  if __name__ == '__main__':
@@ -605,6 +593,7 @@ if __name__ == '__main__':
605
 
606
 
607
 
 
608
  # import streamlit as st
609
  # from PyPDF2 import PdfReader
610
 
 
444
  # 🔥 Run Streamlit App
445
  # if __name__ == '__main__':
446
  # main()
447
+
448
+
449
+
450
  import streamlit as st
451
  import os
452
  import re
453
  import torch
454
+ import pdfplumber
455
  from transformers import AutoModelForCausalLM, AutoTokenizer
 
456
  from peft import get_peft_model, LoraConfig, TaskType
457
 
458
+ # ✅ Force CPU execution
459
  device = torch.device("cpu")
460
 
461
  # 🔹 Load IBM Granite Model (CPU-Compatible)
 
463
 
464
  model = AutoModelForCausalLM.from_pretrained(
465
  MODEL_NAME,
466
+ device_map="cpu", # Force CPU execution
467
+ torch_dtype=torch.float32
468
  )
469
 
470
  tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
 
483
 
484
  # 🛠 Function to Read & Extract Text from PDFs
485
  def read_files(uploaded_file):
486
+ file_context = ""
487
+
488
+ with pdfplumber.open(uploaded_file) as pdf:
489
+ for page in pdf.pages:
 
 
 
 
 
 
 
 
490
  text = page.extract_text()
491
  if text:
492
  file_context += text + "\n"
493
+
494
+ if not file_context.strip():
495
+ st.error("⚠️ No text extracted. This document may be scanned or encrypted.")
496
+
497
+ return file_context.strip()
 
 
 
 
 
 
 
 
 
498
 
499
  # 🛠 Function to Format AI Prompts
500
  def format_prompt(system_msg, user_msg, file_context=""):
 
524
 
525
  # 🛠 Function to Clean AI Output
526
  def post_process(text):
527
+ cleaned = re.sub(r'ζˆ₯+', '', text) # Remove unwanted symbols
528
  lines = cleaned.splitlines()
529
  unique_lines = list(dict.fromkeys([line.strip() for line in lines if line.strip()]))
530
  return "\n".join(unique_lines)
531
 
532
  # 🛠 Function to Handle RAG with IBM Granite & Streamlit
533
  def granite_simple(prompt, file):
534
+ file_context = read_files(file) if file else ""
535
+
 
 
 
 
 
 
 
536
  system_message = "You are IBM Granite, a legal AI assistant specializing in contract analysis."
 
537
 
538
+ messages = format_prompt(system_message, prompt, file_context)
539
  input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
540
 
541
  response = generate_response(input_text)
 
559
  uploaded_file = st.file_uploader("πŸ“‚ Upload a contract document (PDF)", type="pdf")
560
 
561
  if uploaded_file:
562
+ # ✅ Debugging: Show file info
563
+ st.success(f"βœ… File uploaded: {uploaded_file.name}, Size: {uploaded_file.size / 1024:.2f} KB")
564
+
565
+ # ✅ Extract and preview text
566
+ extracted_text = read_files(uploaded_file)
567
+ if extracted_text:
568
+ st.write("πŸ“œ Extracted Text Preview:")
569
+ st.text_area("Extracted Text", extracted_text[:2000], height=200) # Show first 2000 chars
570
+
571
+ st.write("Click the button below to analyze the contract.")
572
+
573
+ # Force button to always render
574
+ st.markdown('<style>div.stButton > button {display: block; width: 100%;}</style>', unsafe_allow_html=True)
575
 
576
  if st.button("πŸ” Analyze Document"):
577
  with st.spinner("Analyzing contract document... ⏳"):
 
580
  uploaded_file
581
  )
582
 
583
+ # 🔹 Display Analysis Result
584
+ st.subheader("πŸ“‘ Analysis Result")
585
+ st.write(final_answer)
 
 
586
 
587
  # 🔥 Run Streamlit App
588
  if __name__ == '__main__':
 
593
 
594
 
595
 
596
+
597
  # import streamlit as st
598
  # from PyPDF2 import PdfReader
599