TabasumDev committed on
Commit
57038dd
·
verified ·
1 Parent(s): b165b5d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +164 -148
app.py CHANGED
@@ -278,41 +278,41 @@
278
  # ###################################################################################
279
 
280
 
281
- # import streamlit as st
282
- # import os
283
- # import re
284
- # import torch
285
- # from transformers import AutoModelForCausalLM, AutoTokenizer
286
- # from PyPDF2 import PdfReader
287
- # from peft import get_peft_model, LoraConfig, TaskType
288
-
289
- # # βœ… Force CPU execution for Hugging Face Spaces
290
- # device = torch.device("cpu")
291
-
292
- # # πŸ”Ή Load IBM Granite Model (CPU-Compatible)
293
- # MODEL_NAME = "ibm-granite/granite-3.1-2b-instruct"
294
-
295
- # model = AutoModelForCausalLM.from_pretrained(
296
- # MODEL_NAME,
297
- # device_map="cpu", # Force CPU execution
298
- # torch_dtype=torch.float32 # Use float32 since Hugging Face runs on CPU
299
- # )
300
-
301
- # tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
302
-
303
- # # πŸ”Ή Apply LoRA Fine-Tuning Configuration
304
- # lora_config = LoraConfig(
305
- # r=8,
306
- # lora_alpha=32,
307
- # target_modules=["q_proj", "v_proj"],
308
- # lora_dropout=0.1,
309
- # bias="none",
310
- # task_type=TaskType.CAUSAL_LM
311
- # )
312
- # model = get_peft_model(model, lora_config)
313
- # model.eval()
314
-
315
- # # πŸ›  Function to Read & Extract Text from PDFs
316
  # def read_files(file):
317
  # file_context = ""
318
  # try:
@@ -327,131 +327,147 @@
327
 
328
  # return file_context.strip()
329
 
330
- # # πŸ›  Function to Format AI Prompts
331
- # def format_prompt(system_msg, user_msg, file_context=""):
332
- # if file_context:
333
- # system_msg += f" The user has provided a contract document. Use its context to generate insights, but do not repeat or summarize the document itself."
334
- # return [
335
- # {"role": "system", "content": system_msg},
336
- # {"role": "user", "content": user_msg}
337
- # ]
338
-
339
- # # πŸ›  Function to Generate AI Responses
340
- # def generate_response(input_text, max_tokens=1000, top_p=0.9, temperature=0.7):
341
- # st.write("πŸ” Generating response...") # Debugging message
342
- # model_inputs = tokenizer([input_text], return_tensors="pt").to(device)
343
-
344
- # with torch.no_grad():
345
- # output = model.generate(
346
- # **model_inputs,
347
- # max_new_tokens=max_tokens,
348
- # do_sample=True,
349
- # top_p=top_p,
350
- # temperature=temperature,
351
- # num_return_sequences=1,
352
- # pad_token_id=tokenizer.eos_token_id
353
- # )
354
-
355
- # response = tokenizer.decode(output[0], skip_special_tokens=True)
356
- # st.write("βœ… Response Generated!") # Debugging message
357
- # return response
358
-
359
- # # πŸ›  Function to Clean AI Output
360
- # def post_process(text):
361
- # cleaned = re.sub(r'ζˆ₯+', '', text) # Remove unwanted symbols
362
- # lines = cleaned.splitlines()
363
- # unique_lines = list(dict.fromkeys([line.strip() for line in lines if line.strip()]))
364
- # return "\n".join(unique_lines)
365
-
366
- # # πŸ›  Function to Handle RAG with IBM Granite & Streamlit
367
- # def granite_simple(prompt, file):
368
- # file_context = read_files(file) if file else ""
369
-
370
- # # Debugging: Show extracted file content preview
371
- # if not file_context:
372
- # st.error("⚠️ No content extracted from the PDF. It might be a scanned image or encrypted.")
373
- # return "Error: No content found in the document."
374
-
375
- # system_message = "You are IBM Granite, a legal AI assistant specializing in contract analysis."
376
-
377
- # messages = format_prompt(system_message, prompt, file_context)
378
- # input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
379
-
380
- # response = generate_response(input_text)
381
- # return post_process(response)
382
-
383
- # # πŸ”Ή Streamlit UI
384
- # def main():
385
- # st.set_page_config(page_title="Contract Analysis AI", page_icon="πŸ“œ")
386
-
387
- # st.title("πŸ“œ AI-Powered Contract Analysis Tool")
388
- # st.write("Upload a contract document (PDF) for a detailed AI-driven legal and technical analysis.")
389
-
390
- # # πŸ”Ή Sidebar Settings
391
- # with st.sidebar:
392
- # st.header("βš™οΈ Settings")
393
- # max_tokens = st.slider("Max Tokens", 50, 1000, 250, 50)
394
- # top_p = st.slider("Top P (sampling)", 0.1, 1.0, 0.9, 0.1)
395
- # temperature = st.slider("Temperature (creativity)", 0.1, 1.0, 0.7, 0.1)
396
-
397
- # # πŸ”Ή File Upload Section
398
- # uploaded_file = st.file_uploader("πŸ“‚ Upload a contract document (PDF)", type="pdf")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
399
 
400
- # if uploaded_file:
401
- # st.success(f"βœ… File uploaded successfully! File Name: {uploaded_file.name}")
402
- # st.write(f"**File Size:** {uploaded_file.size / 1024:.2f} KB")
403
 
404
- # # Debugging: Show extracted text preview
405
- # pdf_text = read_files(uploaded_file)
406
- # if pdf_text:
407
- # st.write("**Extracted Sample Text:**")
408
- # st.code(pdf_text[:500]) # Show first 500 characters
409
- # else:
410
- # st.error("⚠️ No readable text found in the document.")
411
 
412
- # st.write("Click the button below to analyze the contract.")
 
 
 
 
 
413
 
414
- # # Force button to always render
415
- # st.markdown('<style>div.stButton > button {display: block; width: 100%;}</style>', unsafe_allow_html=True)
 
416
 
417
- # if st.button("πŸ” Analyze Document"):
418
- # with st.spinner("Analyzing contract document... ⏳"):
419
- # final_answer = granite_simple(
420
- # "Perform a detailed technical analysis of the attached contract document, highlighting potential risks, legal pitfalls, compliance issues, and areas where contractual terms may lead to future disputes or operational challenges.",
421
- # uploaded_file
422
- # )
423
 
424
- # # πŸ”Ή Display Analysis Result
425
- # st.subheader("πŸ“‘ Analysis Result")
426
- # st.write(final_answer)
427
-
428
- # # πŸ”₯ Run Streamlit App
429
- # if __name__ == '__main__':
430
- # main()
431
-
432
- import streamlit as st
433
- from PyPDF2 import PdfReader
434
 
435
- st.title("πŸ“‚ PDF Upload Debugger")
436
 
437
- uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
438
 
439
- if uploaded_file:
440
- st.success(f"βœ… File uploaded: {uploaded_file.name}")
441
- st.write(f"πŸ“ File Size: {uploaded_file.size / 1024:.2f} KB")
442
 
443
- try:
444
- reader = PdfReader(uploaded_file)
445
- text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
446
 
447
- if text.strip():
448
- st.subheader("Extracted Text (First 500 characters)")
449
- st.code(text[:500]) # Show a preview of the text
450
- else:
451
- st.error("⚠️ No text found. The document might be scanned or encrypted.")
452
 
453
- except Exception as e:
454
- st.error(f"⚠️ Error reading PDF: {e}")
455
 
456
 
457
  # ###################################################################################
 
278
  # ###################################################################################
279
 
280
 
281
+ import streamlit as st
282
+ import os
283
+ import re
284
+ import torch
285
+ from transformers import AutoModelForCausalLM, AutoTokenizer
286
+ from PyPDF2 import PdfReader
287
+ from peft import get_peft_model, LoraConfig, TaskType
288
+
289
+ # βœ… Force CPU execution for Hugging Face Spaces
290
+ device = torch.device("cpu")
291
+
292
+ # πŸ”Ή Load IBM Granite Model (CPU-Compatible)
293
+ MODEL_NAME = "ibm-granite/granite-3.1-2b-instruct"
294
+
295
+ model = AutoModelForCausalLM.from_pretrained(
296
+ MODEL_NAME,
297
+ device_map="cpu", # Force CPU execution
298
+ torch_dtype=torch.float32 # Use float32 since Hugging Face runs on CPU
299
+ )
300
+
301
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
302
+
303
+ # πŸ”Ή Apply LoRA Fine-Tuning Configuration
304
+ lora_config = LoraConfig(
305
+ r=8,
306
+ lora_alpha=32,
307
+ target_modules=["q_proj", "v_proj"],
308
+ lora_dropout=0.1,
309
+ bias="none",
310
+ task_type=TaskType.CAUSAL_LM
311
+ )
312
+ model = get_peft_model(model, lora_config)
313
+ model.eval()
314
+
315
+ # πŸ›  Function to Read & Extract Text from PDFs
316
  # def read_files(file):
317
  # file_context = ""
318
  # try:
 
327
 
328
  # return file_context.strip()
329
 
330
+ # πŸ›  Function to Read & Extract Text from PDFs
331
+ def read_files(file):
332
+ file_context = ""
333
+ reader = PdfReader(file)
334
+
335
+ for page in reader.pages:
336
+ text = page.extract_text()
337
+ if text:
338
+ file_context += text + "\n"
339
+
340
+ if not file_context.strip():
341
+ return "⚠️ No text found. The document might be scanned or encrypted."
342
+
343
+ return file_context.strip()
344
+
345
+
346
+ # πŸ›  Function to Format AI Prompts
347
+ def format_prompt(system_msg, user_msg, file_context=""):
348
+ if file_context:
349
+ system_msg += f" The user has provided a contract document. Use its context to generate insights, but do not repeat or summarize the document itself."
350
+ return [
351
+ {"role": "system", "content": system_msg},
352
+ {"role": "user", "content": user_msg}
353
+ ]
354
+
355
+ # πŸ›  Function to Generate AI Responses
356
+ def generate_response(input_text, max_tokens=1000, top_p=0.9, temperature=0.7):
357
+ st.write("πŸ” Generating response...") # Debugging message
358
+ model_inputs = tokenizer([input_text], return_tensors="pt").to(device)
359
+
360
+ with torch.no_grad():
361
+ output = model.generate(
362
+ **model_inputs,
363
+ max_new_tokens=max_tokens,
364
+ do_sample=True,
365
+ top_p=top_p,
366
+ temperature=temperature,
367
+ num_return_sequences=1,
368
+ pad_token_id=tokenizer.eos_token_id
369
+ )
370
+
371
+ response = tokenizer.decode(output[0], skip_special_tokens=True)
372
+ st.write("βœ… Response Generated!") # Debugging message
373
+ return response
374
+
375
+ # πŸ›  Function to Clean AI Output
376
+ def post_process(text):
377
+ cleaned = re.sub(r'ζˆ₯+', '', text) # Remove unwanted symbols
378
+ lines = cleaned.splitlines()
379
+ unique_lines = list(dict.fromkeys([line.strip() for line in lines if line.strip()]))
380
+ return "\n".join(unique_lines)
381
+
382
+ # πŸ›  Function to Handle RAG with IBM Granite & Streamlit
383
+ def granite_simple(prompt, file):
384
+ file_context = read_files(file) if file else ""
385
+
386
+ # Debugging: Show extracted file content preview
387
+ if not file_context:
388
+ st.error("⚠️ No content extracted from the PDF. It might be a scanned image or encrypted.")
389
+ return "Error: No content found in the document."
390
+
391
+ system_message = "You are IBM Granite, a legal AI assistant specializing in contract analysis."
392
+
393
+ messages = format_prompt(system_message, prompt, file_context)
394
+ input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
395
+
396
+ response = generate_response(input_text)
397
+ return post_process(response)
398
+
399
+ # πŸ”Ή Streamlit UI
400
+ def main():
401
+ st.set_page_config(page_title="Contract Analysis AI", page_icon="πŸ“œ")
402
+
403
+ st.title("πŸ“œ AI-Powered Contract Analysis Tool")
404
+ st.write("Upload a contract document (PDF) for a detailed AI-driven legal and technical analysis.")
405
+
406
+ # πŸ”Ή Sidebar Settings
407
+ with st.sidebar:
408
+ st.header("βš™οΈ Settings")
409
+ max_tokens = st.slider("Max Tokens", 50, 1000, 250, 50)
410
+ top_p = st.slider("Top P (sampling)", 0.1, 1.0, 0.9, 0.1)
411
+ temperature = st.slider("Temperature (creativity)", 0.1, 1.0, 0.7, 0.1)
412
+
413
+ # πŸ”Ή File Upload Section
414
+ uploaded_file = st.file_uploader("πŸ“‚ Upload a contract document (PDF)", type="pdf")
415
+
416
+ if uploaded_file:
417
+ st.success(f"βœ… File uploaded successfully! File Name: {uploaded_file.name}")
418
+ st.write(f"**File Size:** {uploaded_file.size / 1024:.2f} KB")
419
+
420
+ # Debugging: Show extracted text preview
421
+ pdf_text = read_files(uploaded_file)
422
+ if pdf_text:
423
+ st.write("**Extracted Sample Text:**")
424
+ st.code(pdf_text[:500]) # Show first 500 characters
425
+ else:
426
+ st.error("⚠️ No readable text found in the document.")
427
 
428
+ st.write("Click the button below to analyze the contract.")
 
 
429
 
430
+ # Force button to always render
431
+ st.markdown('<style>div.stButton > button {display: block; width: 100%;}</style>', unsafe_allow_html=True)
 
 
 
 
 
432
 
433
+ if st.button("πŸ” Analyze Document"):
434
+ with st.spinner("Analyzing contract document... ⏳"):
435
+ final_answer = granite_simple(
436
+ "Perform a detailed technical analysis of the attached contract document, highlighting potential risks, legal pitfalls, compliance issues, and areas where contractual terms may lead to future disputes or operational challenges.",
437
+ uploaded_file
438
+ )
439
 
440
+ # πŸ”Ή Display Analysis Result
441
+ st.subheader("πŸ“‘ Analysis Result")
442
+ st.write(final_answer)
443
 
444
+ # πŸ”₯ Run Streamlit App
445
+ if __name__ == '__main__':
446
+ main()
 
 
 
447
 
448
+ # import streamlit as st
449
+ # from PyPDF2 import PdfReader
 
 
 
 
 
 
 
 
450
 
451
+ # st.title("πŸ“‚ PDF Upload Debugger")
452
 
453
+ # uploaded_file = st.file_uploader("Upload a PDF", type="pdf")
454
 
455
+ # if uploaded_file:
456
+ # st.success(f"βœ… File uploaded: {uploaded_file.name}")
457
+ # st.write(f"πŸ“ File Size: {uploaded_file.size / 1024:.2f} KB")
458
 
459
+ # try:
460
+ # reader = PdfReader(uploaded_file)
461
+ # text = "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
462
 
463
+ # if text.strip():
464
+ # st.subheader("Extracted Text (First 500 characters)")
465
+ # st.code(text[:500]) # Show a preview of the text
466
+ # else:
467
+ # st.error("⚠️ No text found. The document might be scanned or encrypted.")
468
 
469
+ # except Exception as e:
470
+ # st.error(f"⚠️ Error reading PDF: {e}")
471
 
472
 
473
  # ###################################################################################