# Captured from a Hugging Face Spaces page (Space status at capture time: Sleeping).
import html
import io
import random

import streamlit as st
import torch
from docx import Document
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
# --- Streamlit Page Configuration ---
# Executed at import time; it is the first Streamlit call made by this file.
# NOTE(review): the icon glyph may be a mis-decoded emoji - confirm against the deployed app.
st.set_page_config(
    page_title="AI & Plagiarism Detection",
    page_icon="π",
    layout="wide",
)
# --- DeepSeek Theme ---
# Colour palette and font family that main() hands to apply_theme().
DEEPSEEK_THEME = dict(
    backgroundColor="#282c34",
    textColor="#abb2bf",
    inputAreaColor="#3E4451",
    accentColor="#61afef",
    sidebarColor="#21252b",
    font="sans-serif",
)
# --- Function to Apply Theme ---
def apply_theme(theme):
    """Render a ``<style>`` block built from *theme* into the current page.

    Args:
        theme: Mapping that provides the keys ``textColor``,
            ``backgroundColor``, ``font``, ``inputAreaColor``,
            ``accentColor`` and ``sidebarColor``.

    Raises:
        KeyError: If one of those keys is missing from *theme*.
    """
    # Looked up in the order the stylesheet first needs them.
    text_color = theme["textColor"]
    background = theme["backgroundColor"]
    font_family = theme["font"]
    input_area = theme["inputAreaColor"]
    accent = theme["accentColor"]
    sidebar = theme["sidebarColor"]

    stylesheet = f"""
<style>
body {{
color: {text_color};
background-color: {background};
font-family: {font_family};
}}
.welcome-text {{
color: {text_color};
font-size: 36px;
font-weight: bold;
text-align: center;
margin-bottom: 20px;
}}
.output-box {{
background-color: {input_area};
color: {text_color};
padding: 10px;
border-radius: 5px;
margin-top: 20px;
}}
.stTextArea textarea {{
background-color: {input_area};
color: {text_color};
border: 1px solid {accent};
border-radius: 5px;
}}
.stFileUploader > div > div:nth-child(1) > div > button {{
background-color: {accent};
color: {background};
border-radius: 5px;
}}
.stMetricLabel {{
color: {text_color} !important;
}}
.stMetricValue {{
color: {text_color} !important;
}}
.streamlit-expanderHeader {{
color: {text_color};
}}
.streamlit-expanderContent {{
color: {text_color};
}}
[data-testid="stSidebar"] {{
background-color: {sidebar};
color: {text_color};
}}
</style>
"""
    # unsafe_allow_html is required for Streamlit to emit the raw <style> tag.
    st.markdown(stylesheet, unsafe_allow_html=True)
# --- Helper Functions ---
def extract_text_from_pdf(pdf_file):
    """Return the text pdfminer extracts from every page of a PDF.

    Args:
        pdf_file: Binary file-like object containing the PDF (main() passes
            a Streamlit upload).

    Returns:
        The text of all pages, concatenated in page order.

    Raises:
        Any exception raised by pdfminer while parsing propagates to the
        caller; the converter and the text buffer are closed either way.
    """
    resource_manager = PDFResourceManager()
    with io.StringIO() as output_string:
        device = TextConverter(resource_manager, output_string, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(resource_manager, device)
            for page in PDFPage.get_pages(pdf_file, caching=True, check_extractable=True):
                interpreter.process_page(page)
            # Read the buffer before the context manager closes it.
            return output_string.getvalue()
        finally:
            device.close()
def extract_text_from_docx(docx_file):
    """Return the paragraph texts of a DOCX document joined by newlines.

    Args:
        docx_file: Path or file-like object accepted by ``Document``.

    Returns:
        One string with the text of each paragraph on its own line.
    """
    document = Document(docx_file)
    return '\n'.join(para.text for para in document.paragraphs)
def split_text_into_chunks(text, tokenizer, max_length=512):
    """Split *text* into consecutive pieces of at most *max_length* tokens.

    Args:
        text: The text to split. ``None`` or an empty string yields no chunks.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_string(tokens)``.
        max_length: Maximum number of tokens per chunk; must be at least 1.

    Returns:
        A list of strings, each rebuilt from one slice of the token sequence.

    Raises:
        ValueError: If *max_length* is smaller than 1. A negative step would
            otherwise make the range empty and silently drop all of the text.
    """
    if max_length < 1:
        raise ValueError(f"max_length must be a positive integer, got {max_length}")
    if not text:
        return []

    tokens = tokenizer.tokenize(text)
    # NOTE(review): consumers that add special tokens on re-tokenization may
    # exceed their own limit by a few tokens - confirm whether headroom is needed.
    return [
        tokenizer.convert_tokens_to_string(tokens[start:start + max_length])
        for start in range(0, len(tokens), max_length)
    ]
@st.cache_resource(show_spinner="Loading AI detection model...")
def _cached_ai_detection_pipeline(model_name):
    """Build the text-classification pipeline; Streamlit caches it per model name."""
    return pipeline("text-classification", model=model_name, truncation=True, max_length=512)


def load_ai_detection_model(model_name="Hello-SimpleAI/chatgpt-detector-roberta"):
    """Return the AI-text detection pipeline, or ``None`` if it cannot be loaded.

    The pipeline is built through a cached helper so that Streamlit reruns
    reuse the same object instead of rebuilding the model on every
    interaction. A failed load raises inside the helper, so it is not cached
    and the next call tries again.

    Args:
        model_name: Hugging Face model identifier passed to ``pipeline``.

    Returns:
        The text-classification pipeline, or ``None`` after reporting the
        failure with ``st.error``.
    """
    try:
        return _cached_ai_detection_pipeline(model_name)
    except Exception as e:  # UI boundary: report the problem instead of crashing the page
        st.error(f"Error loading AI detection model: {e}")
        return None
@st.cache_resource(show_spinner="Loading plagiarism detection model...")
def _cached_plagiarism_model(model_name):
    """Load the tokenizer and classifier; Streamlit caches the pair per model name."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    return tokenizer, model


def load_plagiarism_model(model_name="jpwahle/longformer-base-plagiarism-detection"):
    """Return ``(tokenizer, model)`` for plagiarism classification.

    The pair is loaded through a cached helper so that Streamlit reruns reuse
    it. A failed load raises inside the helper, so it is not cached.

    Args:
        model_name: Hugging Face model identifier to load.

    Returns:
        A ``(tokenizer, model)`` tuple. On failure the error is reported with
        ``st.error`` and ``(None, None)`` is returned, so callers that unpack
        the result into two names keep working.
    """
    try:
        return _cached_plagiarism_model(model_name)
    except Exception as e:  # UI boundary: report the problem instead of crashing the page
        st.error(f"Error loading plagiarism detection model: {e}")
        return None, None
def detect_ai_content(text_chunks, ai_detection_model, ai_threshold=0.4,
                      ai_labels=("AI", "ChatGPT")):
    """Score each chunk for machine-generated content.

    Args:
        text_chunks: Iterable of text pieces to classify.
        ai_detection_model: Callable that takes one chunk and returns a
            sequence whose first item is a dict with ``'label'`` and
            ``'score'`` keys.
        ai_threshold: A chunk counts as AI-written only when the AI label's
            score is strictly greater than this value.
        ai_labels: Label names that mean "machine generated", compared
            case-insensitively. NOTE(review): the default detector appears to
            label such text ``ChatGPT`` rather than ``AI`` - confirm against
            its model card.

    Returns:
        A list with one entry per chunk: the classifier score when the chunk
        is labelled as AI above the threshold, otherwise ``0``. ``None`` is
        returned, after reporting with ``st.error``, if classification raises.
    """
    machine_labels = {str(label).casefold() for label in ai_labels}
    try:
        ai_percentages = []
        for chunk in text_chunks:
            top = ai_detection_model(chunk)[0]
            flagged = (
                str(top['label']).casefold() in machine_labels
                and top['score'] > ai_threshold
            )
            # Human-labelled and low-confidence chunks contribute nothing.
            ai_percentages.append(top['score'] if flagged else 0)
        return ai_percentages
    except Exception as e:  # UI boundary: report the problem instead of crashing the page
        st.error(f"Error during AI content detection: {e}")
        return None
def plagiarism_check(text_chunks, tokenizer, model):
    """Return the percentage of chunks that the classifier assigns to class 1.

    Args:
        text_chunks: Sequence of text pieces to classify.
        tokenizer: Callable that turns one chunk into keyword inputs for
            *model* (called with ``return_tensors="pt"``, truncation and
            ``max_length=512``).
        model: Callable that accepts those inputs and returns an object with
            a ``logits`` tensor.

    Returns:
        ``100 * flagged / total`` as a float, where a chunk is flagged when
        the arg-max class index is 1. ``None`` is returned when there are no
        chunks to score, or, after reporting with ``st.error``, when
        classification raises.
    """
    if not text_chunks:
        # Nothing to score; avoids dividing by zero below.
        return None
    try:
        plagiarized_count = 0
        for chunk in text_chunks:
            inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True, max_length=512)
            with torch.no_grad():  # inference only, no gradients needed
                outputs = model(**inputs)
            if torch.argmax(outputs.logits, dim=-1).item() == 1:
                plagiarized_count += 1
        return plagiarized_count / len(text_chunks) * 100
    except Exception as e:  # UI boundary: report the problem instead of crashing the page
        st.error(f"Error during plagiarism detection: {e}")
        return None
# --- Main Function ---
def _read_uploaded_files(uploaded_files, max_bytes=1_000_000_000):
    """Extract text from each supported upload, reporting per-file problems in the UI."""
    extractors = {
        "application/pdf": extract_text_from_pdf,
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": extract_text_from_docx,
    }
    texts = []
    for uploaded_file in uploaded_files:
        # The upload reports its own size, so its bytes need not be copied just to measure it.
        if uploaded_file.size > max_bytes:
            st.error(f"{uploaded_file.name}: File size exceeds the 1GB limit.")
            continue
        extractor = extractors.get(uploaded_file.type)
        if extractor is None:
            st.error(f"{uploaded_file.name}: Unsupported file type")
            continue
        try:
            texts.append(extractor(uploaded_file))
        except Exception as e:  # one unreadable file must not abort the others
            st.error(f"Error processing {uploaded_file.name}: {e}")
            continue
        st.write(f"Extracted text from {uploaded_file.name}")
    return texts


def main():
    """Render the page, gather text from the message box and uploads, and show the analysis.

    Nothing is analysed, and no model is loaded, until the user has typed
    text or attached a file. Text from uploads comes first, one file per
    line, followed by the typed message.
    """
    # --- Apply DeepSeek Theme ---
    apply_theme(DEEPSEEK_THEME)

    # --- Sidebar ---
    with st.sidebar:
        st.markdown("<h1 style='color:#61afef;'>AI & Plagiarism</h1>", unsafe_allow_html=True)
        st.markdown("Navigation")
        st.radio("Choose an option", ["New Chat"])
        st.markdown("---")
        st.markdown("Today")
        for chat in ("Chat 1", "Chat 2", "Chat 3"):
            st.markdown(f"- {chat}")

    # --- Main Content ---
    # The narrow first column is left empty as a margin.
    _, content = st.columns([1, 3])
    with content:
        st.markdown("<h1 class='welcome-text'>Hi, I'm AI & Plagiarism Assistant.</h1>", unsafe_allow_html=True)
        st.markdown("How can I help you today?")

        # --- Input Area: Text Area and File Upload ---
        input_text = st.text_area("Message", "", height=200)
        uploaded_files = st.file_uploader(
            "Attach documents (PDF or DOCX)",
            type=["pdf", "docx"],
            accept_multiple_files=True,
        )
        if not (input_text or uploaded_files):
            return

        # --- Process Uploaded Files, then append the manual text ---
        texts = []
        if uploaded_files:
            with st.expander("Uploaded Files", expanded=False):
                texts = _read_uploaded_files(uploaded_files)
        texts.append(input_text.strip())
        raw_text = "\n".join(texts).strip()
        if not raw_text:
            st.warning("No text could be extracted from the provided input.")
            return

        # --- Load models (only once there is something to analyse) ---
        ai_detection_model, tokenizer, plagiarism_model = load_models()

        # --- Split text into manageable chunks ---
        # Fall back to the detector's own tokenizer when the plagiarism one is unavailable.
        chunk_tokenizer = tokenizer
        if chunk_tokenizer is None:
            chunk_tokenizer = getattr(ai_detection_model, "tokenizer", None)
        if chunk_tokenizer is None:
            st.error("No tokenizer is available, so the text cannot be analysed.")
            return
        text_chunks = split_text_into_chunks(raw_text, chunk_tokenizer)

        # --- Process and Display Results ---
        process_and_display(text_chunks, "Combined Input", ai_detection_model, tokenizer, plagiarism_model)
# --- Helper function to process text and display results ---
def process_and_display(text_chunks, source_name, ai_detection_model, tokenizer, plagiarism_model):
    """Run both detectors over *text_chunks* and render the results side by side.

    Args:
        text_chunks: Text pieces to analyse.
        source_name: Heading shown above the results; HTML-escaped before it
            is placed in the markup.
        ai_detection_model: Classifier passed to ``detect_ai_content``, or
            ``None`` to skip AI detection.
        tokenizer: Tokenizer passed to ``plagiarism_check``, or ``None``.
        plagiarism_model: Model passed to ``plagiarism_check``, or ``None``.
            Plagiarism detection runs only when both it and *tokenizer* are
            present.

    Returns:
        None. Results are written to the Streamlit page.
    """
    # AI Detection
    ai_percentage_avg = None
    human_percentage = None
    if ai_detection_model is not None:
        ai_percentages = detect_ai_content(text_chunks, ai_detection_model)
        if ai_percentages:  # None (failure) and [] (no chunks) both mean "no result"
            ai_percentage_avg = sum(ai_percentages) / len(ai_percentages) * 100
            human_percentage = 100 - ai_percentage_avg

    # Plagiarism Check
    plagiarism_percentage = None
    if tokenizer is not None and plagiarism_model is not None:
        plagiarism_percentage = plagiarism_check(text_chunks, tokenizer, plagiarism_model)

    # --- Tiled Output ---
    with st.container():
        heading = html.escape(str(source_name))
        st.markdown(f"<div class='output-box'><h3>{heading}</h3></div>", unsafe_allow_html=True)
        col1, col2 = st.columns(2)
        with col1:
            st.markdown("<div class='output-box'><h4>AI Detection:</h4></div>", unsafe_allow_html=True)
            if ai_percentage_avg is not None:
                st.metric(label="AI Content", value=f"{ai_percentage_avg:.2f}%", delta="AI Generated")
                st.metric(label="Human Written", value=f"{human_percentage:.2f}%", delta="Humanized Text")
            else:
                st.write("AI Detection not available")
        with col2:
            st.markdown("<div class='output-box'><h4>Plagiarism Detection:</h4></div>", unsafe_allow_html=True)
            if plagiarism_percentage is not None:
                st.metric(
                    label="Plagiarism",
                    value=f"{plagiarism_percentage:.2f}%",
                    delta="Plagiarized" if plagiarism_percentage > 0 else "Original",
                )
            else:
                st.write("Plagiarism Detection not available")
# --- Load models ---
def load_models():
    """Load the AI-detection pipeline and the plagiarism tokenizer/model pair.

    Returns:
        A ``(ai_detection_model, tokenizer, plagiarism_model)`` tuple. Any
        component that could not be loaded is ``None``.
    """
    ai_detection_model = load_ai_detection_model()
    loaded = load_plagiarism_model()
    # Tolerate a loader that signals failure with a bare None instead of a pair.
    tokenizer, plagiarism_model = loaded if loaded is not None else (None, None)
    return ai_detection_model, tokenizer, plagiarism_model
# --- Call Main ---
if __name__ == "__main__":
    # main() obtains the models through load_models(), so nothing is loaded here.
    main()