import os

import gradio as gr
import torch
from torch.nn.functional import softmax
import shap
import requests
from transformers import RobertaTokenizer, RobertaForSequenceClassification, pipeline
from IPython.display import HTML

# Load the fine-tuned RoBERTa classifier and its tokenizer from the local model directory.
model_dir = 'temp'
tokenizer = RobertaTokenizer.from_pretrained(model_dir)
model = RobertaForSequenceClassification.from_pretrained(model_dir)
# pipe = pipeline("text-classification", model="thugCodeNinja/robertatemp")
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
def process_text(input_text, input_file):
    # Prefer pasted text; otherwise fall back to an uploaded file.
    if input_text:
        text = input_text
    elif input_file is not None:
        text = input_file.read().decode('utf-8')
    else:
        return "", "", "No input provided", "", []
    # Tokenize and classify the text with the fine-tuned RoBERTa model.
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = softmax(logits, dim=1)
    max_prob, predicted_class_id = torch.max(probs, dim=1)
    prob = str(round(max_prob.item() * 100, 2))
    label = model.config.id2label[predicted_class_id.item()]
    # LABEL_0 is the human-written class; anything else is treated as ChatGPT-generated.
    final_label = 'Human' if label == 'LABEL_0' else 'Chat-GPT'
    processed_result = text
    def search(text):
        # Query the Google Custom Search JSON API for pages similar to the input text.
        query = text
        # The API key is read from the environment rather than being hardcoded in the source.
        api_key = os.environ.get('GOOGLE_API_KEY', '')
        search_engine_id = '53d064810efa44ce7'
        url = 'https://www.googleapis.com/customsearch/v1'
        params = {'key': api_key, 'cx': search_engine_id, 'q': query}

        try:
            response = requests.get(url, params=params)
            data = response.json()
            return data
        except Exception as e:
            return {'error': str(e)}
    def find_plagiarism(text):
        # Collect up to five (title, link) pairs from the search results.
        search_results = search(text)
        if 'items' not in search_results:
            return []
        similar_articles = []
        for item in search_results['items']:
            title = item.get('title', '')
            link = item.get('link', '')
            similar_articles.append([title, link])
        return similar_articles[:5]

    # Explain the pipeline's decision with a SHAP text plot, rendered as embeddable HTML.
    explainer = shap.Explainer(pipe)
    shap_values = explainer([text])
    shap_plot_html = HTML(shap.plots.text(shap_values, display=False)).data
    # Look for published articles that closely match the input text.
    similar_articles = find_plagiarism(text)

    return processed_result, prob, final_label, shap_plot_html, similar_articles

text_input = gr.Textbox(label="Enter text")
file_input = gr.File(label="Upload a text file")
outputs = [
    gr.Textbox(label="Processed text"),
    gr.Textbox(label="Probability"),
    gr.Textbox(label="Label"),
    gr.HTML(label="SHAP Plot"),
    gr.Dataframe(label="Similar Articles", headers=["Title", "Link"], row_count=5),
]
title = "Group 2- ChatGPT text detection module"
description = '''Please upload text files and text input responsibly and await the explainable results. The approach in place includes finetuning a Roberta model for text classification.Once the classifications are done the decision is exaplined thorugh the SHAP text plot.
The probability is particularly explained by the attention plots through SHAP'''
gr.Interface(fn=process_text,title=title,description=description, inputs=[text_input, file_input], outputs=outputs).launch()
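
# --- Usage sketch (illustrative, not part of the app) ---
# A minimal way to exercise process_text() locally, assuming the fine-tuned model
# files are present in ./temp and that a Custom Search key is exported as the
# GOOGLE_API_KEY environment variable (the variable name used by search() above):
#
#     text, prob, label, shap_html, articles = process_text(
#         "Paste a paragraph here to check whether it reads as human or Chat-GPT.", None)
#     print(label, prob)
#     for title, link in articles:
#         print(title, link)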