yilunzhang committed on
Commit
6e65e2c
·
verified ·
1 Parent(s): 1a6fc6b

Upload 4 files

Browse files
Files changed (4) hide show
  1. README.md +6 -4
  2. app.py +32 -0
  3. requirements.txt +2 -0
  4. utils.py +42 -0
README.md CHANGED
@@ -1,9 +1,11 @@
1
  ---
2
  title: Ai Text Detector
3
- emoji: 📉
4
- colorFrom: yellow
5
- colorTo: yellow
6
- sdk: static
 
 
7
  pinned: false
8
  license: apache-2.0
9
  ---
 
1
  ---
2
  title: Ai Text Detector
3
+ emoji: 🐢
4
+ colorFrom: blue
5
+ colorTo: gray
6
+ sdk: gradio
7
+ sdk_version: 5.10.0
8
+ app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
  ---
app.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import gradio as gr
4
+ import torch
5
+ from transformers import pipeline
6
+
7
+ from utils import clean_text
8
+
9
+
10
# Load the detector once at import time so every request reuses the same model.
# The instance is bound to `classifier` instead of rebinding `pipeline`, which
# would hide the transformers factory function imported at the top of the file.
classifier = pipeline(
    task="text-classification",
    model="fakespot-ai/roberta-base-ai-text-detection-v1",
    device="cuda" if torch.cuda.is_available() else "cpu",
    # Optional Hub access token read from the environment (None when unset).
    token=os.environ.get("ACCESS_TOKEN"),
)


def predict(text):
    """Score *text* with the AI-text detector.

    Args:
        text: Raw user input; may contain Markdown or HTML, which is stripped
            by ``clean_text`` before classification. ``None`` is treated as
            an empty string.

    Returns:
        A dict mapping each label reported by the classifier to its score,
        in the shape ``gr.Label`` accepts.
    """
    cleaned_text = clean_text(text or "")
    # Send a one-element batch so the result is one list of label/score dicts
    # per input; ``[0]`` then selects the scores for our single text.
    # ``top_k=None`` requests every label, and ``truncation=True`` cuts inputs
    # that exceed the model's maximum sequence length instead of failing.
    predictions = classifier([cleaned_text], top_k=None, truncation=True)[0]
    return {p["label"]: p["score"] for p in predictions}


demo = gr.Interface(
    predict,
    inputs=gr.Textbox(),
    outputs=gr.Label(num_top_classes=2),
    title="AI Text Detector",
)
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ transformers
2
+ torch
utils.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from html import unescape
3
+
4
+
5
def clean_text(t):
    """Strip Markdown from *t* and flatten it to single-spaced, one-line text.

    After ``clean_markdown`` runs, newlines, tabs, carriage returns and the
    literal two-character sequence ``^M`` are each replaced by a space, a
    space directly before a comma is dropped, and runs of spaces are
    collapsed to one.
    """
    flattened = clean_markdown(t)
    # Same order as before: line-break-like tokens first, then comma spacing.
    for token in ("\n", "\t", "^M", "\r"):
        flattened = flattened.replace(token, " ")
    flattened = flattened.replace(" ,", ",")
    return re.sub(" +", " ", flattened)
14
+
15
+
16
def clean_markdown(md_text):
    """Return *md_text* with Markdown/HTML markup removed and entities decoded.

    The substitutions are applied strictly in the order listed below; later
    rules operate on the output of earlier ones.
    """
    # (pattern, replacement, flags) -- order is significant.
    rules = (
        (r'```.*?```', '', re.DOTALL),                        # fenced code blocks
        (r'`[^`]*`', '', 0),                                  # inline code
        (r'!\[.*?\]\(.*?\)', '', 0),                          # images
        (r'\[([^\]]+)\]\(.*?\)', r'\1', 0),                   # links -> link text
        (r'(\*\*|__)(.*?)\1', r'\2', 0),                      # bold
        (r'(\*|_)(.*?)\1', r'\2', 0),                         # italic
        (r'#+ ', '', 0),                                      # heading markers
        (r'^>.*$', '', re.MULTILINE),                         # blockquote lines
        (r'^(\s*[-*+]|\d+\.)\s+', '', re.MULTILINE),          # list markers
        (r'^\s*[-*_]{3,}\s*$', '', re.MULTILINE),             # horizontal rules
        (r'\|.*?\|', '', 0),                                  # table cells
        (r'<.*?>', '', 0),                                    # raw HTML tags
    )
    stripped = md_text
    for pattern, replacement, flags in rules:
        stripped = re.sub(pattern, replacement, stripped, flags=flags)
    # Finally turn HTML entities such as &amp; back into characters.
    return unescape(stripped)