NoaiGPT committed on
Commit
f0089a1
·
1 Parent(s): 9a04025
Files changed (1) hide show
  1. app.py +59 -49
app.py CHANGED
@@ -3,8 +3,7 @@ import json
3
  import gradio as gr
4
  import spaces
5
  import torch
6
- import random
7
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
8
  from sentence_splitter import SentenceSplitter
9
  from itertools import product
10
 
@@ -16,15 +15,18 @@ device = torch.device("cuda" if cuda_available else "cpu")
16
  print(f"Using device: {device}")
17
 
18
  # Initialize paraphraser model and tokenizer
19
- paraphraser_model_name = "Ateeqq/Text-Rewriter-Paraphraser"
20
- paraphraser_tokenizer = AutoTokenizer.from_pretrained(paraphraser_model_name, token=hf_token)
21
- paraphraser_model = AutoModelForSeq2SeqLM.from_pretrained(paraphraser_model_name, token=hf_token).to(device)
22
 
23
  # Initialize classifier model and tokenizer
24
  classifier_model_name = "andreas122001/roberta-mixed-detector"
25
  classifier_tokenizer = AutoTokenizer.from_pretrained(classifier_model_name)
26
  classifier_model = AutoModelForSequenceClassification.from_pretrained(classifier_model_name).to(device)
27
 
 
 
 
28
  # Initialize sentence splitter
29
  splitter = SentenceSplitter(language='en')
30
 
@@ -38,45 +40,50 @@ def classify_text(text):
38
  main_score = probabilities[0][predicted_class].item()
39
  return main_label, main_score
40
 
41
- def introduce_errors(text):
42
- words = text.split()
43
- if len(words) > 3:
44
- i = random.randint(0, len(words) - 1)
45
- words[i] = words[i].lower() if words[i][0].isupper() else words[i].capitalize()
46
- return ' '.join(words)
47
 
48
  @spaces.GPU
49
  def generate_paraphrases(text, setting, output_format):
50
  sentences = splitter.split(text)
51
  all_sentence_paraphrases = []
52
-
53
  if setting == 1:
54
- temperature = 0.7
55
- top_p = 0.95
56
- top_k = 50
57
- num_return_sequences = 3
 
58
  elif setting == 2:
59
- temperature = 0.8
60
- top_p = 0.9
61
- top_k = 40
62
- num_return_sequences = 4
 
63
  elif setting == 3:
64
- temperature = 0.9
65
- top_p = 0.85
66
- top_k = 30
67
- num_return_sequences = 5
 
68
  elif setting == 4:
69
- temperature = 1.0
70
- top_p = 0.8
71
- top_k = 20
72
- num_return_sequences = 6
 
73
  else:
74
- temperature = 1.1
75
- top_p = 0.75
76
- top_k = 10
77
- num_return_sequences = 7
 
78
 
79
- max_length = 128
 
 
80
 
81
  formatted_output = "Original text:\n" + text + "\n\n"
82
  formatted_output += "Paraphrased versions:\n"
@@ -89,38 +96,41 @@ def generate_paraphrases(text, setting, output_format):
89
  }
90
 
91
  for i, sentence in enumerate(sentences):
92
- inputs = paraphraser_tokenizer(f'paraphraser: {sentence}', return_tensors="pt", padding="longest", truncation=True, max_length=max_length).input_ids.to(device)
93
 
94
- # Generate paraphrases using sampling
95
  outputs = paraphraser_model.generate(
96
- inputs,
97
- do_sample=True,
98
  num_return_sequences=num_return_sequences,
 
 
99
  temperature=temperature,
100
- top_p=top_p,
101
  top_k=top_k,
102
- repetition_penalty=1.2,
103
- no_repeat_ngram_size=2,
104
- max_length=max_length
 
105
  )
106
 
107
  paraphrases = paraphraser_tokenizer.batch_decode(outputs, skip_special_tokens=True)
108
- paraphrases = [introduce_errors(p) for p in paraphrases]
 
109
 
110
  formatted_output += f"Original sentence {i+1}: {sentence}\n"
111
- for j, paraphrase in enumerate(paraphrases, 1):
112
  formatted_output += f" Paraphrase {j}: {paraphrase}\n"
113
 
114
  json_output["paraphrased_versions"].append({
115
  f"original_sentence_{i+1}": sentence,
116
- "paraphrases": paraphrases
117
  })
118
 
119
- all_sentence_paraphrases.append(paraphrases)
120
  formatted_output += "\n"
121
 
122
  all_combinations = list(product(*all_sentence_paraphrases))
123
- random.shuffle(all_combinations)
124
 
125
  formatted_output += "\nCombined paraphrased versions:\n"
126
  combined_versions = []
@@ -136,7 +146,7 @@ def generate_paraphrases(text, setting, output_format):
136
  label, score = classify_text(version)
137
  formatted_output += f"Version {i}:\n{version}\n"
138
  formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n"
139
- if label == "human-produced" or (label == "machine-generated" and score < 0.95):
140
  human_versions.append((version, label, score))
141
 
142
  formatted_output += "\nHuman-like or Less Confident Machine-generated versions:\n"
@@ -167,7 +177,7 @@ iface = gr.Interface(
167
  fn=generate_paraphrases,
168
  inputs=[
169
  gr.Textbox(lines=5, label="Input Text"),
170
- gr.Slider(minimum=1, maximum=5, step=1, label="Diversity Setting"),
171
  gr.Radio(["text", "json"], label="Output Format")
172
  ],
173
  outputs=[
@@ -175,7 +185,7 @@ iface = gr.Interface(
175
  gr.Textbox(lines=10, label="Human-like or Less Confident Machine-generated Paraphrases")
176
  ],
177
  title="Advanced Diverse Paraphraser with Human-like Filter",
178
- description="Enter a text, select a diversity setting, and choose the output format to generate diverse paraphrased versions. Combined versions are classified, and those detected as human-produced or less confidently machine-generated are presented in the final output."
179
  )
180
 
181
  # Launch the interface
 
3
  import gradio as gr
4
  import spaces
5
  import torch
6
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, pipeline
 
7
  from sentence_splitter import SentenceSplitter
8
  from itertools import product
9
 
 
15
  print(f"Using device: {device}")
16
 
17
  # Initialize paraphraser model and tokenizer
18
+ paraphraser_model_name = "NoaiGPT/777"
19
+ paraphraser_tokenizer = AutoTokenizer.from_pretrained(paraphraser_model_name, use_auth_token=hf_token)
20
+ paraphraser_model = AutoModelForSeq2SeqLM.from_pretrained(paraphraser_model_name, use_auth_token=hf_token).to(device)
21
 
22
  # Initialize classifier model and tokenizer
23
  classifier_model_name = "andreas122001/roberta-mixed-detector"
24
  classifier_tokenizer = AutoTokenizer.from_pretrained(classifier_model_name)
25
  classifier_model = AutoModelForSequenceClassification.from_pretrained(classifier_model_name).to(device)
26
 
27
+ # Initialize spelling correction pipeline
28
+ spelling_correction = pipeline("text2text-generation", model="oliverguhr/spelling-correction-english-base", device=0 if cuda_available else -1)
29
+
30
  # Initialize sentence splitter
31
  splitter = SentenceSplitter(language='en')
32
 
 
40
  main_score = probabilities[0][predicted_class].item()
41
  return main_label, main_score
42
 
43
+ def correct_spelling(text):
44
+ corrected_text = spelling_correction(text, max_length=2048)[0]['generated_text']
45
+ print(corrected_text)
46
+ return corrected_text
 
 
47
 
48
  @spaces.GPU
49
  def generate_paraphrases(text, setting, output_format):
50
  sentences = splitter.split(text)
51
  all_sentence_paraphrases = []
52
+
53
  if setting == 1:
54
+ num_return_sequences = 5
55
+ repetition_penalty = 1.1
56
+ no_repeat_ngram_size = 2
57
+ temperature = 1.0
58
+ max_length = 128
59
  elif setting == 2:
60
+ num_return_sequences = 10
61
+ repetition_penalty = 1.2
62
+ no_repeat_ngram_size = 3
63
+ temperature = 1.2
64
+ max_length = 192
65
  elif setting == 3:
66
+ num_return_sequences = 15
67
+ repetition_penalty = 1.3
68
+ no_repeat_ngram_size = 4
69
+ temperature = 1.4
70
+ max_length = 256
71
  elif setting == 4:
72
+ num_return_sequences = 20
73
+ repetition_penalty = 1.4
74
+ no_repeat_ngram_size = 5
75
+ temperature = 1.6
76
+ max_length = 320
77
  else:
78
+ num_return_sequences = 25
79
+ repetition_penalty = 1.5
80
+ no_repeat_ngram_size = 6
81
+ temperature = 1.8
82
+ max_length = 384
83
 
84
+ top_k = 50
85
+ top_p = 0.95
86
+ length_penalty = 1.0
87
 
88
  formatted_output = "Original text:\n" + text + "\n\n"
89
  formatted_output += "Paraphrased versions:\n"
 
96
  }
97
 
98
  for i, sentence in enumerate(sentences):
99
+ inputs = paraphraser_tokenizer(f'paraphraser: {sentence}', return_tensors="pt", padding="longest", truncation=True, max_length=max_length).to(device)
100
 
101
+ # Generate paraphrases using the specified parameters
102
  outputs = paraphraser_model.generate(
103
+ inputs.input_ids,
104
+ attention_mask=inputs.attention_mask,
105
  num_return_sequences=num_return_sequences,
106
+ repetition_penalty=repetition_penalty,
107
+ no_repeat_ngram_size=no_repeat_ngram_size,
108
  temperature=temperature,
109
+ max_length=max_length,
110
  top_k=top_k,
111
+ top_p=top_p,
112
+ do_sample=True,
113
+ early_stopping=False,
114
+ length_penalty=length_penalty
115
  )
116
 
117
  paraphrases = paraphraser_tokenizer.batch_decode(outputs, skip_special_tokens=True)
118
+
119
+ corrected_paraphrases = [correct_spelling(paraphrase) for paraphrase in paraphrases]
120
 
121
  formatted_output += f"Original sentence {i+1}: {sentence}\n"
122
+ for j, paraphrase in enumerate(corrected_paraphrases, 1):
123
  formatted_output += f" Paraphrase {j}: {paraphrase}\n"
124
 
125
  json_output["paraphrased_versions"].append({
126
  f"original_sentence_{i+1}": sentence,
127
+ "paraphrases": corrected_paraphrases
128
  })
129
 
130
+ all_sentence_paraphrases.append(corrected_paraphrases)
131
  formatted_output += "\n"
132
 
133
  all_combinations = list(product(*all_sentence_paraphrases))
 
134
 
135
  formatted_output += "\nCombined paraphrased versions:\n"
136
  combined_versions = []
 
146
  label, score = classify_text(version)
147
  formatted_output += f"Version {i}:\n{version}\n"
148
  formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n"
149
+ if label == "human-produced" or (label == "machine-generated" and score < 0.98):
150
  human_versions.append((version, label, score))
151
 
152
  formatted_output += "\nHuman-like or Less Confident Machine-generated versions:\n"
 
177
  fn=generate_paraphrases,
178
  inputs=[
179
  gr.Textbox(lines=5, label="Input Text"),
180
+ gr.Slider(minimum=1, maximum=5, step=1, label="Readability to Human-like Setting"),
181
  gr.Radio(["text", "json"], label="Output Format")
182
  ],
183
  outputs=[
 
185
  gr.Textbox(lines=10, label="Human-like or Less Confident Machine-generated Paraphrases")
186
  ],
187
  title="Advanced Diverse Paraphraser with Human-like Filter",
188
+ description="Enter a text, select a setting from readable to human-like, and choose the output format to generate diverse paraphrased versions. Combined versions are classified, and those detected as human-produced or less confidently machine-generated are presented in the final output."
189
  )
190
 
191
  # Launch the interface