Spaces:

NoaiGPT
/

lexical

Build error

App Files Community

NoaiGPT committed on Jul 9, 2024

Commit

f0089a1

1 Parent(s): 9a04025

asd

Browse files

Files changed (1) hide show

app.py +59 -49

app.py CHANGED Viewed

@@ -3,8 +3,7 @@ import json
 import gradio as gr
 import spaces
 import torch
-import random
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
 from sentence_splitter import SentenceSplitter
 from itertools import product
@@ -16,15 +15,18 @@ device = torch.device("cuda" if cuda_available else "cpu")
 print(f"Using device: {device}")
 # Initialize paraphraser model and tokenizer
-paraphraser_model_name = "Ateeqq/Text-Rewriter-Paraphraser"
-paraphraser_tokenizer = AutoTokenizer.from_pretrained(paraphraser_model_name, token=hf_token)
-paraphraser_model = AutoModelForSeq2SeqLM.from_pretrained(paraphraser_model_name, token=hf_token).to(device)
 # Initialize classifier model and tokenizer
 classifier_model_name = "andreas122001/roberta-mixed-detector"
 classifier_tokenizer = AutoTokenizer.from_pretrained(classifier_model_name)
 classifier_model = AutoModelForSequenceClassification.from_pretrained(classifier_model_name).to(device)
 # Initialize sentence splitter
 splitter = SentenceSplitter(language='en')
@@ -38,45 +40,50 @@ def classify_text(text):
     main_score = probabilities[0][predicted_class].item()
     return main_label, main_score
-def introduce_errors(text):
-    words = text.split()
-    if len(words) > 3:
-        i = random.randint(0, len(words) - 1)
-        words[i] = words[i].lower() if words[i][0].isupper() else words[i].capitalize()
-    return ' '.join(words)
 @spaces.GPU
 def generate_paraphrases(text, setting, output_format):
     sentences = splitter.split(text)
     all_sentence_paraphrases = []
     if setting == 1:
-        temperature = 0.7
-        top_p = 0.95
-        top_k = 50
-        num_return_sequences = 3
     elif setting == 2:
-        temperature = 0.8
-        top_p = 0.9
-        top_k = 40
-        num_return_sequences = 4
     elif setting == 3:
-        temperature = 0.9
-        top_p = 0.85
-        top_k = 30
-        num_return_sequences = 5
     elif setting == 4:
-        temperature = 1.0
-        top_p = 0.8
-        top_k = 20
-        num_return_sequences = 6
     else:
-        temperature = 1.1
-        top_p = 0.75
-        top_k = 10
-        num_return_sequences = 7
-    max_length = 128
     formatted_output = "Original text:\n" + text + "\n\n"
     formatted_output += "Paraphrased versions:\n"
@@ -89,38 +96,41 @@ def generate_paraphrases(text, setting, output_format):
     }
     for i, sentence in enumerate(sentences):
-        inputs = paraphraser_tokenizer(f'paraphraser: {sentence}', return_tensors="pt", padding="longest", truncation=True, max_length=max_length).input_ids.to(device)
-        # Generate paraphrases using sampling
         outputs = paraphraser_model.generate(
-            inputs,
-            do_sample=True,
             num_return_sequences=num_return_sequences,
             temperature=temperature,
-            top_p=top_p,
             top_k=top_k,
-            repetition_penalty=1.2,
-            no_repeat_ngram_size=2,
-            max_length=max_length
         )
         paraphrases = paraphraser_tokenizer.batch_decode(outputs, skip_special_tokens=True)
-        paraphrases = [introduce_errors(p) for p in paraphrases]
         formatted_output += f"Original sentence {i+1}: {sentence}\n"
-        for j, paraphrase in enumerate(paraphrases, 1):
             formatted_output += f"  Paraphrase {j}: {paraphrase}\n"
         json_output["paraphrased_versions"].append({
             f"original_sentence_{i+1}": sentence,
-            "paraphrases": paraphrases
         })
-        all_sentence_paraphrases.append(paraphrases)
         formatted_output += "\n"
     all_combinations = list(product(*all_sentence_paraphrases))
-    random.shuffle(all_combinations)
     formatted_output += "\nCombined paraphrased versions:\n"
     combined_versions = []
@@ -136,7 +146,7 @@ def generate_paraphrases(text, setting, output_format):
         label, score = classify_text(version)
         formatted_output += f"Version {i}:\n{version}\n"
         formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n"
-        if label == "human-produced" or (label == "machine-generated" and score < 0.95):
             human_versions.append((version, label, score))
     formatted_output += "\nHuman-like or Less Confident Machine-generated versions:\n"
@@ -167,7 +177,7 @@ iface = gr.Interface(
     fn=generate_paraphrases,
     inputs=[
         gr.Textbox(lines=5, label="Input Text"),
-        gr.Slider(minimum=1, maximum=5, step=1, label="Diversity Setting"),
         gr.Radio(["text", "json"], label="Output Format")
     ],
     outputs=[
@@ -175,7 +185,7 @@ iface = gr.Interface(
         gr.Textbox(lines=10, label="Human-like or Less Confident Machine-generated Paraphrases")
     ],
     title="Advanced Diverse Paraphraser with Human-like Filter",
-    description="Enter a text, select a diversity setting, and choose the output format to generate diverse paraphrased versions. Combined versions are classified, and those detected as human-produced or less confidently machine-generated are presented in the final output."
 )
 # Launch the interface

 import gradio as gr
 import spaces
 import torch
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, pipeline
 from sentence_splitter import SentenceSplitter
 from itertools import product
 print(f"Using device: {device}")
 # Initialize paraphraser model and tokenizer
+paraphraser_model_name = "NoaiGPT/777"
+paraphraser_tokenizer = AutoTokenizer.from_pretrained(paraphraser_model_name, use_auth_token=hf_token)
+paraphraser_model = AutoModelForSeq2SeqLM.from_pretrained(paraphraser_model_name, use_auth_token=hf_token).to(device)
 # Initialize classifier model and tokenizer
 classifier_model_name = "andreas122001/roberta-mixed-detector"
 classifier_tokenizer = AutoTokenizer.from_pretrained(classifier_model_name)
 classifier_model = AutoModelForSequenceClassification.from_pretrained(classifier_model_name).to(device)
+# Initialize spelling correction pipeline
+spelling_correction = pipeline("text2text-generation", model="oliverguhr/spelling-correction-english-base", device=0 if cuda_available else -1)
 # Initialize sentence splitter
 splitter = SentenceSplitter(language='en')
     main_score = probabilities[0][predicted_class].item()
     return main_label, main_score
+def correct_spelling(text):
+    corrected_text = spelling_correction(text, max_length=2048)[0]['generated_text']
+    print(corrected_text)
+    return corrected_text
 @spaces.GPU
 def generate_paraphrases(text, setting, output_format):
     sentences = splitter.split(text)
     all_sentence_paraphrases = []
     if setting == 1:
+        num_return_sequences = 5
+        repetition_penalty = 1.1
+        no_repeat_ngram_size = 2
+        temperature = 1.0
+        max_length = 128
     elif setting == 2:
+        num_return_sequences = 10
+        repetition_penalty = 1.2
+        no_repeat_ngram_size = 3
+        temperature = 1.2
+        max_length = 192
     elif setting == 3:
+        num_return_sequences = 15
+        repetition_penalty = 1.3
+        no_repeat_ngram_size = 4
+        temperature = 1.4
+        max_length = 256
     elif setting == 4:
+        num_return_sequences = 20
+        repetition_penalty = 1.4
+        no_repeat_ngram_size = 5
+        temperature = 1.6
+        max_length = 320
     else:
+        num_return_sequences = 25
+        repetition_penalty = 1.5
+        no_repeat_ngram_size = 6
+        temperature = 1.8
+        max_length = 384
+    top_k = 50
+    top_p = 0.95
+    length_penalty = 1.0
     formatted_output = "Original text:\n" + text + "\n\n"
     formatted_output += "Paraphrased versions:\n"
     }
     for i, sentence in enumerate(sentences):
+        inputs = paraphraser_tokenizer(f'paraphraser: {sentence}', return_tensors="pt", padding="longest", truncation=True, max_length=max_length).to(device)
+        # Generate paraphrases using the specified parameters
         outputs = paraphraser_model.generate(
+            inputs.input_ids,
+            attention_mask=inputs.attention_mask,
             num_return_sequences=num_return_sequences,
+            repetition_penalty=repetition_penalty,
+            no_repeat_ngram_size=no_repeat_ngram_size,
             temperature=temperature,
+            max_length=max_length,
             top_k=top_k,
+            top_p=top_p,
+            do_sample=True,
+            early_stopping=False,
+            length_penalty=length_penalty
         )
         paraphrases = paraphraser_tokenizer.batch_decode(outputs, skip_special_tokens=True)
+        corrected_paraphrases = [correct_spelling(paraphrase) for paraphrase in paraphrases]
         formatted_output += f"Original sentence {i+1}: {sentence}\n"
+        for j, paraphrase in enumerate(corrected_paraphrases, 1):
             formatted_output += f"  Paraphrase {j}: {paraphrase}\n"
         json_output["paraphrased_versions"].append({
             f"original_sentence_{i+1}": sentence,
+            "paraphrases": corrected_paraphrases
         })
+        all_sentence_paraphrases.append(corrected_paraphrases)
         formatted_output += "\n"
     all_combinations = list(product(*all_sentence_paraphrases))
     formatted_output += "\nCombined paraphrased versions:\n"
     combined_versions = []
         label, score = classify_text(version)
         formatted_output += f"Version {i}:\n{version}\n"
         formatted_output += f"Classification: {label} (confidence: {score:.2%})\n\n"
+        if label == "human-produced" or (label == "machine-generated" and score < 0.98):
             human_versions.append((version, label, score))
     formatted_output += "\nHuman-like or Less Confident Machine-generated versions:\n"
     fn=generate_paraphrases,
     inputs=[
         gr.Textbox(lines=5, label="Input Text"),
+        gr.Slider(minimum=1, maximum=5, step=1, label="Readability to Human-like Setting"),
         gr.Radio(["text", "json"], label="Output Format")
     ],
     outputs=[
         gr.Textbox(lines=10, label="Human-like or Less Confident Machine-generated Paraphrases")
     ],
     title="Advanced Diverse Paraphraser with Human-like Filter",
+    description="Enter a text, select a setting from readable to human-like, and choose the output format to generate diverse paraphrased versions. Combined versions are classified, and those detected as human-produced or less confidently machine-generated are presented in the final output."
 )
 # Launch the interface