nuojohnchen committed
Commit ca06ed0 · verified · 1 Parent(s): e53ff8e

Update app.py

Files changed (1):
  1. app.py +85 -52
app.py CHANGED
@@ -6,13 +6,13 @@ import PyPDF2
 from io import BytesIO
 import torch
 
-# 设置环境变量
+# Set environment variables
 HF_TOKEN = os.environ.get("HF_TOKEN", None)
 
 DESCRIPTION = '''
 <div>
 <h1 style="text-align: center;">Academic Paper Improver</h1>
-<p>This Space helps you improve sections of your academic paper using the <a href="https://huggingface.co/Xtra-Computing/XtraGPT-7B"><b>XtraGPT-7B</b></a> model.</p>
+<p>This Space helps you improve sections of your academic paper using the <a href="https://huggingface.co/Xtra-Computing/XtraGPT-7B"><b>XtraGPT</b></a> model series.</p>
 <p>Upload your PDF paper, select a section of text you want to improve, and specify your requirements.</p>
 </div>
 '''
@@ -32,7 +32,7 @@ CITATION = """
 LICENSE = """
 <p/>
 ---
-Built with XtraGPT-7B
+Built with XtraGPT models
 """
 
 css = """
@@ -48,78 +48,102 @@ h1 {
 }
 """
 
-# 默认论文内容
+# Default paper content
 default_paper_content = """
 The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experiments on two machine translation tasks show these models to be superior in quality while being more parallelizable and requiring significantly less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including ensembles by over 2 BLEU. On the WMT 2014 English-to-French translation task, our model establishes a new single-model state-of-the-art BLEU score of 41.8 after training for 3.5 days on eight GPUs, a small fraction of the training costs of the best models from the literature. We show that the Transformer generalizes well to other tasks by applying it successfully to English constituency parsing both with large and limited training data.
 """
 
-# 直接加载模型和分词器
-tokenizer = AutoTokenizer.from_pretrained("Xtra-Computing/XtraGPT-7B")
-model = AutoModelForCausalLM.from_pretrained("Xtra-Computing/XtraGPT-7B", device_map="auto")
+# Available models
+AVAILABLE_MODELS = {
+    "XtraGPT-1.5B": "Xtra-Computing/XtraGPT-1.5B",
+    "XtraGPT-3B": "Xtra-Computing/XtraGPT-3B",
+    "XtraGPT-7B": "Xtra-Computing/XtraGPT-7B",
+    "XtraGPT-14B": "Xtra-Computing/XtraGPT-14B"
+}
+
+# Global variables for model and tokenizer
+current_model = None
+current_tokenizer = None
+current_model_name = None
 
 def extract_text_from_pdf(pdf_bytes):
-    """从上传的PDF文件中提取文本"""
+    """Extract text from uploaded PDF file"""
     if pdf_bytes is None:
         return default_paper_content
 
     try:
-        # 确保pdf_bytes是字节类型
+        # Ensure pdf_bytes is bytes type
         if isinstance(pdf_bytes, str):
-            return pdf_bytes  # 如果已经是字符串,直接返回
+            return pdf_bytes  # If already a string, return directly
 
-        # 直接使用字节对象
+        # Use bytes object directly
         pdf_reader = PyPDF2.PdfReader(BytesIO(pdf_bytes))
 
-        # 从所有页面提取文本
+        # Extract text from all pages
         text = ""
         for page_num in range(len(pdf_reader.pages)):
            page = pdf_reader.pages[page_num]
            text += page.extract_text() + "\n\n"
 
-        # 限制文本长度,防止超出模型最大长度
-        if len(text) > 10000:  # 保守估计,留出足够空间给提示和生成
-            text = text[:10000] + "...(文本已截断)"
-
        return text
    except Exception as e:
-        print(f"PDF提取错误: {str(e)}")
+        print(f"PDF extraction error: {str(e)}")
        return default_paper_content
 
+def load_model(model_name):
+    """Load model and tokenizer on demand"""
+    global current_model, current_tokenizer, current_model_name
+
+    # If the requested model is already loaded, return it
+    if current_model_name == model_name and current_model is not None and current_tokenizer is not None:
+        return current_tokenizer, current_model
+
+    # Clear GPU memory if a model is already loaded
+    if current_model is not None:
+        del current_model
+        del current_tokenizer
+        torch.cuda.empty_cache()
+
+    # Load the requested model
+    model_path = AVAILABLE_MODELS[model_name]
+    current_tokenizer = AutoTokenizer.from_pretrained(model_path)
+    current_model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
+    current_model_name = model_name
+
+    return current_tokenizer, current_model
+
 @spaces.GPU(duration=200)
-def improve_paper_section(paper_content, selected_content, improvement_prompt, temperature=0.1, max_new_tokens=512):
+def improve_paper_section(model_name, paper_content, selected_content, improvement_prompt, temperature=0.1, max_new_tokens=512, progress=gr.Progress()):
     """
-    改进学术论文的一个部分 - 使用非流式生成
+    Improve a section of an academic paper - non-streaming generation
     """
-    # 检查输入
+    # Check inputs
     if not selected_content or not improvement_prompt:
-        return "请同时提供要改进的文本和改进要求。"
+        return "Please provide both text to improve and improvement requirements."
 
     try:
-        # 限制paper_content长度,防止超出模型最大长度
-        if len(paper_content) > 20000:  # 保守估计
-            paper_content = paper_content[:20000] + "...(文本已截断)"
-
-        # 构建提示
+        progress(0.1, desc="Loading model...")
+        # Load the selected model
+        tokenizer, model = load_model(model_name)
+
+        progress(0.3, desc="Processing input...")
+        # Build prompt
         content = f"""
 Please improve the selected content based on the following. Act as an expert model for improving articles **PAPER_CONTENT**.
-
 The output needs to answer the **QUESTION** on **SELECTED_CONTENT** in the input. Avoid adding unnecessary length, unrelated details, overclaims, or vague statements.
 Focus on clear, concise, and evidence-based improvements that align with the overall context of the paper.
-
 <PAPER_CONTENT>
 {paper_content}
 </PAPER_CONTENT>
-
 <SELECTED_CONTENT>
 {selected_content}
 </SELECTED_CONTENT>
-
 <QUESTION>
 {improvement_prompt}
 </QUESTION>
 """
 
-        # 准备输入
+        # Prepare input
         messages = [
            {"role": "user", "content": content}
        ]
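
Note on the hunk above: `load_model` keeps at most one checkpoint resident and frees the previous one before switching. A minimal smoke test for the cache path, assuming this `app.py` is importable as `app` outside Spaces and the smallest checkpoint fits on the local device:

```python
# Hypothetical smoke test for load_model's cache behaviour; "app" refers to
# this app.py on the import path (an assumption, not part of the commit).
import app

tok_a, model_a = app.load_model("XtraGPT-1.5B")
tok_b, model_b = app.load_model("XtraGPT-1.5B")
assert model_a is model_b and tok_a is tok_b   # second call is a cache hit

tok_c, model_c = app.load_model("XtraGPT-7B")  # cache miss: 1.5B is freed first
assert model_c is not model_a
```

Note that `del` on the globals plus `torch.cuda.empty_cache()` only releases GPU memory once no other references to the old model survive.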
@@ -130,14 +154,15 @@ Focus on clear, concise, and evidence-based improvements that align with the ove
             add_generation_prompt=True
         )
 
-        # 检查输入长度并截断
+        # Check input length and truncate to first 10k tokens
         input_tokens = tokenizer.encode(text)
-        if len(input_tokens) > 15000:  # 为生成留出空间
-            input_tokens = input_tokens[:15000]
+        if len(input_tokens) > 10000:  # Limit to 10k tokens as requested
+            input_tokens = input_tokens[:10000]
             text = tokenizer.decode(input_tokens)
-            print(f"输入已截断至15000个token")
+            print(f"Input truncated to 10000 tokens")
 
-        # 使用非流式方式生成
+        progress(0.5, desc="Generating improved text...")
+        # Generate non-streaming
         input_ids = tokenizer.encode(text, return_tensors="pt").to(model.device)
 
         with torch.no_grad():
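
The truncation above now happens in token space rather than character space, so the cap matches what the model actually consumes. A standalone sketch of the same encode, truncate, decode round trip, with `gpt2` standing in for the XtraGPT tokenizer:

```python
# Sketch of the encode/truncate/decode round trip from the hunk above;
# gpt2 is a stand-in tokenizer (assumption), the limit mirrors the commit.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
MAX_INPUT_TOKENS = 10_000

text = "improve this sentence " * 3000        # deliberately oversized input
input_tokens = tokenizer.encode(text)
if len(input_tokens) > MAX_INPUT_TOKENS:
    input_tokens = input_tokens[:MAX_INPUT_TOKENS]
    text = tokenizer.decode(input_tokens)      # cut at a token boundary
print(len(input_tokens))                       # 10000
```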
@@ -149,38 +174,47 @@ Focus on clear, concise, and evidence-based improvements that align with the ove
             pad_token_id=tokenizer.eos_token_id
         )
 
-        # 只保留新生成的部分
+        # Only keep the newly generated part
         generated_ids = output_ids[0, len(input_ids[0]):]
         response = tokenizer.decode(generated_ids, skip_special_tokens=True)
 
+        progress(1.0, desc="Complete!")
         return response
 
     except Exception as e:
         import traceback
         error_details = traceback.format_exc()
-        print(f"生成错误: {str(e)}\n{error_details}")
-        return f"生成文本时出错: {str(e)}\n\n请尝试使用不同的参数或输入。"
+        print(f"Generation error: {str(e)}\n{error_details}")
+        return f"Error generating text: {str(e)}\n\nPlease try with different parameters or input."
 
-# 创建Gradio界面
+# Create Gradio interface
 with gr.Blocks(fill_height=True, css=css) as demo:
-    # 存储提取的PDF文本
+    # Store extracted PDF text
     extracted_pdf_text = gr.State(default_paper_content)
 
     gr.Markdown(DESCRIPTION)
-    # gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
 
     with gr.Row():
         with gr.Column():
-            # 步骤1:上传PDF
+            # Step 1: Upload PDF
             with gr.Group():
                 gr.Markdown("### Step 1: Upload your academic paper")
                 pdf_file = gr.File(
                     label="Upload PDF",
                     file_types=[".pdf"],
-                    type="binary"  # 直接获取二进制数据
+                    type="binary"  # Get binary data directly
+                )
+
+            # Model selection
+            with gr.Group():
+                gr.Markdown("### Select Model")
+                model_dropdown = gr.Dropdown(
+                    choices=list(AVAILABLE_MODELS.keys()),
+                    value="XtraGPT-7B",  # Default selection
+                    label="Select XtraGPT Model"
                 )
 
-            # 步骤2:提取并选择文本
+            # Step 2: Extract and select text
             with gr.Group():
                 gr.Markdown("### Step 2: Enter the text section to improve")
                 selected_content = gr.Textbox(
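
The generation path itself is unchanged in structure: render the chat messages through the tokenizer's template, generate without streaming, then slice the prompt tokens off the output. A minimal end-to-end sketch, using Qwen/Qwen2.5-0.5B-Instruct as a small stand-in chat model (an assumption; it only needs to expose the same chat-template interface):

```python
# Minimal generate-and-slice sketch mirroring improve_paper_section's core;
# the model name is a small stand-in, not one of the XtraGPT checkpoints.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForCausalLM.from_pretrained(name)

messages = [{"role": "user", "content": "Improve: The results are good."}]
text = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

input_ids = tokenizer.encode(text, return_tensors="pt").to(model.device)
with torch.no_grad():
    output_ids = model.generate(
        input_ids,
        max_new_tokens=64,
        do_sample=True,
        temperature=0.1,
        pad_token_id=tokenizer.eos_token_id,
    )

# generate() returns prompt + completion; keep only the newly generated part.
generated_ids = output_ids[0, len(input_ids[0]):]
print(tokenizer.decode(generated_ids, skip_special_tokens=True))
```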
@@ -190,7 +224,7 @@ with gr.Blocks(fill_height=True, css=css) as demo:
                     value="The dominant sequence transduction models are based on complex recurrent or convolutional neural networks in an encoder-decoder configuration."
                 )
 
-            # 步骤3:指定改进要求
+            # Step 3: Specify improvement requirements
             with gr.Group():
                 gr.Markdown("### Step 3: Specify your improvement requirements")
                 improvement_prompt = gr.Textbox(
@@ -207,10 +241,10 @@ with gr.Blocks(fill_height=True, css=css) as demo:
             submit_btn = gr.Button("Improve Text")
 
         with gr.Column():
-            # 输出
+            # Output
             output = gr.Textbox(label="Improved Text", lines=20)
 
-            # 显示提取的PDF文本(可折叠)
+            # Display extracted PDF text (collapsible)
             with gr.Accordion("Extracted PDF Content (for reference)", open=False):
                 pdf_content_display = gr.Textbox(
                     label="Paper Content",
@@ -218,7 +252,7 @@ with gr.Blocks(fill_height=True, css=css) as demo:
                     value=default_paper_content
                 )
 
-    # PDF上传时自动提取文本
+    # Automatically extract text when PDF is uploaded
     def update_pdf_content(pdf_bytes):
         if pdf_bytes is not None:
             content = extract_text_from_pdf(pdf_bytes)
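
Because `gr.File(type="binary")` hands the callback raw bytes, the extraction path can be checked without the UI. A hedged example, assuming a local `paper.pdf` and that this module imports cleanly as `app`:

```python
# Feed extract_text_from_pdf the same bytes object gr.File(type="binary")
# would deliver; "paper.pdf" and the `app` import are assumptions.
from app import extract_text_from_pdf

with open("paper.pdf", "rb") as f:
    pdf_bytes = f.read()

text = extract_text_from_pdf(pdf_bytes)
print(text[:300])   # first few hundred characters of the extracted paper
```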
@@ -231,15 +265,14 @@ with gr.Blocks(fill_height=True, css=css) as demo:
         outputs=[extracted_pdf_text, pdf_content_display]
     )
 
-    # 处理文本改进
+    # Process text improvement
     submit_btn.click(
         fn=improve_paper_section,
-        inputs=[extracted_pdf_text, selected_content, improvement_prompt, temperature, max_tokens],
+        inputs=[model_dropdown, extracted_pdf_text, selected_content, improvement_prompt, temperature, max_tokens],
         outputs=[output]
     )
 
-    # gr.Markdown(LICENSE)
-    gr.Markdown(CITATION)
+    gr.HTML(CITATION)
 
 if __name__ == "__main__":
     demo.launch()
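
The click handler can also be driven headlessly, mirroring the `inputs` list wired to `submit_btn.click` above. A sketch under the same import assumption, with the progress hook stubbed out since no UI is running, and a device large enough for the chosen checkpoint:

```python
# Hedged headless call of the new handler; argument order mirrors the
# [model_dropdown, extracted_pdf_text, selected_content, improvement_prompt,
# temperature, max_tokens] wiring above.
from app import improve_paper_section, default_paper_content

result = improve_paper_section(
    "XtraGPT-7B",
    default_paper_content,
    "The dominant sequence transduction models are based on complex "
    "recurrent or convolutional neural networks in an encoder-decoder "
    "configuration.",
    "Make this sentence more concise.",
    temperature=0.1,
    max_new_tokens=256,
    progress=lambda *args, **kwargs: None,   # stand-in for gr.Progress
)
print(result)
```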