Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -4,6 +4,7 @@ import PyPDF2
|
|
4 |
import pandas as pd
|
5 |
from transformers import pipeline, AutoTokenizer
|
6 |
import gradio as gr
|
|
|
7 |
|
8 |
# Function to clean text by keeping only alphanumeric characters and spaces
|
9 |
def clean_text(text):
|
@@ -27,6 +28,7 @@ def split_text(text, chunk_size=1024):
|
|
27 |
led_tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384-multi_lexsum-source-long")
|
28 |
|
29 |
# Function to classify text using LED model
|
|
|
30 |
def classify_text(text):
|
31 |
classifier = pipeline("text-classification", model="allenai/led-base-16384-multi_lexsum-source-long", tokenizer=led_tokenizer, framework="pt")
|
32 |
try:
|
@@ -35,6 +37,7 @@ def classify_text(text):
|
|
35 |
return "Unable to classify"
|
36 |
|
37 |
# Function to summarize text using the DistilBART model (sshleifer/distilbart-cnn-12-6)
|
|
|
38 |
def summarize_text(text, max_length=100, min_length=30):
|
39 |
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", tokenizer="sshleifer/distilbart-cnn-12-6", framework="pt")
|
40 |
try:
|
@@ -43,6 +46,7 @@ def summarize_text(text, max_length=100, min_length=30):
|
|
43 |
return "Unable to summarize"
|
44 |
|
45 |
# Function to extract a title-like summary from the beginning of the text
|
|
|
46 |
def extract_title(text, max_length=20):
|
47 |
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", tokenizer="sshleifer/distilbart-cnn-12-6", framework="pt")
|
48 |
try:
|
@@ -51,6 +55,7 @@ def extract_title(text, max_length=20):
|
|
51 |
return "Unable to extract title"
|
52 |
|
53 |
# Function to process PDF files and generate summaries
|
|
|
54 |
def process_pdfs(pdf_files):
|
55 |
data = []
|
56 |
|
@@ -99,6 +104,9 @@ gr.Interface(
|
|
99 |
fn=process_pdfs,
|
100 |
inputs=pdf_input,
|
101 |
outputs=csv_output,
|
102 |
-
title="
|
103 |
-
description="Upload PDF files and get a summarized CSV file."
|
104 |
-
|
|
|
|
|
|
|
|
4 |
import pandas as pd
|
5 |
from transformers import pipeline, AutoTokenizer
|
6 |
import gradio as gr
|
7 |
+
import spaces
|
8 |
|
9 |
# Function to clean text by keeping only alphanumeric characters and spaces
|
10 |
def clean_text(text):
|
|
|
28 |
led_tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384-multi_lexsum-source-long")
|
29 |
|
30 |
# Function to classify text using LED model
|
31 |
+
@spaces.GPU(duration=120)
|
32 |
def classify_text(text):
|
33 |
classifier = pipeline("text-classification", model="allenai/led-base-16384-multi_lexsum-source-long", tokenizer=led_tokenizer, framework="pt")
|
34 |
try:
|
|
|
37 |
return "Unable to classify"
|
38 |
|
39 |
# Function to summarize text using the DistilBART model (sshleifer/distilbart-cnn-12-6)
|
40 |
+
@spaces.GPU(duration=120)
|
41 |
def summarize_text(text, max_length=100, min_length=30):
|
42 |
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", tokenizer="sshleifer/distilbart-cnn-12-6", framework="pt")
|
43 |
try:
|
|
|
46 |
return "Unable to summarize"
|
47 |
|
48 |
# Function to extract a title-like summary from the beginning of the text
|
49 |
+
@spaces.GPU(duration=120)
|
50 |
def extract_title(text, max_length=20):
|
51 |
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", tokenizer="sshleifer/distilbart-cnn-12-6", framework="pt")
|
52 |
try:
|
|
|
55 |
return "Unable to extract title"
|
56 |
|
57 |
# Function to process PDF files and generate summaries
|
58 |
+
@spaces.GPU(duration=120)
|
59 |
def process_pdfs(pdf_files):
|
60 |
data = []
|
61 |
|
|
|
104 |
fn=process_pdfs,
|
105 |
inputs=pdf_input,
|
106 |
outputs=csv_output,
|
107 |
+
title="Dataset creation",
|
108 |
+
description="Upload PDF files and get a summarized CSV file.",
|
109 |
+
article="""<p>This is an experimental app that allows you to create a dataset from research papers.</p>
|
110 |
+
<p>This app uses the allenai/led-base-16384-multi_lexsum-source-long and sshleifer/distilbart-cnn-12-6 AI models.</p>
|
111 |
+
<p>The output file is a CSV with 3 columns: title, abstract, and content.</p>"""
|
112 |
+
).launch(share=True)
|