Yoxas committed on
Commit
e960059
·
verified ·
1 Parent(s): b6579fb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -3
app.py CHANGED
@@ -4,6 +4,7 @@ import PyPDF2
4
  import pandas as pd
5
  from transformers import pipeline, AutoTokenizer
6
  import gradio as gr
 
7
 
8
  # Function to clean text by keeping only alphanumeric characters and spaces
9
  def clean_text(text):
@@ -27,6 +28,7 @@ def split_text(text, chunk_size=1024):
27
  led_tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384-multi_lexsum-source-long")
28
 
29
  # Function to classify text using LED model
 
30
  def classify_text(text):
31
  classifier = pipeline("text-classification", model="allenai/led-base-16384-multi_lexsum-source-long", tokenizer=led_tokenizer, framework="pt")
32
  try:
@@ -35,6 +37,7 @@ def classify_text(text):
35
  return "Unable to classify"
36
 
37
  # Function to summarize text using the DistilBART model (sshleifer/distilbart-cnn-12-6)
 
38
  def summarize_text(text, max_length=100, min_length=30):
39
  summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", tokenizer="sshleifer/distilbart-cnn-12-6", framework="pt")
40
  try:
@@ -43,6 +46,7 @@ def summarize_text(text, max_length=100, min_length=30):
43
  return "Unable to summarize"
44
 
45
  # Function to extract a title-like summary from the beginning of the text
 
46
  def extract_title(text, max_length=20):
47
  summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", tokenizer="sshleifer/distilbart-cnn-12-6", framework="pt")
48
  try:
@@ -51,6 +55,7 @@ def extract_title(text, max_length=20):
51
  return "Unable to extract title"
52
 
53
  # Function to process PDF files and generate summaries
 
54
  def process_pdfs(pdf_files):
55
  data = []
56
 
@@ -99,6 +104,9 @@ gr.Interface(
99
  fn=process_pdfs,
100
  inputs=pdf_input,
101
  outputs=csv_output,
102
- title="PDF Summarizer",
103
- description="Upload PDF files and get a summarized CSV file."
104
- ).launch()
 
 
 
 
4
  import pandas as pd
5
  from transformers import pipeline, AutoTokenizer
6
  import gradio as gr
7
+ import spaces
8
 
9
  # Function to clean text by keeping only alphanumeric characters and spaces
10
  def clean_text(text):
 
28
  led_tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384-multi_lexsum-source-long")
29
 
30
  # Function to classify text using LED model
31
+ @spaces.GPU(duration=120)
32
  def classify_text(text):
33
  classifier = pipeline("text-classification", model="allenai/led-base-16384-multi_lexsum-source-long", tokenizer=led_tokenizer, framework="pt")
34
  try:
 
37
  return "Unable to classify"
38
 
39
  # Function to summarize text using the DistilBART model (sshleifer/distilbart-cnn-12-6)
40
+ @spaces.GPU(duration=120)
41
  def summarize_text(text, max_length=100, min_length=30):
42
  summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", tokenizer="sshleifer/distilbart-cnn-12-6", framework="pt")
43
  try:
 
46
  return "Unable to summarize"
47
 
48
  # Function to extract a title-like summary from the beginning of the text
49
+ @spaces.GPU(duration=120)
50
  def extract_title(text, max_length=20):
51
  summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", tokenizer="sshleifer/distilbart-cnn-12-6", framework="pt")
52
  try:
 
55
  return "Unable to extract title"
56
 
57
  # Function to process PDF files and generate summaries
58
+ @spaces.GPU(duration=120)
59
  def process_pdfs(pdf_files):
60
  data = []
61
 
 
104
  fn=process_pdfs,
105
  inputs=pdf_input,
106
  outputs=csv_output,
107
+ title="Dataset creation",
108
+ description="Upload PDF files and get a summarized CSV file.",
109
+ article="""<p>This is an experimental app that allows you to create a dataset from research papers.</p>
110
+ <p>This app uses the allenai/led-base-16384-multi_lexsum-source-long and sshleifer/distilbart-cnn-12-6 AI models.</p>
111
+ <p>The output file is a CSV with 3 columns: title, abstract, and content.</p>"""
112
+ ).launch(share=True)