# Creatingdataset / app.py
# Yoxas's picture
# Update app.py
# adb34bf verified
# raw
# history blame
# 4.25 kB
import os
import re
import pandas as pd
from PyPDF2 import PdfReader
from transformers import pipeline, AutoTokenizer
from gradio import Interface, File
import gradio as gr
import spaces
# Module-level list of result rows. Because it is global, anything appended
# to it persists for the lifetime of the process.
data = []
# Load the LED tokenizer and build a pipeline from the same checkpoint.
# NOTE(review): the checkpoint name (multi_lexsum) suggests a long-document
# summarization model, yet it is loaded under the "text-classification"
# task — confirm this pairing is intended.
led_tokenizer = AutoTokenizer.from_pretrained("allenai/led-base-16384-multi_lexsum-source-long")
classifier = pipeline("text-classification", model="allenai/led-base-16384-multi_lexsum-source-long", tokenizer=led_tokenizer, framework="pt")
# Load the summarization pipeline; model and tokenizer come from the same
# distilbart checkpoint. Both pipelines are created at import time.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6", tokenizer="sshleifer/distilbart-cnn-12-6", framework="pt")
# Function to clean text by keeping only ASCII alphanumeric characters and whitespace.
# The pattern is compiled once because clean_text runs for every chunk of every document.
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')


def clean_text(text):
    """Return *text* with punctuation and other symbols stripped out.

    Every character that is not an ASCII letter, an ASCII digit or
    whitespace is removed. Whitespace (spaces, tabs, newlines) is kept
    as-is; accented and other non-ASCII letters are removed as well.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (possibly empty).
    """
    return _NON_ALNUM_RE.sub('', text)
# Function to split text into chunks of a specified size (counted in words)
def split_text(text, chunk_size=1024):
    """Yield successive chunks of *text*, each at most *chunk_size* words long.

    Words are whitespace-delimited (``str.split()``) and each chunk is
    re-joined with single spaces, so the original whitespace is normalised.

    Args:
        text: The text to split.
        chunk_size: Maximum number of words per chunk; must be positive.

    Yields:
        The chunks in order. The last chunk may be shorter; empty or
        whitespace-only text yields nothing.

    Raises:
        ValueError: If *chunk_size* is not positive. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A negative step would make range() silently produce nothing and drop
    # the whole document; zero would fail with an opaque range() error.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    words = text.split()
    for start in range(0, len(words), chunk_size):
        yield ' '.join(words[start:start + chunk_size])
# Function to classify text using the module-level `classifier` pipeline
@spaces.GPU(duration=120)
def classify_text(text):
    """Return the label of the first prediction the classifier gives for *text*.

    Args:
        text: The text to classify.

    Returns:
        The ``'label'`` value of the first prediction, or the string
        "Unable to classify" if an IndexError is raised along the way.
    """
    try:
        predictions = classifier(text)
        label = predictions[0]['label']
    except IndexError:
        label = "Unable to classify"
    return label
# Function to summarize text using the module-level `summarizer` pipeline
@spaces.GPU(duration=120)
def summarize_text(text, max_length=100, min_length=30):
    """Return the summarizer's summary of *text*.

    Args:
        text: The text to summarize.
        max_length: Forwarded to the pipeline as ``max_length``.
        min_length: Forwarded to the pipeline as ``min_length``.

    Returns:
        The ``'summary_text'`` of the first result, or the string
        "Unable to summarize" if an IndexError is raised along the way.
    """
    generation_options = {
        'max_length': max_length,
        'min_length': min_length,
        'do_sample': False,  # sampling is switched off
    }
    try:
        outputs = summarizer(text, **generation_options)
        summary = outputs[0]['summary_text']
    except IndexError:
        summary = "Unable to summarize"
    return summary
# Function to produce a short, title-like summary of the given text
@spaces.GPU(duration=120)
def extract_title(text, max_length=20):
    """Return a very short summary of *text* to serve as a title.

    Uses the same summarizer pipeline as ``summarize_text`` but with a
    fixed ``min_length`` of 5 and a small default ``max_length``.

    Args:
        text: The text to derive a title from.
        max_length: Forwarded to the pipeline as ``max_length``.

    Returns:
        The ``'summary_text'`` of the first result, or the string
        "Unable to extract title" if an IndexError is raised along the way.
    """
    try:
        candidates = summarizer(
            text,
            max_length=max_length,
            min_length=5,
            do_sample=False,
        )
        best = candidates[0]
        return best['summary_text']
    except IndexError:
        return "Unable to extract title"
# Number of leading words handed to the title extractor, and words per chunk.
_TITLE_WORDS = 512
_CHUNK_WORDS = 512


# Function to read the text of one uploaded PDF.
# NOTE(review): process_files calls extract_text, but no definition of it is
# visible in this file; this implementation follows the contract implied by
# the caller (return None for encrypted files). Confirm and reconcile if a
# definition exists elsewhere.
def extract_text(pdf_file):
    """Return the text of *pdf_file*, or None if the PDF is encrypted.

    Args:
        pdf_file: A path to a PDF, or an upload object exposing the path as
            ``.name`` (which form Gradio passes depends on its version).

    Returns:
        The text of all pages joined with newlines, or None for an
        encrypted PDF.
    """
    pdf_path = getattr(pdf_file, 'name', pdf_file)
    reader = PdfReader(pdf_path)
    if reader.is_encrypted:
        return None
    # extract_text() may yield None/'' for pages without a text layer.
    return '\n'.join(page.extract_text() or '' for page in reader.pages)


# Function invoked by the Gradio interface: turn uploaded PDFs into a CSV
@spaces.GPU(duration=120)
def process_files(pdf_files):
    """Summarize each uploaded PDF and write the results to a CSV file.

    For every readable PDF, a title is derived from its first words, the
    text is split into word chunks, each chunk is summarized and cleaned,
    and the per-chunk results are joined into one row.

    Args:
        pdf_files: Iterable of uploaded PDFs (paths or upload objects), or
            None when nothing was uploaded.

    Returns:
        Path of the written CSV, with columns Title, Abstract and Content.
        The file contains only the header when no PDF could be processed.
    """
    # Rows are collected per call; a module-level list would leak rows from
    # earlier uploads into every later CSV.
    rows = []
    for pdf_file in pdf_files or []:
        text = extract_text(pdf_file)
        # Skip encrypted files
        if text is None:
            continue
        words = text.split()
        # Skip PDFs with no extractable text (e.g. scanned images); there is
        # nothing to summarize.
        if not words:
            continue
        # Derive the title from the first words of the document
        title = extract_title(' '.join(words[:_TITLE_WORDS]))
        abstracts = []
        cleaned_chunks = []
        # Summarize and clean the document chunk by chunk
        for chunk in split_text(text, chunk_size=_CHUNK_WORDS):
            abstracts.append(summarize_text(chunk))
            cleaned_chunks.append(clean_text(chunk))
        rows.append([title, ' '.join(abstracts), ' '.join(cleaned_chunks)])
    # Create a DataFrame from the collected rows and save it as CSV
    df = pd.DataFrame(rows, columns=['Title', 'Abstract', 'Content'])
    output_file_path = 'processed_pdfs.csv'
    df.to_csv(output_file_path, index=False)
    return output_file_path
# Gradio interface: upload several PDFs, download the resulting CSV
pdf_input = gr.File(label="Upload PDF Files", file_types=[".pdf"], file_count="multiple")
csv_output = gr.File(label="Download CSV")
gr.Interface(
    # The handler defined in this file is process_files; the previous value,
    # process_pdfs, is not defined and raised NameError at startup.
    fn=process_files,
    inputs=pdf_input,
    outputs=csv_output,
    title="Dataset creation",
    description="Upload PDF files and get a summarized CSV file.",
    article="""<p>This is an experimental app that allows you to create a dataset from research papers.</p>
<p>This app uses the allenai/led-base-16384-multi_lexsum-source-long and sshleifer/distilbart-cnn-12-6 AI models.</p>
<p>The output file is a CSV with 3 columns: title, abstract, and content.</p>"""
).launch(share=True)