Spaces:

Yoxas
/

Creatingdataset

Runtime error

App Files Files Community

Creatingdataset / app.py

Yoxas

Update app.py

adb34bf verified about 1 year ago

raw

history blame

4.25 kB

	import os
	import re
	import pandas as pd
	from PyPDF2 import PdfReader
	from transformers import pipeline, AutoTokenizer
	from gradio import Interface, File
	import gradio as gr
	import spaces

	# Module-level list for storing row data.
	data = []

	# Hugging Face Hub checkpoint ids used by the pipelines below.
	LED_CHECKPOINT = "allenai/led-base-16384-multi_lexsum-source-long"
	SUMMARIZER_CHECKPOINT = "sshleifer/distilbart-cnn-12-6"

	# LED tokenizer and the text-classification pipeline built on the same checkpoint.
	# NOTE(review): the checkpoint name suggests a summarization fine-tune; confirm it
	# actually provides a trained classification head before relying on its labels.
	led_tokenizer = AutoTokenizer.from_pretrained(LED_CHECKPOINT)
	classifier = pipeline(
	    "text-classification",
	    model=LED_CHECKPOINT,
	    tokenizer=led_tokenizer,
	    framework="pt",
	)

	# Summarization pipeline (model and tokenizer come from the same checkpoint).
	summarizer = pipeline(
	    "summarization",
	    model=SUMMARIZER_CHECKPOINT,
	    tokenizer=SUMMARIZER_CHECKPOINT,
	    framework="pt",
	)

	def clean_text(text):
	    """Return *text* stripped of everything except letters, digits and whitespace.

	    Only ASCII letters (a-z, A-Z) and digits (0-9) are kept, together with any
	    whitespace character; punctuation, symbols and accented/non-ASCII letters
	    are removed. Whitespace is left exactly as it was (no collapsing).

	    Args:
	        text: The string to clean.

	    Returns:
	        The cleaned string; an empty string stays empty.
	    """
	    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

	def split_text(text, chunk_size=1024):
	    """Yield consecutive chunks of *text*, each holding at most *chunk_size* words.

	    The text is split on whitespace, so "size" counts whitespace-separated
	    words (not characters or model tokens). Words inside a chunk are re-joined
	    with single spaces; the last chunk may be shorter than *chunk_size*.

	    Args:
	        text: The string to split.
	        chunk_size: Maximum number of words per chunk; must be positive.

	    Yields:
	        One string per chunk, in document order. Nothing is yielded for
	        empty or whitespace-only text.

	    Raises:
	        ValueError: If *chunk_size* is not positive. Because this is a
	            generator, the error surfaces when iteration starts.
	    """
	    # A negative step would make range() empty and silently drop all text,
	    # and a zero step fails with an unrelated-looking message; fail clearly.
	    if chunk_size <= 0:
	        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

	    words = text.split()
	    for start in range(0, len(words), chunk_size):
	        yield ' '.join(words[start:start + chunk_size])

	# Function to classify text using LED model
	@spaces.GPU(duration=120)
	def classify_text(text):
	    """Return the label of the first result the `classifier` pipeline gives for *text*.

	    Args:
	        text: The text to classify.

	    Returns:
	        The `'label'` value of the first pipeline result, or the string
	        "Unable to classify" if an IndexError is raised while classifying.
	    """
	    try:
	        # truncation=True asks the tokenizer to cut over-long input down to its
	        # configured maximum length instead of passing it to the model as-is.
	        return classifier(text, truncation=True)[0]['label']
	    except IndexError:
	        return "Unable to classify"

	# Function to summarize text using the summarizer model
	@spaces.GPU(duration=120)
	def summarize_text(text, max_length=100, min_length=30):
	    """Summarize *text* with the `summarizer` pipeline (deterministic decoding).

	    Args:
	        text: The text to summarize.
	        max_length: Upper bound passed to the pipeline for the summary length.
	        min_length: Lower bound passed to the pipeline for the summary length.

	    Returns:
	        The `'summary_text'` of the first pipeline result, or the string
	        "Unable to summarize" if an IndexError is raised while summarizing.
	    """
	    try:
	        # truncation=True asks the tokenizer to cut over-long input down to its
	        # configured maximum length, so a dense chunk is still summarized
	        # rather than falling through to the IndexError fallback.
	        results = summarizer(
	            text,
	            max_length=max_length,
	            min_length=min_length,
	            do_sample=False,
	            truncation=True,
	        )
	        return results[0]['summary_text']
	    except IndexError:
	        return "Unable to summarize"

	# Function to extract a title-like summary from the beginning of the text
	@spaces.GPU(duration=120)
	def extract_title(text, max_length=20):
	    """Produce a short, title-like summary of *text* with the `summarizer` pipeline.

	    Args:
	        text: The text to derive a title from (callers pass the start of a document).
	        max_length: Upper bound passed to the pipeline for the title length;
	            the lower bound is fixed at 5.

	    Returns:
	        The `'summary_text'` of the first pipeline result, or the string
	        "Unable to extract title" if an IndexError is raised.
	    """
	    try:
	        # truncation=True asks the tokenizer to cut over-long input down to its
	        # configured maximum length instead of passing it to the model as-is.
	        results = summarizer(
	            text,
	            max_length=max_length,
	            min_length=5,
	            do_sample=False,
	            truncation=True,
	        )
	        return results[0]['summary_text']
	    except IndexError:
	        return "Unable to extract title"

	# Define the folder path and CSV file path
	# output_folder_path = '/content/drive/My Drive/path_to_output' # Adjust this to your actual path

	# Helper that reads the text of a single uploaded PDF
	def extract_text(pdf_file):
	    """Return the text of every page of *pdf_file*, or None if the PDF is encrypted.

	    Args:
	        pdf_file: Either a filesystem path or an object exposing the path as
	            `.name` (presumably what Gradio hands over, depending on its
	            version — verify against the installed Gradio).

	    Returns:
	        The page texts joined with newlines, or None for an encrypted PDF so
	        the caller can skip it.
	    """
	    pdf_path = getattr(pdf_file, "name", pdf_file)
	    reader = PdfReader(pdf_path)
	    if reader.is_encrypted:
	        return None
	    # `or ''` guards against a page yielding no text (e.g. image-only pages).
	    return '\n'.join(page.extract_text() or '' for page in reader.pages)

	# Gradio handler: turn uploaded PDFs into a downloadable CSV
	@spaces.GPU(duration=120)
	def process_files(pdf_files):
	    """Build a CSV with one (Title, Abstract, Content) row per readable PDF.

	    For each uploaded file the text is extracted, a title is generated from
	    its first 512 words, and the full text is processed in 512-word chunks:
	    each chunk is summarized and cleaned, and the per-chunk results are joined
	    with spaces.

	    Args:
	        pdf_files: Iterable of uploaded PDFs (paths or file-like objects with
	            `.name`); None is treated as "no files".

	    Returns:
	        The path of the written CSV file, 'processed_pdfs.csv'. With no
	        readable input the CSV contains only the header row.
	    """
	    # Rows are collected locally so that repeated calls do not carry over
	    # rows from earlier uploads.
	    rows = []

	    for pdf_file in pdf_files or []:
	        text = extract_text(pdf_file)

	        # Skip encrypted files
	        if text is None:
	            continue

	        # Title comes from the first 512 whitespace-separated words.
	        title = extract_title(' '.join(text.split()[:512]))

	        abstracts = []
	        cleaned_chunks = []
	        for chunk in split_text(text, chunk_size=512):
	            abstracts.append(summarize_text(chunk))
	            cleaned_chunks.append(clean_text(chunk))

	        rows.append([title, ' '.join(abstracts), ' '.join(cleaned_chunks)])

	    df = pd.DataFrame(rows, columns=['Title', 'Abstract', 'Content'])

	    # Save the DataFrame to a CSV file
	    output_file_path = 'processed_pdfs.csv'
	    df.to_csv(output_file_path, index=False)

	    return output_file_path

	# Gradio interface: multiple PDF uploads in, one CSV download out
	pdf_input = gr.File(label="Upload PDF Files", file_types=[".pdf"], file_count="multiple")
	csv_output = gr.File(label="Download CSV")

	# The handler must be `process_files`, the function defined in this file;
	# no `process_pdfs` exists, so referencing it fails at start-up.
	gr.Interface(
	    fn=process_files,
	    inputs=pdf_input,
	    outputs=csv_output,
	    title="Dataset creation",
	    description="Upload PDF files and get a summarized CSV file.",
	    article="""<p>This is an experimental app that allows you to create a dataset from research papers.</p>
	<p>This app uses the allenai/led-base-16384-multi_lexsum-source-long and sshleifer/distilbart-cnn-12-6 AI models.</p>
	<p>The output file is a CSV with 3 columns: title, abstract, and content.</p>"""
	).launch(share=True)