Spaces:

unpaper
/

AddPaper

Sleeping

App Files Files Community

AddPaper / app.py

katsukiai

Update app.py

1f53a43 verified 3 months ago

raw

history blame

7.69 kB

	import streamlit as st
	import arxiv
	import requests
	import os
	from pathlib import Path
	from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
	from huggingface_hub import login, HfApi
	import fitz # PyMuPDF
	import pandas as pd
	from collections import Counter
	import re
	import json

	# Constants: model checkpoints and Hugging Face configuration.
	MODEL_NAME = "google/flan-t5-large"  # loaded for the text2text-generation pipeline
	SECONDARY_MODEL = "facebook/bart-large-cnn"  # loaded for the summarization pipeline
	HF_API_URL = "https://huggingface.co/api/models"
	# The token is read from the environment; the fallback is a placeholder
	# string, not a usable credential.
	HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN", "your_username/<name>")
	# Default space name: the placeholder namespace while the token is still the
	# placeholder, the "unpaper" namespace otherwise.
	if HUGGINGFACE_TOKEN.startswith("your_username"):
	    SPACE_NAME = "your_username/<name>"
	else:
	    SPACE_NAME = "unpaper/<name>"

	# CSS: page and sidebar backgrounds plus the "badge" and "warning" boxes
	# that the main page renders further down.
	_PAGE_STYLES = """
	<style>
	.main { background-color: #f5f5f5; }
	.sidebar .sidebar-content { background-color: #ffffff; }
	.badge {
	background-color: #ff4b4b;
	color: white;
	padding: 5px 10px;
	border-radius: 5px;
	display: inline-block;
	}
	.warning {
	background-color: #fff3cd;
	color: #856404;
	padding: 10px;
	border-radius: 5px;
	margin: 10px 0;
	}
	</style>
	"""
	# Raw HTML is enabled so the <style> element is emitted as markup.
	st.markdown(_PAGE_STYLES, unsafe_allow_html=True)

	# Sidebar: all user-adjustable inputs are collected here, in display order.
	_sidebar = st.sidebar
	_sidebar.title("arXiv Paper Converter")
	_sidebar.header("Settings")
	# Paper source: a pre-filled arXiv identifier or an uploaded PDF.
	arxiv_id = _sidebar.text_input("Enter arXiv ID", "2407.21783")
	upload_pdf = _sidebar.file_uploader("Upload PDF", type="pdf")
	# Destination space and credentials, pre-filled from the module constants.
	space_name = _sidebar.text_input("Hugging Face Space Name", SPACE_NAME)
	token = _sidebar.text_input("Hugging Face Token", HUGGINGFACE_TOKEN, type="password")
	# These labels are compared verbatim by the model-loading and
	# text-processing code, so they must not be reworded.
	_model_labels = ["Text-to-Text (FLAN-T5)", "Text Generation (BART)"]
	model_choice = _sidebar.selectbox("Select Model", _model_labels)

	# Login to Hugging Face.
	# The default token value is the "your_username/..." placeholder, which is
	# truthy but not a credential, so it is skipped here. A rejected or
	# unreachable login is reported in the page instead of aborting the whole
	# script before any widget below is rendered.
	if token and not token.startswith("your_username"):
	    try:
	        login(token=token)
	    except Exception as exc:  # UI boundary: surface the failure, keep the app up
	        st.error(f"Hugging Face login failed: {exc}")

	# Fetch available models from Hugging Face API
	@st.cache_data(ttl=3600)
	def fetch_hf_models():
	    """Return the JSON model listing from ``HF_API_URL``, or ``None`` on failure.

	    The result is cached by Streamlit for one hour (``ttl=3600``). Any
	    non-200 response or raised exception is shown as a warning in the page
	    and yields ``None``.
	    """
	    # An empty or placeholder token is not a credential, so the request is
	    # sent without an Authorization header in that case.
	    headers = {}
	    if token and not token.startswith("your_username"):
	        headers["Authorization"] = f"Bearer {token}"
	    try:
	        # The timeout keeps an unresponsive endpoint from blocking the
	        # script run indefinitely.
	        response = requests.get(HF_API_URL, headers=headers, timeout=10)
	        if response.status_code == 200:
	            return response.json()
	        st.warning("Failed to fetch models from Hugging Face API. Using default models.")
	        return None
	    except Exception as e:  # deliberate best-effort: fall back to defaults
	        st.warning(f"Error fetching models: {str(e)}. Using default models.")
	        return None


	hf_models = fetch_hf_models()

	# Initialize models
	@st.cache_resource
	def load_models(choice=None):
	    """Load the tokenizer, model and pipeline for the selected model.

	    Args:
	        choice: The sidebar label of the model to load. When omitted, the
	            current ``model_choice`` is used.

	    Returns:
	        A ``(tokenizer, model, pipeline_model)`` tuple. The FLAN-T5 label
	        yields a ``text2text-generation`` pipeline on ``MODEL_NAME``; any
	        other label yields a ``summarization`` pipeline on
	        ``SECONDARY_MODEL``.
	    """
	    if choice is None:
	        choice = model_choice
	    if choice == "Text-to-Text (FLAN-T5)":
	        checkpoint, task = MODEL_NAME, "text2text-generation"
	    else:
	        checkpoint, task = SECONDARY_MODEL, "summarization"
	    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
	    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
	    pipeline_model = pipeline(task, model=model, tokenizer=tokenizer)
	    return tokenizer, model, pipeline_model


	# The selection is passed as an argument so that it becomes part of the
	# cache key; with no arguments the first model loaded would be returned for
	# every later selection.
	tokenizer, model, pipeline_model = load_models(model_choice)

	# Functions
	def fetch_arxiv_paper(paper_id):
	    """Look up a single arXiv entry by its identifier.

	    Args:
	        paper_id: The arXiv identifier to search for.

	    Returns:
	        The first result produced by the arXiv client for that identifier.

	    Raises:
	        ValueError: If the search produces no result.
	    """
	    client = arxiv.Client()
	    search = arxiv.Search(id_list=[paper_id])
	    # A default is supplied so that an empty result set becomes a clear
	    # error instead of a bare StopIteration escaping this function.
	    paper = next(client.results(search), None)
	    if paper is None:
	        raise ValueError(f"No arXiv paper found for ID {paper_id!r}")
	    return paper

	def download_pdf(paper, filename):
	    """Save *paper*'s PDF under *filename* and hand the same name back.

	    The work is delegated to ``paper.download_pdf``; the value returned
	    here is the ``filename`` argument itself, not the delegate's result.
	    """
	    target = filename
	    paper.download_pdf(filename=target)
	    return target

	def extract_text_from_pdf(pdf_path):
	    """Return the concatenated text of every page in the PDF at *pdf_path*.

	    Pages are read in document order and joined without a separator.
	    """
	    # The context manager closes the document once its text has been read,
	    # so the file handle is released before the caller deletes the file.
	    with fitz.open(pdf_path) as doc:
	        return "".join(page.get_text() for page in doc)

	def analyze_authors(text):
	    """Count author names found after ``Author:`` / ``Authors:`` markers.

	    Each marker (matched case-insensitively) is followed by a
	    comma-separated list of names running to the end of that line or the
	    end of the text. Names are stripped of surrounding whitespace and
	    empty fragments (for example from a trailing comma) are ignored.

	    Args:
	        text: The text to scan.

	    Returns:
	        A ``Counter`` mapping each author name to its number of occurrences.
	"""
	    # The terminator is an alternation: a newline OR the end of the text.
	    # With the pipe escaped it would be a literal "|" and the pattern would
	    # only match text that ends in "\n|".
	    author_pattern = r"Author[s]?:\s*(.+?)(?:\n|$)"
	    counts = Counter()
	    for listing in re.findall(author_pattern, text, re.IGNORECASE):
	        counts.update(name.strip() for name in listing.split(",") if name.strip())
	    return counts

	def process_text_with_model(text, task="summarize"):
	    """Run the loaded pipeline on the first 1000 characters of *text*.

	    Args:
	        text: The source text; only ``text[:1000]`` is sent to the model.
	        task: Instruction verb placed at the start of the FLAN-T5 prompt.
	            It is not used when the BART summarizer is selected.

	    Returns:
	        The text produced by the pipeline for the first (only) input.
	    """
	    excerpt = text[:1000]
	    if model_choice == "Text-to-Text (FLAN-T5)":
	        prompt = f"{task} the following text: {excerpt}"
	        result = pipeline_model(prompt, max_length=512, num_beams=4)
	    else:
	        result = pipeline_model(excerpt, max_length=512, min_length=30, do_sample=False)
	    first = result[0]
	    # text2text-generation pipelines report their output under
	    # "generated_text", summarization pipelines under "summary_text";
	    # reading only the former raises KeyError on the BART branch.
	    if "generated_text" in first:
	        return first["generated_text"]
	    return first["summary_text"]

	def create_huggingface_space(space_name, metadata):
	    """Create a public static Space and upload the analysis metadata to it.

	    Args:
	        space_name: Repository id of the Space to create.
	        metadata: JSON-serialisable dict written to ``metadata.json`` in
	            the new Space.

	    Returns:
	        The URL of the Space on success, or ``None`` if any step raised
	        (the error is shown in the page).
	    """
	    api = HfApi()
	    try:
	        api.create_repo(repo_id=space_name, repo_type="space", space_sdk="static", private=False)
	        # Upload metadata
	        with open("metadata.json", "w") as f:
	            json.dump(metadata, f, indent=2)
	        api.upload_file(
	            path_or_fileobj="metadata.json",
	            path_in_repo="metadata.json",
	            repo_id=space_name,
	            repo_type="space"
	        )
	        # The local README.md is uploaded only when it exists; otherwise a
	        # missing file would turn an already-created Space into a reported
	        # failure.
	        if os.path.exists("README.md"):
	            api.upload_file(
	                path_or_fileobj="README.md",
	                path_in_repo="README.md",
	                repo_id=space_name,
	                repo_type="space"
	            )
	        return f"https://huggingface.co/spaces/{space_name}"
	    except Exception as e:
	        st.error(f"Failed to create space: {str(e)}")
	        return None
	    finally:
	        # The temporary metadata file is removed on success and failure alike.
	        if os.path.exists("metadata.json"):
	            os.remove("metadata.json")

	# Main App: page title followed by the community badge.
	st.title("arXiv Paper to Hugging Face Space Converter")
	_badge_html = "<div class='badge'>Beta Community - Open Discussion in Community Tab</div>"
	st.markdown(_badge_html, unsafe_allow_html=True)

	# Warning about model usage, rendered with the "warning" CSS class.
	_usage_warning_html = """
	<div class='warning'>
	<strong>Warning:</strong> Ensure you have proper permissions to use selected models.
	Model outputs are stored in metadata and will be publicly visible in the space.
	</div>
	"""
	st.markdown(_usage_warning_html, unsafe_allow_html=True)

	# Process arXiv or PDF
	if arxiv_id or upload_pdf:
	    # `paper` stays None for uploads. The arXiv ID field is pre-filled, so
	    # it is normally non-empty even when a PDF was uploaded; keying the
	    # metadata on `arxiv_id` would then read a `paper` that was never
	    # fetched.
	    paper = None
	    pdf_path = "temp.pdf"
	    try:
	        if upload_pdf:
	            # An uploaded file takes precedence over the arXiv ID.
	            with open(pdf_path, "wb") as f:
	                f.write(upload_pdf.getbuffer())
	        else:
	            paper = fetch_arxiv_paper(arxiv_id)
	            pdf_path = download_pdf(paper, "temp.pdf")

	        # Extract and analyze
	        text = extract_text_from_pdf(pdf_path)
	    finally:
	        # Cleanup: the PDF is only needed for text extraction, and it is
	        # removed even when fetching or parsing raises.
	        if os.path.exists("temp.pdf"):
	            os.remove("temp.pdf")

	    author_analysis = analyze_authors(text)

	    # Model processing
	    summary = process_text_with_model(text, "summarize")
	    key_points = process_text_with_model(text, "extract key points" if model_choice == "Text-to-Text (FLAN-T5)" else "summarize")

	    # Display results
	    st.header("Paper Analysis")
	    st.subheader("Authors")
	    st.dataframe(pd.DataFrame.from_dict(author_analysis, orient='index', columns=['Count']))

	    st.subheader("AI Analysis")
	    st.write("Summary:", summary)
	    st.write("Key Points:", key_points)

	    # Enhanced metadata; title and arXiv ID are recorded only when the
	    # paper was actually fetched from arXiv.
	    metadata = {
	        "title": paper.title if paper is not None else "Uploaded PDF",
	        "authors": list(author_analysis.keys()),
	        "arxiv_id": arxiv_id if paper is not None else "N/A",
	        "model_analysis": {
	            "summary": summary,
	            "key_points": key_points,
	            "model_used": model_choice,
	            "model_name": MODEL_NAME if model_choice == "Text-to-Text (FLAN-T5)" else SECONDARY_MODEL,
	            "model_license": "Check model card on Hugging Face",
	            "processing_date": pd.Timestamp.now().isoformat()
	        },
	        "warnings": {
	            "model_usage": "Ensure proper model licensing",
	            "content_visibility": "All outputs will be public in space",
	            "data_source": "Verify arXiv/paper permissions"
	        }
	    }

	    # Create Space
	    if st.button("Create Hugging Face Space"):
	        space_url = create_huggingface_space(space_name, metadata)
	        if space_url:
	            st.success(f"Space created: {space_url}")
	            st.markdown(f"""
	<a href="{space_url}" target="_blank">
	<img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg"
	alt="Hugging Face Space" width="150">
	</a>
	""", unsafe_allow_html=True)