|
import streamlit as st |
|
import arxiv |
|
import requests |
|
import os |
|
from pathlib import Path |
|
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer |
|
from huggingface_hub import login, HfApi |
|
import fitz |
|
import pandas as pd |
|
from collections import Counter |
|
import re |
|
import json |
|
|
|
|
|
# --- Configuration -----------------------------------------------------------
# Checkpoints backing the two choices offered in the sidebar's model selector.
MODEL_NAME = "google/flan-t5-large"
SECONDARY_MODEL = "facebook/bart-large-cnn"

# Token comes from the environment; the fallback is a placeholder string.
HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN", "your_username/<name>")

# Default Space name shown in the sidebar: the placeholder name is kept while
# the token is still the placeholder, otherwise the "unpaper" namespace is used.
if HUGGINGFACE_TOKEN.startswith("your_username"):
    SPACE_NAME = "your_username/<name>"
else:
    SPACE_NAME = "unpaper/<name>"

# Endpoint queried for the model listing.
HF_API_URL = "https://huggingface.co/api/models"
|
|
|
|
|
# Page-wide CSS: background colours plus the ``badge`` and ``warning`` classes
# used by the HTML snippets rendered further down the page.
custom_css = """
<style>
.main { background-color: #f5f5f5; }
.sidebar .sidebar-content { background-color: #ffffff; }
.badge {
    background-color: #ff4b4b;
    color: white;
    padding: 5px 10px;
    border-radius: 5px;
    display: inline-block;
}
.warning {
    background-color: #fff3cd;
    color: #856404;
    padding: 10px;
    border-radius: 5px;
    margin: 10px 0;
}
</style>
"""
st.markdown(custom_css, unsafe_allow_html=True)
|
|
|
|
|
# Sidebar: every user-tunable input lives here. The resulting module-level
# names (arxiv_id, upload_pdf, space_name, token, model_choice) are read by
# the rest of the script.
with st.sidebar:
    st.title("arXiv Paper Converter")
    st.header("Settings")
    arxiv_id = st.text_input("Enter arXiv ID", "2407.21783")
    upload_pdf = st.file_uploader("Upload PDF", type="pdf")
    space_name = st.text_input("Hugging Face Space Name", SPACE_NAME)
    token = st.text_input("Hugging Face Token", HUGGINGFACE_TOKEN, type="password")
    model_choice = st.selectbox(
        "Select Model",
        ["Text-to-Text (FLAN-T5)", "Text Generation (BART)"],
    )
|
|
|
|
|
# Authenticate against the Hugging Face Hub only when a real token was given.
# The built-in placeholder default (it starts with "your_username") is truthy,
# so it must be excluded explicitly or login() would be called with garbage.
if token and not token.startswith("your_username"):
    try:
        login(token=token)
    except Exception as exc:  # UI boundary: a bad token must not take down the app
        st.sidebar.warning(f"Hugging Face login failed: {exc}")
|
|
|
|
|
@st.cache_data(ttl=3600)
def fetch_hf_models():
    """Fetch the model listing from the Hugging Face API.

    The result is cached by Streamlit for one hour.

    Returns:
        The decoded JSON payload on HTTP 200, otherwise ``None`` (a warning is
        shown in the UI in that case).
    """
    # Only send an Authorization header when there is a token to put in it;
    # "Bearer " followed by nothing is a malformed credential.
    headers = {"Authorization": f"Bearer {token}"} if token else {}
    try:
        # A timeout keeps an unresponsive endpoint from hanging the whole app.
        response = requests.get(HF_API_URL, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException: network/HTTP failure; ValueError: undecodable JSON.
        st.warning(f"Error fetching models: {str(e)}. Using default models.")
        return None

    st.warning("Failed to fetch models from Hugging Face API. Using default models.")
    return None


hf_models = fetch_hf_models()
|
|
|
|
|
@st.cache_resource
def _load_models_for(choice):
    """Build the tokenizer, model and pipeline for one sidebar choice.

    Taking ``choice`` as an argument makes it part of Streamlit's cache key,
    so each model is loaded once and switching models actually takes effect.
    """
    if choice == "Text-to-Text (FLAN-T5)":
        checkpoint, pipeline_task = MODEL_NAME, "text2text-generation"
    else:
        checkpoint, pipeline_task = SECONDARY_MODEL, "summarization"

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    pipeline_model = pipeline(pipeline_task, model=model, tokenizer=tokenizer)
    return tokenizer, model, pipeline_model


def load_models():
    """Return ``(tokenizer, model, pipeline)`` for the currently selected model.

    Previously this zero-argument function was cached directly, so the first
    model loaded was returned forever regardless of the sidebar selection.
    """
    return _load_models_for(model_choice)


tokenizer, model, pipeline_model = load_models()
|
|
|
|
|
def fetch_arxiv_paper(paper_id):
    """Look up a single paper on arXiv by its identifier.

    Args:
        paper_id: arXiv identifier as typed by the user, e.g. ``"2407.21783"``.

    Returns:
        The first result object yielded by the arXiv client for that ID.

    Raises:
        ValueError: If the search yields no result for ``paper_id``.
    """
    client = arxiv.Client()
    # The ID comes from a free-text box; stray whitespace would break the lookup.
    search = arxiv.Search(id_list=[paper_id.strip()])
    # A default on next() avoids leaking a bare StopIteration to the caller.
    paper = next(client.results(search), None)
    if paper is None:
        raise ValueError(f"No arXiv paper found for ID {paper_id!r}")
    return paper
|
|
|
def download_pdf(paper, filename):
    """Ask ``paper`` to save its PDF under ``filename`` and return that name.

    Args:
        paper: Object exposing ``download_pdf(filename=...)``.
        filename: Name to pass through to the download call.

    Returns:
        ``filename``, unchanged.
    """
    target_name = filename
    paper.download_pdf(filename=target_name)
    return target_name
|
|
|
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at ``pdf_path``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        One string containing each page's text, in page order.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)
|
|
|
def analyze_authors(text):
    """Count author names found on ``Author:`` / ``Authors:`` lines of ``text``.

    Each match (case-insensitive) captures the rest of the line after the
    label; the captured text is split on commas and every non-empty,
    whitespace-stripped piece is counted as one author name.

    Args:
        text: Raw text to scan.

    Returns:
        A ``Counter`` mapping author name to number of occurrences.
    """
    author_pattern = r"Authors?:\s*(.+?)(?:\n|$)"
    author_lines = re.findall(author_pattern, text, re.IGNORECASE)
    names = (
        name.strip()
        for line in author_lines
        for name in line.split(",")
    )
    # Trailing or doubled commas yield empty pieces; those are not authors.
    return Counter(name for name in names if name)
|
|
|
def process_text_with_model(text, task="summarize", max_chars=1000):
    """Run the loaded pipeline over the beginning of ``text``.

    Args:
        text: Full document text; only the first ``max_chars`` characters are
            sent to the model.
        task: Instruction prefixed to the prompt for the FLAN-T5 choice.
            Ignored for the BART choice, which receives the raw text.
        max_chars: Number of leading characters of ``text`` to process.

    Returns:
        The generated string, or ``""`` when there is no text to process.
    """
    snippet = text[:max_chars]
    if not snippet.strip():
        # Nothing extractable (e.g. a scanned PDF); don't feed the model an
        # empty input.
        return ""

    if model_choice == "Text-to-Text (FLAN-T5)":
        prompt = f"{task} the following text: {snippet}"
        result = pipeline_model(prompt, max_length=512, num_beams=4)
    else:
        result = pipeline_model(snippet, max_length=512, min_length=30, do_sample=False)

    # text2text-generation pipelines answer under "generated_text" while
    # summarization pipelines use "summary_text"; accept either so the BART
    # path no longer raises KeyError.
    output = result[0]
    if "generated_text" in output:
        return output["generated_text"]
    return output["summary_text"]
|
|
|
def create_huggingface_space(space_name, metadata):
    """Create a public static Space and upload the analysis metadata to it.

    Args:
        space_name: Repository id of the Space (``"owner/name"``).
        metadata: JSON-serialisable dict, uploaded as ``metadata.json``.

    Returns:
        The Space URL on success; ``None`` on failure (the error is shown in
        the UI).
    """
    api = HfApi()
    try:
        # exist_ok lets a second click update the Space instead of failing
        # with "repository already exists".
        api.create_repo(
            repo_id=space_name,
            repo_type="space",
            space_sdk="static",
            private=False,
            exist_ok=True,
        )

        # Upload from memory: no fixed-name scratch file in the working
        # directory that concurrent sessions could overwrite, and nothing to
        # clean up afterwards.
        metadata_bytes = json.dumps(metadata, indent=2).encode("utf-8")
        api.upload_file(
            path_or_fileobj=metadata_bytes,
            path_in_repo="metadata.json",
            repo_id=space_name,
            repo_type="space",
        )

        # A local README.md is optional; its absence must not fail the whole
        # operation after the repo has already been created.
        if os.path.exists("README.md"):
            api.upload_file(
                path_or_fileobj="README.md",
                path_in_repo="README.md",
                repo_id=space_name,
                repo_type="space",
            )

        return f"https://huggingface.co/spaces/{space_name}"
    except Exception as e:  # UI boundary: report any failure instead of crashing
        st.error(f"Failed to create space: {str(e)}")
        return None
|
|
|
|
|
# Page heading, beta badge, and a standing notice about model permissions and
# the public visibility of generated output.
st.title("arXiv Paper to Hugging Face Space Converter")

beta_badge_html = "<div class='badge'>Beta Community - Open Discussion in Community Tab</div>"
st.markdown(beta_badge_html, unsafe_allow_html=True)

usage_warning_html = """
<div class='warning'>
<strong>Warning:</strong> Ensure you have proper permissions to use selected models.
Model outputs are stored in metadata and will be publicly visible in the space.
</div>
"""
st.markdown(usage_warning_html, unsafe_allow_html=True)
|
|
|
|
|
# Main flow: obtain a PDF (an upload takes precedence over the arXiv ID),
# extract and analyse its text, show the results, and optionally publish them.
if arxiv_id or upload_pdf:
    pdf_path = "temp.pdf"
    # Stays None for uploads. The arXiv ID box is pre-filled, so testing
    # ``arxiv_id`` alone cannot tell the two sources apart; doing so used to
    # raise NameError on ``paper.title`` whenever a PDF was uploaded.
    paper = None
    try:
        if upload_pdf:
            with open(pdf_path, "wb") as f:
                f.write(upload_pdf.getbuffer())
        else:
            paper = fetch_arxiv_paper(arxiv_id)
            pdf_path = download_pdf(paper, "temp.pdf")

        text = extract_text_from_pdf(pdf_path)
    finally:
        # The PDF is only needed for text extraction; remove it even when
        # fetching or parsing fails so no stale file is left behind.
        if os.path.exists("temp.pdf"):
            os.remove("temp.pdf")

    author_analysis = analyze_authors(text)

    uses_flan = model_choice == "Text-to-Text (FLAN-T5)"
    summary = process_text_with_model(text, "summarize")
    key_points = process_text_with_model(
        text, "extract key points" if uses_flan else "summarize"
    )

    st.header("Paper Analysis")
    st.subheader("Authors")
    st.dataframe(pd.DataFrame.from_dict(author_analysis, orient='index', columns=['Count']))

    st.subheader("AI Analysis")
    st.write("Summary:", summary)
    st.write("Key Points:", key_points)

    # Everything in this dict is uploaded to the (public) Space.
    metadata = {
        "title": paper.title if paper is not None else "Uploaded PDF",
        "authors": list(author_analysis.keys()),
        # Only report an arXiv ID when the paper really came from arXiv.
        "arxiv_id": arxiv_id if paper is not None else "N/A",
        "model_analysis": {
            "summary": summary,
            "key_points": key_points,
            "model_used": model_choice,
            "model_name": MODEL_NAME if uses_flan else SECONDARY_MODEL,
            "model_license": "Check model card on Hugging Face",
            "processing_date": pd.Timestamp.now().isoformat(),
        },
        "warnings": {
            "model_usage": "Ensure proper model licensing",
            "content_visibility": "All outputs will be public in space",
            "data_source": "Verify arXiv/paper permissions",
        },
    }

    if st.button("Create Hugging Face Space"):
        space_url = create_huggingface_space(space_name, metadata)
        if space_url:
            st.success(f"Space created: {space_url}")
            st.markdown(f"""
<a href="{space_url}" target="_blank">
<img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg"
alt="Hugging Face Space" width="150">
</a>
""", unsafe_allow_html=True)