Spaces:

unpaper
/

AddPaper

Sleeping

File size: 7,687 Bytes

import streamlit as st
import arxiv
import requests
import os
from pathlib import Path
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
from huggingface_hub import login, HfApi
import fitz  # PyMuPDF
import pandas as pd
from collections import Counter
import re
import json

# Constants
# Model checkpoints and the Hub endpoint used by the app.
MODEL_NAME = "google/flan-t5-large"
SECONDARY_MODEL = "facebook/bart-large-cnn"
HF_API_URL = "https://huggingface.co/api/models"

# The token is read from the environment; the fallback is a placeholder that
# starts with "your_username", which is also how the placeholder is detected.
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN", "your_username/<name>")

# Default Space name offered in the sidebar: the placeholder name when only the
# placeholder token is available, otherwise the "unpaper" namespace.
if HUGGINGFACE_TOKEN.startswith("your_username"):
    SPACE_NAME = "your_username/<name>"
else:
    SPACE_NAME = "unpaper/<name>"

# CSS
# Page-wide styles: background colors plus the `.badge` and `.warning` classes
# used by the HTML snippets rendered further down the script.
_PAGE_STYLES = """
    <style>
    .main { background-color: #f5f5f5; }
    .sidebar .sidebar-content { background-color: #ffffff; }
    .badge { 
        background-color: #ff4b4b; 
        color: white; 
        padding: 5px 10px; 
        border-radius: 5px; 
        display: inline-block; 
    }
    .warning {
        background-color: #fff3cd;
        color: #856404;
        padding: 10px;
        border-radius: 5px;
        margin: 10px 0;
    }
    </style>
"""
# unsafe_allow_html is required for the <style> tag to be emitted as markup.
st.markdown(_PAGE_STYLES, unsafe_allow_html=True)

# Sidebar
# All user inputs live in the sidebar; the names bound here (arxiv_id,
# upload_pdf, space_name, token, model_choice) are read by the rest of the script.
_sidebar = st.sidebar
_sidebar.title("arXiv Paper Converter")
_sidebar.header("Settings")

# Paper source: either an arXiv identifier or an uploaded PDF.
arxiv_id = _sidebar.text_input("Enter arXiv ID", value="2407.21783")
upload_pdf = _sidebar.file_uploader("Upload PDF", type="pdf")

# Hugging Face destination and credentials (token field is masked).
space_name = _sidebar.text_input("Hugging Face Space Name", value=SPACE_NAME)
token = _sidebar.text_input("Hugging Face Token", value=HUGGINGFACE_TOKEN, type="password")

# Which of the two supported pipelines to use for the analysis.
model_choice = _sidebar.selectbox(
    "Select Model",
    options=["Text-to-Text (FLAN-T5)", "Text Generation (BART)"],
)

# Login to Hugging Face
# HUGGINGFACE_TOKEN falls back to a "your_username..." placeholder when the
# environment variable is unset. That placeholder is truthy but is not a real
# credential, so it must not be handed to login().
if token and not token.startswith("your_username"):
    try:
        login(token=token)
    except Exception as e:
        # Top-level boundary: a rejected or unverifiable token is reported in
        # the sidebar instead of aborting the whole script run.
        st.sidebar.error(f"Hugging Face login failed: {e}")

# Fetch available models from Hugging Face API
@st.cache_data(ttl=3600)
def fetch_hf_models():
    """Return the JSON model listing from the Hugging Face API, or None on failure.

    The sidebar ``token`` is sent as a Bearer token when it looks like a real
    credential. Failures are reported with ``st.warning`` and yield ``None``.

    Returns:
        The decoded JSON body on HTTP 200, otherwise ``None``.
    """
    # Do not send the "your_username..." placeholder as a credential.
    headers = {}
    if token and not token.startswith("your_username"):
        headers["Authorization"] = f"Bearer {token}"
    try:
        # A timeout keeps a stalled connection from hanging the script run.
        response = requests.get(HF_API_URL, headers=headers, timeout=15)
        if response.status_code == 200:
            return response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers network/timeout errors; ValueError covers a
        # body that is not valid JSON.
        st.warning(f"Error fetching models: {str(e)}. Using default models.")
        return None
    st.warning("Failed to fetch models from Hugging Face API. Using default models.")
    return None

hf_models = fetch_hf_models()

# Initialize models
@st.cache_resource
def load_models(choice=None):
    """Load the tokenizer, model and pipeline for the selected model option.

    Args:
        choice: The sidebar model label. It is an explicit argument so that it
            becomes part of the ``st.cache_resource`` key; without it the first
            model loaded would be returned for every later selection. ``None``
            falls back to the global ``model_choice`` for argument-less callers.

    Returns:
        A ``(tokenizer, model, pipeline_model)`` tuple.
    """
    if choice is None:
        choice = model_choice

    # FLAN-T5 is driven as a text2text generator; anything else uses the
    # secondary checkpoint as a summarizer.
    if choice == "Text-to-Text (FLAN-T5)":
        checkpoint, pipeline_task = MODEL_NAME, "text2text-generation"
    else:
        checkpoint, pipeline_task = SECONDARY_MODEL, "summarization"

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    pipeline_model = pipeline(pipeline_task, model=model, tokenizer=tokenizer)
    return tokenizer, model, pipeline_model

# Pass the selection explicitly so switching models reloads the right pipeline.
tokenizer, model, pipeline_model = load_models(model_choice)

# Functions
def fetch_arxiv_paper(paper_id):
    """Look up a single paper on arXiv by its identifier.

    Args:
        paper_id: arXiv identifier string; surrounding whitespace is ignored.

    Returns:
        The first result yielded by the arXiv client for that identifier.

    Raises:
        ValueError: If ``paper_id`` is empty or no paper matches it.
    """
    paper_id = (paper_id or "").strip()
    if not paper_id:
        raise ValueError("An arXiv ID is required.")

    client = arxiv.Client()
    search = arxiv.Search(id_list=[paper_id])
    # A default avoids leaking a bare StopIteration when nothing matches.
    paper = next(client.results(search), None)
    if paper is None:
        raise ValueError(f"No arXiv paper found for ID '{paper_id}'.")
    return paper

def download_pdf(paper, filename):
    """Save the paper's PDF under ``filename`` and hand that name back.

    Args:
        paper: Object exposing ``download_pdf(filename=...)``.
        filename: Name to store the PDF under.

    Returns:
        The ``filename`` argument, unchanged.
    """
    target = filename
    paper.download_pdf(filename=target)
    return target

def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The page texts joined in page order, with no separator added.
    """
    # The context manager closes the document (and its file handle) even if a
    # page fails to extract.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

def analyze_authors(text):
    """Count author names found on ``Author:`` / ``Authors:`` lines.

    Each match (case-insensitive) is split on commas and every piece is
    stripped of surrounding whitespace. Empty pieces, such as those produced by
    a trailing comma or a whitespace-only line, are discarded.

    Args:
        text: Document text to scan.

    Returns:
        A ``Counter`` mapping each author name to its number of occurrences.
    """
    author_pattern = r"Author[s]?:\s*(.+?)(?:\n|$)"
    author_lines = re.findall(author_pattern, text, re.IGNORECASE)
    return Counter(
        name
        for line in author_lines
        for name in (piece.strip() for piece in line.split(","))
        if name  # skip blanks so "" is never reported as an author
    )

def process_text_with_model(text, task="summarize"):
    """Run the first 1000 characters of ``text`` through the active pipeline.

    Args:
        text: Document text; only ``text[:1000]`` is sent to the model.
        task: Instruction prepended to the prompt on the FLAN-T5 branch. It is
            not used on the other branch, which passes the text directly.

    Returns:
        The output string of the first pipeline result.

    Raises:
        KeyError: If the first result carries neither known output key.
    """
    excerpt = text[:1000]
    if model_choice == "Text-to-Text (FLAN-T5)":
        prompt = f"{task} the following text: {excerpt}"
        result = pipeline_model(prompt, max_length=512, num_beams=4)
    else:
        result = pipeline_model(excerpt, max_length=512, min_length=30, do_sample=False)

    # text2text-generation pipelines return "generated_text" while
    # summarization pipelines return "summary_text". Accept either so the
    # summarization branch no longer fails with a KeyError.
    first = result[0]
    for key in ("generated_text", "summary_text"):
        if key in first:
            return first[key]
    raise KeyError(
        f"Pipeline output has no 'generated_text' or 'summary_text' key: {sorted(first)}"
    )

def create_huggingface_space(space_name, metadata):
    """Create a public static Space and upload the analysis metadata to it.

    Args:
        space_name: Repo id of the Space to create.
        metadata: JSON-serializable dict stored as ``metadata.json`` in the Space.

    Returns:
        The Space URL on success, or ``None`` after reporting the failure with
        ``st.error``.
    """
    api = HfApi()
    try:
        # Serialize before touching the Hub so unserializable metadata fails
        # without leaving an empty Space behind. Uploading the bytes directly
        # also avoids a shared "metadata.json" file in the working directory.
        metadata_bytes = json.dumps(metadata, indent=2).encode("utf-8")

        api.create_repo(repo_id=space_name, repo_type="space", space_sdk="static", private=False)
        # Upload metadata
        api.upload_file(
            path_or_fileobj=metadata_bytes,
            path_in_repo="metadata.json",
            repo_id=space_name,
            repo_type="space",
        )
        # A local README.md is optional: when it is absent the Space is still
        # created successfully instead of being reported as a failure.
        # NOTE(review): this uploads whatever README.md sits in the working
        # directory — confirm that is the intended file.
        if os.path.exists("README.md"):
            api.upload_file(
                path_or_fileobj="README.md",
                path_in_repo="README.md",
                repo_id=space_name,
                repo_type="space",
            )
        return f"https://huggingface.co/spaces/{space_name}"
    except Exception as e:
        # UI boundary: surface any Hub or serialization error to the user.
        st.error(f"Failed to create space: {str(e)}")
        return None

# Main App
st.title("arXiv Paper to Hugging Face Space Converter")

# Beta badge, styled by the `.badge` CSS class.
_badge_html = "<div class='badge'>Beta Community - Open Discussion in Community Tab</div>"
st.markdown(_badge_html, unsafe_allow_html=True)

# Warning about model usage, styled by the `.warning` CSS class.
_usage_warning_html = """
    <div class='warning'>
        <strong>Warning:</strong> Ensure you have proper permissions to use selected models.
        Model outputs are stored in metadata and will be publicly visible in the space.
    </div>
"""
st.markdown(_usage_warning_html, unsafe_allow_html=True)

# Process arXiv or PDF
if arxiv_id or upload_pdf:
    # An uploaded PDF takes precedence over the arXiv ID. `paper` stays None
    # for uploads so the metadata below can tell the two sources apart even
    # when the arXiv ID field still holds a value.
    paper = None
    pdf_path = "temp.pdf"
    if upload_pdf:
        with open(pdf_path, "wb") as f:
            f.write(upload_pdf.getbuffer())
    else:
        try:
            paper = fetch_arxiv_paper(arxiv_id)
            pdf_path = download_pdf(paper, "temp.pdf")
        except Exception as e:
            # UI boundary: an unknown ID or a network failure is shown as an
            # error message and ends this script run.
            st.error(f"Could not fetch arXiv paper '{arxiv_id}': {e}")
            st.stop()

    # Extract and analyze
    text = extract_text_from_pdf(pdf_path)
    author_analysis = analyze_authors(text)

    # Model processing
    summary = process_text_with_model(text, "summarize")
    key_points = process_text_with_model(text, "extract key points" if model_choice == "Text-to-Text (FLAN-T5)" else "summarize")

    # Display results
    st.header("Paper Analysis")
    st.subheader("Authors")
    st.dataframe(pd.DataFrame.from_dict(author_analysis, orient='index', columns=['Count']))

    st.subheader("AI Analysis")
    st.write("Summary:", summary)
    st.write("Key Points:", key_points)

    # Enhanced metadata
    metadata = {
        # Title and ID come from the arXiv record only when one was fetched.
        "title": paper.title if paper is not None else "Uploaded PDF",
        "authors": list(author_analysis),
        "arxiv_id": arxiv_id if paper is not None else "N/A",
        "model_analysis": {
            "summary": summary,
            "key_points": key_points,
            "model_used": model_choice,
            "model_name": MODEL_NAME if model_choice == "Text-to-Text (FLAN-T5)" else SECONDARY_MODEL,
            "model_license": "Check model card on Hugging Face",
            "processing_date": pd.Timestamp.now().isoformat()
        },
        "warnings": {
            "model_usage": "Ensure proper model licensing",
            "content_visibility": "All outputs will be public in space",
            "data_source": "Verify arXiv/paper permissions"
        }
    }

    # Create Space
    if st.button("Create Hugging Face Space"):
        space_url = create_huggingface_space(space_name, metadata)
        if space_url:
            st.success(f"Space created: {space_url}")
            st.markdown(f"""
                <a href="{space_url}" target="_blank">
                    <img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" 
                         alt="Hugging Face Space" width="150">
                </a>
            """, unsafe_allow_html=True)

# Cleanup
# Remove the scratch PDF written earlier in this script run, if one exists.
_temp_pdf = Path("temp.pdf")
if _temp_pdf.exists():
    _temp_pdf.unlink()