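# Streamlit app: select a patent application from the HUPD sample split and run the
# fine-tuned classifier over its abstract and claims.
# Assumed runtime dependencies (not pinned in this file): streamlit, pandas, datasets,
# transformers, and torch.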
import streamlit as st
import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Load the sentiment analysis model and tokenizer
model_name = "mmr44/fine-tuned-hupd-model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Create a sentiment analysis pipeline
sentiment_analysis = pipeline('text-classification', model=model, tokenizer=tokenizer)

# Load dataset to get patent numbers
dataset_dict = load_dataset('HUPD/hupd',
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    icpr_label=None,
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-21',
    val_filing_start_date='2016-01-22',
    val_filing_end_date='2016-01-31',
    trust_remote_code=True
)
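# The filing-date ranges above split the HUPD 'sample' release (January 2016
# applications) into train and validation sets.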

train_set = dataset_dict['train']
# Convert to DataFrame to get patent numbers
train_df = train_set.to_pandas()
patent_numbers = train_df['patent_number'].unique().tolist()

# Create a dropdown menu for patent application numbers
st.title("Patent Application Sentiment Analysis")

application_number = st.selectbox(
    "Select Patent Application Number",
    patent_numbers  # Populate dropdown with patent numbers from the dataset
)

# Show abstract and claims
selected_patent = train_df[train_df['patent_number'] == application_number].iloc[0]
abstract_text = st.text_area("Abstract", selected_patent['abstract'])
claims_text = st.text_area("Claims", selected_patent['claims'])

# Truncate text to the model's maximum input length (512 tokens by default)
def truncate_text(text, tokenizer, max_length=512):
    tokens = tokenizer.encode(text, add_special_tokens=False, truncation=True, max_length=max_length)
    return tokenizer.decode(tokens, skip_special_tokens=True)

# Submit button
if st.button("Submit"):
    # Prepare the text for analysis
    text_to_analyze = f"Abstract: {abstract_text} Claims: {claims_text}"
    
    # Truncate the text if it's too long
    truncated_text = truncate_text(text_to_analyze, tokenizer)

    # Perform sentiment analysis only if the text is non-empty
    if truncated_text.strip():
        inputs = tokenizer(truncated_text, return_tensors="pt", max_length=512, truncation=True)
        with torch.no_grad():  # inference only, so no gradient tracking is needed
            outputs = model(**inputs)
        probs = outputs.logits.softmax(dim=-1)
        labels = sentiment_analysis.model.config.id2label
        label = labels[probs.argmax().item()]
        score = probs.max().item()
        
        # Display the result
        st.write(f"Sentiment: {label}, Score: {score}")
    else:
        st.write("The text is too short for analysis.")