import streamlit as st
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch
# Load the sentiment analysis model and tokenizer
model_name = "mmr44/fine-tuned-hupd-model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Create a sentiment analysis pipeline
sentiment_analysis = pipeline('text-classification', model=model, tokenizer=tokenizer)
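# Note: Streamlit re-runs this script from the top on every user interaction,
# so the model load above is often wrapped in a function decorated with
# @st.cache_resource so it only happens once per process. A minimal sketch,
# not part of the original code:
#
#     @st.cache_resource
#     def load_model(name):
#         tok = AutoTokenizer.from_pretrained(name)
#         mdl = AutoModelForSequenceClassification.from_pretrained(name)
#         return tok, mdl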
# Load dataset to get patent numbers
dataset_dict = load_dataset('HUPD/hupd',
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    icpr_label=None,
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-21',
    val_filing_start_date='2016-01-22',
    val_filing_end_date='2016-01-31',
    trust_remote_code=True
)
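# The 'sample' config loads a small slice of HUPD: per the date arguments
# above, applications filed 2016-01-01 through 2016-01-21 form the train
# split and 2016-01-22 through 2016-01-31 the validation split.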
train_set = dataset_dict['train']
# Convert to DataFrame to get patent numbers
train_df = train_set.to_pandas()
patent_numbers = train_df['patent_number'].unique().tolist()
# Create a dropdown menu for patent application numbers
st.title("Patent Application Sentiment Analysis")
application_number = st.selectbox(
    "Select Patent Application Number",
    patent_numbers  # populate the dropdown with patent numbers from the dataset
)
# Show abstract and claims
selected_patent = train_df[train_df['patent_number'] == application_number].iloc[0]
abstract_text = st.text_area("Abstract", selected_patent['abstract'])
claims_text = st.text_area("Claims", selected_patent['claims'])
# Truncate text so it fits within the model's maximum input length
def truncate_text(text, tokenizer, max_length=512):
    tokens = tokenizer.encode(text, add_special_tokens=False, truncation=True, max_length=max_length)
    return tokenizer.decode(tokens, skip_special_tokens=True)
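# For example, truncate_text("word " * 1000, tokenizer) returns a string whose
# tokenization fits within the 512-token limit; note the encode/decode
# round-trip can alter whitespace or casing depending on the tokenizer.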
# Submit button
if st.button("Submit"):
    # Combine the abstract and claims into a single input string
    text_to_analyze = f"Abstract: {abstract_text} Claims: {claims_text}"
    # Truncate the text if it is too long for the model
    truncated_text = truncate_text(text_to_analyze, tokenizer)
    # Perform sentiment analysis only if the text is non-empty
    if truncated_text.strip():
        inputs = tokenizer(truncated_text, return_tensors="pt", max_length=512, truncation=True)
        with torch.no_grad():  # inference only, so skip gradient tracking
            outputs = model(**inputs)
        probs = outputs.logits.softmax(dim=-1)
        labels = sentiment_analysis.model.config.id2label
        label = labels[probs.argmax().item()]
        score = probs.max().item()
        # Display the result
        st.write(f"Sentiment: {label}, Score: {score:.4f}")
    else:
        st.write("The text is too short for analysis.")