import streamlit as st
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
# Load the sentiment analysis model and tokenizer
model_name = "mmr44/fine-tuned-hupd-model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# Create a sentiment analysis pipeline
sentiment_analysis = pipeline('text-classification', model=model, tokenizer=tokenizer)
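# Note: this pipeline is not called directly below; the forward pass is done
# manually with the tokenizer and model, and the pipeline is only used to read
# the model's id2label mapping.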
# Load dataset to get patent numbers
dataset_dict = load_dataset('HUPD/hupd',
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
    icpr_label=None,
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-21',
    val_filing_start_date='2016-01-22',
    val_filing_end_date='2016-01-31',
    trust_remote_code=True
)
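# The filing-date windows above split the 'sample' config into a train split
# (filings from 2016-01-01 to 2016-01-21) and a validation split (2016-01-22 to
# 2016-01-31); only the train split is used below.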
train_set = dataset_dict['train']
# Convert to DataFrame to get patent numbers
train_df = train_set.to_pandas()
patent_numbers = train_df['patent_number'].unique().tolist()
# Create a dropdown menu for patent application numbers
st.title("Patent Application Sentiment Analysis")
application_number = st.selectbox(
    "Select Patent Application Number",
    patent_numbers  # Populate dropdown with patent numbers from the dataset
)
# Show abstract and claims
selected_patent = train_df[train_df['patent_number'] == application_number].iloc[0]
abstract_text = st.text_area("Abstract", selected_patent['abstract'])
claims_text = st.text_area("Claims", selected_patent['claims'])
# Truncate text to at most max_length tokens so it fits the model's input limit
def truncate_text(text, tokenizer, max_length=512):
    tokens = tokenizer.encode(text, add_special_tokens=False, truncation=True, max_length=max_length)
    return tokenizer.decode(tokens, skip_special_tokens=True)
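# Example: truncate_text("lorem ipsum " * 500, tokenizer) returns text whose
# token count is capped at roughly 512, so the combined abstract and claims
# stay within the model's input limit.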
# Submit button
if st.button("Submit"):
    # Prepare the text for analysis
    text_to_analyze = f"Abstract: {abstract_text} Claims: {claims_text}"
    # Truncate the text if it's too long
    truncated_text = truncate_text(text_to_analyze, tokenizer)
    # Perform sentiment analysis only if the text is non-empty
    if truncated_text.strip():
        inputs = tokenizer(truncated_text, return_tensors="pt", max_length=512, truncation=True)
        outputs = model(**inputs)
        probs = outputs.logits.softmax(dim=-1)
        labels = sentiment_analysis.model.config.id2label
        label = labels[probs.argmax().item()]
        score = probs.max().item()
        # Display the result
        st.write(f"Sentiment: {label}, Score: {score:.4f}")
    else:
        st.write("The text is empty; nothing to analyze.")