mmr44 committed on
Commit 8573615 · verified · 1 Parent(s): 2e5b48f

Create app.py

Files changed (1)
  1. app.py +68 -0
app.py ADDED
@@ -0,0 +1,68 @@
+ import streamlit as st
+ import pandas as pd
+ from datasets import load_dataset
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
+
+ # Load the sentiment analysis model and tokenizer
+ model_name = "mmr44/fine-tuned-hupd-model"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForSequenceClassification.from_pretrained(model_name)
+
+ # Create a sentiment analysis pipeline
+ sentiment_analysis = pipeline('text-classification', model=model, tokenizer=tokenizer)
+
+ # Load dataset to get patent numbers
+ dataset_dict = load_dataset('HUPD/hupd',
+     name='sample',
+     data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
+     icpr_label=None,
+     train_filing_start_date='2016-01-01',
+     train_filing_end_date='2016-01-21',
+     val_filing_start_date='2016-01-22',
+     val_filing_end_date='2016-01-31',
+ )
+
+ train_set = dataset_dict['train']
+ # Convert to DataFrame to get patent numbers
+ train_df = train_set.to_pandas()
+ patent_numbers = train_df['patent_number'].unique().tolist()
+
+ # Create a dropdown menu for patent application numbers
+ st.title("Patent Application Sentiment Analysis")
+
+ application_number = st.selectbox(
+     "Select Patent Application Number",
+     patent_numbers  # Populate dropdown with patent numbers from the dataset
+ )
+
+ # Show abstract and claims
+ selected_patent = train_df[train_df['patent_number'] == application_number].iloc[0]
+ abstract_text = st.text_area("Abstract", selected_patent['abstract'])
+ claims_text = st.text_area("Claims", selected_patent['claims'])
+
+ # Function to truncate text
+ def truncate_text(text, tokenizer, max_length=512):
+     tokens = tokenizer.encode(text, add_special_tokens=False, truncation=True, max_length=max_length)
+     return tokenizer.decode(tokens, skip_special_tokens=True)
+
+ # Submit button
+ if st.button("Submit"):
+     # Prepare the text for analysis
+     text_to_analyze = f"Abstract: {abstract_text} Claims: {claims_text}"
+
+     # Truncate the text if it's too long
+     truncated_text = truncate_text(text_to_analyze, tokenizer)
+
+     # Perform sentiment analysis only if the text is non-empty
+     if truncated_text.strip():
+         inputs = tokenizer(truncated_text, return_tensors="pt", max_length=512, truncation=True)
+         outputs = model(**inputs)
+         probs = outputs.logits.softmax(dim=-1)
+         labels = sentiment_analysis.model.config.id2label
+         label = labels[probs.argmax().item()]
+         score = probs.max().item()
+
+         # Display the result
+         st.write(f"Sentiment: {label}, Score: {score}")
+     else:
+         st.write("The text is too short for analysis.")
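A minimal caching sketch (not part of the committed file), assuming a recent Streamlit release that provides st.cache_resource and st.cache_data: wrapping the heavy model and dataset loads keeps the app from re-downloading them on every widget rerun. The helper names load_model and load_patent_table are hypothetical; everything else reuses values from the file above.

    # Run locally with: streamlit run app.py
    import streamlit as st
    import pandas as pd
    from datasets import load_dataset
    from transformers import AutoTokenizer, AutoModelForSequenceClassification

    @st.cache_resource  # cached once per process and reused across reruns
    def load_model(model_name: str):
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        return tokenizer, model

    @st.cache_data  # cached by argument values; returns a picklable DataFrame
    def load_patent_table() -> pd.DataFrame:
        dataset_dict = load_dataset(
            'HUPD/hupd',
            name='sample',
            data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather",
            icpr_label=None,
            train_filing_start_date='2016-01-01',
            train_filing_end_date='2016-01-21',
            val_filing_start_date='2016-01-22',
            val_filing_end_date='2016-01-31',
        )
        return dataset_dict['train'].to_pandas()

    tokenizer, model = load_model("mmr44/fine-tuned-hupd-model")
    train_df = load_patent_table()

The rest of the app (selectbox, text areas, and the Submit handler) can then use tokenizer, model, and train_df exactly as in the committed version.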