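"""Patentability Decision App.

A Streamlit app that lets the user pick a patent from the HUPD sample
validation split, shows its abstract and claims, and predicts whether the
application will be accepted or rejected with a fine-tuned sequence
classifier (assumed here to be a DistilBERT checkpoint saved under ./models/).

Launch with: streamlit run <this file>
"""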
# Import statements
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification

from datasets import load_dataset

# Torch and torch dataloader
import torch
from torch.utils.data import DataLoader

st.title('Patentability Decision App')

# Load the HUPD sample dataset; the filing-date windows define the train/validation splits
dataset_dict = load_dataset('HUPD/hupd',
    name='sample',
    data_files="https://huggingface.co/datasets/HUPD/hupd/blob/main/hupd_metadata_2022-02-22.feather", 
    icpr_label=None,
    train_filing_start_date='2016-01-01',
    train_filing_end_date='2016-01-21',
    val_filing_start_date='2016-01-22',
    val_filing_end_date='2016-01-31',
)
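
# Note: the first run downloads and processes the sample split, which can take
# a while; later runs reuse the local Hugging Face datasets cache.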

# Label-to-index mapping for the decision status field; only REJECTED and
# ACCEPTED are kept below, since the model is trained on those two classes
decision_to_str = {'REJECTED': 0, 'ACCEPTED': 1, 'PENDING': 2, 'CONT-REJECTED': 3, 'CONT-ACCEPTED': 4, 'CONT-PENDING': 5}

# Helper function: map the string decision label to its integer index
def map_decision_to_string(example):
    return {'decision': decision_to_str[example['decision']]}
  
# Re-labeling/mapping in validation set
val_set = dataset_dict['validation'].map(map_decision_to_string)
# Keep only patents whose decision is REJECTED (0) or ACCEPTED (1)
val_set = val_set.filter(lambda e: e['decision'] <= 1)

# Display all patent numbers so the user can select one
patent_num = st.selectbox("Select a patent based on its number", val_set['patent_number'])


# Keep the "button was clicked" flag in the session state
if "button_clicked" not in st.session_state:
    st.session_state.button_clicked = False

# Record that the button was clicked
def callback():
    st.session_state.button_clicked = True
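# Streamlit re-runs the whole script on every widget interaction, so the plain
# return value of st.button() is lost on the next rerun; the session-state flag
# keeps the data section open once 'Get Data to predict!' has been clicked.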

# Get the abstract and claims data to predict
if patent_num and (st.button('Get Data to predict!', on_click=callback) or st.session_state.button_clicked):
    # Display the abstract and claims of the selected patent
    val_set = val_set.filter(lambda e: e['patent_number'] == patent_num)

    abstract_text = st.text_area('Abstract', val_set['abstract'][0])
    claims_text = st.text_area('Claims', val_set['claims'][0])
  
    # Predict on those texts
    if abstract_text and claims_text and st.button('Predict!'):
        # Tokenizer name and path to the fine-tuned model (assumed to be a
        # DistilBERT classifier checkpoint saved locally under ./models/)
        model_name_or_path = './models/'
        model_name = 'distilbert-base-uncased'
        # Tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Model (eval mode disables dropout for deterministic predictions)
        model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path)
        model.eval()
    
        # Tokenize the claims section and pass it to the model for prediction
        _SECTION_ = 'claims'
        val_set = val_set.map(lambda e: tokenizer(e[_SECTION_], truncation=True, padding='max_length'), batched=True)
        val_set.set_format(type='torch', columns=['input_ids', 'attention_mask', 'decision'])
        # Create a dataloader; after the patent-number filter it holds a single row
        val_dataloader = DataLoader(val_set, batch_size=16)
        batch = next(iter(val_dataloader))
        # Keep the batch dimension: the model expects shape (batch_size, seq_len)
        input_ids = batch['input_ids'][0].unsqueeze(0)
        attention_mask = batch['attention_mask'][0].unsqueeze(0)
    
        # Predict (no gradient tracking needed at inference time)
        with torch.no_grad():
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

        # Display the predicted decision (0 = REJECTED, 1 = ACCEPTED)
        prediction = torch.argmax(logits, dim=-1).item()
        value = next(label for label, idx in decision_to_str.items() if idx == prediction)
        st.text('This is the predicted decision: ' + value)

        # Patentability score: softmax turns the raw logits into probabilities
        probs = torch.nn.functional.softmax(logits, dim=-1)
        st.text('Probability that it will be rejected: ' + str(probs[0][0].item() * 100))
        st.text('Probability that it will be accepted: ' + str(probs[0][1].item() * 100))