"""Streamlit app: extract and rank keyphrases from pasted text with YAKE + KeyBERT."""

from typing import List, Tuple

import pandas as pd
import streamlit as st
import yake
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer  # noqa: F401  (not used by the active code path)

# Sentence-embedding checkpoint handed to KeyBERT.
MODEL_NAME = "AI-Growth-Lab/PatentSBERTa"

# ``st.cache`` is the legacy caching API and is deprecated in newer Streamlit
# releases in favour of ``st.cache_resource`` for unserialisable objects such
# as ML models. Prefer the new decorator when the installed Streamlit has it
# and fall back to the original call otherwise, so the app runs on both.
if hasattr(st, "cache_resource"):
    _cache_model = st.cache_resource(show_spinner=True)
else:
    _cache_model = st.cache(
        allow_output_mutation=True, suppress_st_warning=True, show_spinner=True
    )


@_cache_model
def load_model():
    """Build the KeyBERT model once so Streamlit reruns reuse the same instance."""
    return KeyBERT(MODEL_NAME)


model = load_model()


def predict_fn(
    text: str,
    model: KeyBERT,
    top_n: int = 50,
    keyphrase_ngram_range: Tuple[int, int] = (1, 3),
) -> List[Tuple[str, float]]:
    """Extract keyphrases from ``text``: YAKE proposes candidates, KeyBERT ranks them.

    Args:
        text: Document to analyse.
        model: KeyBERT instance used to score the candidates against ``text``.
        top_n: Maximum number of YAKE candidates and of returned keywords.
        keyphrase_ngram_range: ``(min_words, max_words)`` per keyphrase; the
            upper bound is also used as YAKE's maximum n-gram size.

    Returns:
        Whatever ``model.extract_keywords`` returns for the candidates (the
        caller treats it as ``(keyword, score)`` pairs), or ``[]`` when
        ``text`` is blank or YAKE proposes no candidates.
    """
    # Nothing to analyse: skip both extractors instead of running them on "".
    if not text or not text.strip():
        return []

    kw_extractor = yake.KeywordExtractor(n=keyphrase_ngram_range[1], top=top_n)
    candidates = kw_extractor.extract_keywords(text)

    # KeyBERT feeds the candidates to scikit-learn's CountVectorizer as a fixed
    # vocabulary. That vectorizer lowercases the document by default, so
    # mixed-case entries would never match, and duplicate entries make it raise.
    # Normalise to lowercase and drop duplicates while keeping YAKE's order.
    keyphrases = list(dict.fromkeys(candidate[0].lower() for candidate in candidates))
    if not keyphrases:
        return []

    return model.extract_keywords(
        text,
        keyphrases,
        keyphrase_ngram_range=keyphrase_ngram_range,
        top_n=top_n,
    )


st.title("Patent Text Extractor")

placeholder = st.empty()
text = placeholder.text_area("Paste or write text", height=300)
button = st.button("Extract Keywords")

# NOTE: sidebar controls for top_n, the n-gram range and keyword diversity
# (Max Sum Similarity / Maximal Marginal Relevance) are not wired up;
# extraction uses the defaults of predict_fn.
# Streamlit re-executes this script on every widget interaction. Only run the
# (expensive) extraction when the user pressed the button and actually
# supplied some text, instead of on every rerun.
has_text = bool(text and text.strip())
keywords = predict_fn(text, model) if button and has_text else []

if keywords:
    st.info("Extracted keywords")
    # Each extracted item is expected to be a (keyword, score) pair.
    keywords_table = pd.DataFrame(keywords, columns=["Keyword", "Score"])
    st.table(keywords_table)
elif button and not has_text:
    st.warning("Please paste or write some text first.")
elif button:
    st.warning("No keywords could be extracted from this text.")