Spaces:

zhixiusue
/

Edutube

Sleeping

App Files Files Community

zhixiusue committed 29 days ago

Commit

ba2392f

verified ·

1 Parent(s): 6ce3538

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +109 -38

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,111 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

 import streamlit as st
+from transformers import AutoTokenizer, AutoModelForTokenClassification
+import torch
+# Maps a predicted class index to its tag string. The position of each tag
+# in the list below is its class index. Every tag except 'O' has the form
+# '<B|I>-<TYPE>', which is the shape extract_entities() splits on.
+# NOTE(review): the order presumably has to match the label ids the model
+# was fine-tuned with -- confirm against the model config.
+id_to_label = dict(enumerate([
+    'O',
+    'B-TOPIC',
+    'I-TOPIC',
+    'B-STYLE',
+    'I-STYLE',
+    'B-LENGTH',
+    'I-LENGTH',
+    'B-LANGUAGE',
+    'I-LANGUAGE',
+]))
+@st.cache_resource
+def load_model():
+    """Load the tokenizer and the token-classification model.
+
+    Both are read with ``from_pretrained`` from the path ``"."``.
+    NOTE(review): ``"."`` is presumably resolved against the process
+    working directory, not this file's directory -- confirm the model
+    files are where the app is launched from.
+
+    Returns:
+        A ``(tokenizer, model)`` tuple.
+    """
+    model_dir = "."
+    return (
+        AutoTokenizer.from_pretrained(model_dir),
+        AutoModelForTokenClassification.from_pretrained(model_dir),
+    )
+# Module-level handles used by the UI code further down the script.
+tokenizer, model = load_model()
+def predict(text, model, tokenizer, id_to_label):
+    """Predict one label for each character of ``text``.
+
+    The text is split into single characters, which are passed to the
+    tokenizer as pre-split words (``is_split_into_words=True``). The
+    model's highest-scoring class per sub-token is mapped back to the
+    character it came from through ``word_ids``.
+
+    When the tokenizer breaks one character into several sub-tokens, only
+    the first sub-token's prediction is kept, so each character appears at
+    most once in the result. Without this, such a character would be
+    repeated once per sub-token and corrupt the reconstructed entity text.
+
+    Args:
+        text: The sentence to tag.
+        model: Token-classification model. It is called with the encoded
+            inputs as keyword arguments and must return an object with a
+            ``logits`` attribute.
+        tokenizer: Callable returning an encoding that can be unpacked
+            with ``**`` and exposes ``word_ids(batch_index=...)``.
+        id_to_label: Mapping from predicted class index to label string.
+
+    Returns:
+        ``(tokens_out, pred_labels)``: two parallel lists. Characters whose
+        index never appears in ``word_ids`` (for example those cut off by
+        the ``max_length=128`` truncation) are omitted.
+
+    Raises:
+        KeyError: If the model predicts an index missing from
+            ``id_to_label``.
+    """
+    tokens = list(text)
+    if not tokens:
+        # Nothing to tag; skip the tokenizer/model round trip entirely.
+        return [], []
+    inputs = tokenizer(tokens, is_split_into_words=True, return_tensors="pt", truncation=True, max_length=128)
+    model.eval()
+    with torch.no_grad():
+        outputs = model(**inputs)
+    # Only one sequence is encoded, so take the first row of the batch.
+    predictions = torch.argmax(outputs.logits, dim=-1)[0]
+    tokens_out, pred_labels = [], []
+    previous_word_idx = None
+    for idx, word_idx in enumerate(inputs.word_ids(batch_index=0)):
+        # None marks special tokens; a repeated index marks a later
+        # sub-token of a character that has already been emitted.
+        if word_idx is None or word_idx == previous_word_idx:
+            continue
+        previous_word_idx = word_idx
+        tokens_out.append(tokens[word_idx])
+        pred_labels.append(id_to_label[predictions[idx].item()])
+    return tokens_out, pred_labels
+def post_process(tokens, labels):
+    """Merge ``##`` continuation tokens back into whole words.
+
+    The special tokens ``[CLS]``, ``[SEP]`` and ``[PAD]`` are dropped. A
+    token starting with ``##`` is appended (without the prefix) to the
+    word being built. Any other token starts a new word. Each word keeps
+    the label of its first token.
+
+    A ``##`` token that arrives while no word is open (for example as the
+    very first token) starts a word and uses its own label. Previously
+    that word was emitted with a ``None`` label, which breaks callers
+    that split the label string.
+
+    Args:
+        tokens: Sequence of token strings.
+        labels: Sequence of labels parallel to ``tokens``.
+
+    Returns:
+        ``(words, word_labels)``: two parallel lists.
+    """
+    special_tokens = ("[CLS]", "[SEP]", "[PAD]")
+    words, word_labels = [], []
+    current_word = ""
+    current_label = None
+    for token, label in zip(tokens, labels):
+        if token in special_tokens:
+            continue
+        if token.startswith("##"):
+            if not current_word:
+                # No word to attach to: this piece opens the word, so it
+                # must also supply the label.
+                current_label = label
+            current_word += token[2:]
+            continue
+        # A regular token closes the word in progress and opens a new one.
+        if current_word:
+            words.append(current_word)
+            word_labels.append(current_label)
+        current_word = token
+        current_label = label
+    if current_word:
+        words.append(current_word)
+        word_labels.append(current_label)
+    return words, word_labels
+def align_words_labels(words, labels):
+    """Pair each word with the label at the same position.
+
+    Returns a list of ``(word, label)`` tuples. If the inputs differ in
+    length, pairing stops at the end of the shorter one.
+    """
+    return [(word, label) for word, label in zip(words, labels)]
+def extract_entities(aligned_result):
+    """Group ``(word, label)`` pairs into entity spans.
+
+    Labels are either ``"O"`` or ``"<prefix>-<type>"``. An ``I`` label
+    whose type matches the open entity extends it by concatenating the
+    word. Any other non-``O`` label (a ``B`` label, or an ``I`` label with
+    a different type or with no open entity) closes the open entity and
+    starts a new one. An ``"O"`` label closes the open entity.
+
+    Args:
+        aligned_result: Iterable of ``(word, label)`` pairs.
+
+    Returns:
+        A list of ``{"entity": type, "text": text}`` dicts in order of
+        appearance.
+    """
+    found = []
+    open_type, open_text = None, ""
+    for word, label in aligned_result:
+        if label != "O":
+            prefix, entity_type = label.split("-", 1)
+            if prefix == "I" and open_type == entity_type:
+                # Continuation of the entity currently being built.
+                open_text += word
+                continue
+            upcoming = (entity_type, word)
+        elif open_type:
+            upcoming = (None, "")
+        else:
+            # "O" with nothing open: nothing to close, nothing to reset.
+            continue
+        if open_type:
+            found.append({"entity": open_type, "text": open_text})
+        open_type, open_text = upcoming
+    if open_type:
+        found.append({"entity": open_type, "text": open_text})
+    return found
+# Streamlit UI: a title, a one-line description, a text box pre-filled with
+# an example sentence, and a button that runs the extraction pipeline.
+st.title("🎯 Learning Condition Extractor")
+st.write("사용자의 학습 목표 문장에서 조건(TOPIC, STYLE, LENGTH, LANGUAGE)을 추출합니다.")
+user_input = st.text_input("학습 목표를 입력하세요:", value="딥러닝을 실습 위주로 30분 이내에 배우고 싶어요")
+run_clicked = st.button("추론 시작")
+if run_clicked:
+    # Pipeline: per-character tagging -> word merge -> pairing -> spans.
+    tokens, pred_labels = predict(user_input, model, tokenizer, id_to_label)
+    words, word_labels = post_process(tokens, pred_labels)
+    aligned = align_words_labels(words, word_labels)
+    entities = extract_entities(aligned)
+    # Start every slot at None. When a type occurs more than once, the
+    # last occurrence wins.
+    result_dict = dict.fromkeys(['TOPIC', 'STYLE', 'LENGTH', 'LANGUAGE'])
+    result_dict.update({ent['entity']: ent['text'] for ent in entities})
+    st.subheader("📌 추출된 조건")
+    st.json(result_dict)