Update streamlit_app.py
streamlit_app.py  (CHANGED, +92 -67)
[Removed version: the previous streamlit_app.py loaded the same zhixiusue/EduTubeNavigator token-classification model, split the input on whitespace (words = text.split()), aligned per-token predictions back to words, and collected spans with an extract_entities() helper that returned a list of {"entity": ..., "text": ...} dicts, which the UI folded into result_dict and displayed with st.subheader(). It had no YouTube search, transcript retrieval, or sentence-embedding ranking; those parts are added in the new version below.]
New version:

import streamlit as st
from transformers import AutoTokenizer, AutoModelForTokenClassification
from sentence_transformers import SentenceTransformer
from youtube_transcript_api import YouTubeTranscriptApi
from googleapiclient.discovery import build
from torch.nn.functional import cosine_similarity
import torch

# Hugging Face model
MODEL_REPO = "zhixiusue/EduTubeNavigator"

# YouTube API Key
YOUTUBE_API_KEY = "AIzaSyA8SG7--MfQvWET6UOam0PVAcC5MDm4sbc"
youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)

# ID → label mapping
token_label_map = {
    0: 'O', 1: 'B-TOPIC', 2: 'I-TOPIC', 3: 'B-STYLE', 4: 'I-STYLE',
    5: 'B-LENGTH', 6: 'I-LENGTH', 7: 'B-LANGUAGE', 8: 'I-LANGUAGE'
}

@st.cache_resource
def load_model():
    tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO)
    model = AutoModelForTokenClassification.from_pretrained(MODEL_REPO)
    return tokenizer, model

def predict_entities(text, tokenizer, model):
    # Character-level tokens: the NER model tags each character of the input
    tokens = list(text)
    inputs = tokenizer(tokens, is_split_into_words=True, return_tensors="pt", truncation=True, max_length=128)
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)[0]
    word_ids = inputs.word_ids(batch_index=0)
    # Merge B-/I- tags into one text span per entity type
    entities = {}
    current_entity = ""
    current_type = ""
    for idx, word_id in enumerate(word_ids):
        if word_id is None:
            continue
        label = token_label_map[predictions[idx].item()]
        if label.startswith("B-"):
            if current_type:
                entities[current_type] = current_entity
            current_type = label[2:]
            current_entity = tokens[word_id]
        elif label.startswith("I-") and label[2:] == current_type:
            current_entity += tokens[word_id]
        else:
            if current_type:
                entities[current_type] = current_entity
            current_type = ""
            current_entity = ""
    if current_type:
        entities[current_type] = current_entity
    return entities

def search_youtube_videos(query, max_results=10):
    response = youtube.search().list(q=query, part="snippet", type="video", maxResults=max_results).execute()
    results = []
    for item in response['items']:
        results.append({
            'video_id': item['id']['videoId'],
            'title': item['snippet']['title'],
            'description': item['snippet']['description']
        })
    return results

def get_transcript(video_id):
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['ko', 'en'])
        return " ".join([t['text'] for t in transcript])
    except Exception:
        # Fall back to an empty string if no transcript can be fetched
        return ""

@st.cache_resource
def load_embedder():
    return SentenceTransformer('snunlp/KR-SBERT-V40K-klueNLI-augSTS')

def embed_texts(embedder, texts):
    return embedder.encode(texts, convert_to_tensor=True)

def recommend_video(embedder, user_conditions, video_infos):
    # Embed the extracted conditions as one query text
    user_text = " ".join([v for v in user_conditions.values() if v])
    user_embedding = embed_texts(embedder, [user_text])[0]

    # Score each candidate by cosine similarity over title + description + transcript
    scored = []
    for video in video_infos:
        video_text = video['title'] + " " + video['description'] + " " + get_transcript(video['video_id'])
        video_embedding = embed_texts(embedder, [video_text])[0]
        score = cosine_similarity(user_embedding, video_embedding, dim=0).item()
        scored.append((score, video))
    return sorted(scored, reverse=True, key=lambda x: x[0])[:3]

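# For reference: torch.nn.functional.cosine_similarity with dim=0 compares two 1-D
# embedding tensors directly, which is how recommend_video scores each candidate.
# A tiny made-up example (hypothetical values):
#     a = torch.tensor([1.0, 0.0])
#     b = torch.tensor([1.0, 1.0])
#     cosine_similarity(a, b, dim=0)   # tensor(0.7071)
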
# Streamlit UI
st.title("Learning-condition based YouTube recommender")
st.write("Enter a learning goal; the app extracts your conditions and recommends YouTube videos.")

user_input = st.text_input("Enter your learning goal", "I want to learn deep learning hands-on in about 30 minutes")
if st.button("Start recommendation"):
    tokenizer, model = load_model()
    embedder = load_embedder()

    entities = predict_entities(user_input, tokenizer, model)
    st.subheader("Extracted conditions")
    st.json(entities)

    search_query = " ".join([v for v in entities.values() if v])
    video_candidates = search_youtube_videos(search_query)
    top_recommendations = recommend_video(embedder, entities, video_candidates)

    st.subheader("Recommended YouTube videos")
    for score, video in top_recommendations:
        st.markdown(f"**{video['title']}**")
        st.markdown(f"[Link](https://www.youtube.com/watch?v={video['video_id']})")
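
For reference, a minimal self-contained sketch of the B-/I- merging rule that predict_entities applies, run on a hand-written token/label sequence instead of real model output (toy_tokens, toy_labels, and the label positions below are made up for illustration):

toy_tokens = list("learn deep learning hands-on")            # character-level tokens, as in predict_entities
toy_labels = ["O"] * len(toy_tokens)                          # stand-in for the model's predictions
toy_labels[6:19] = ["B-TOPIC"] + ["I-TOPIC"] * 12             # tag "deep learning" as TOPIC
toy_labels[20:28] = ["B-STYLE"] + ["I-STYLE"] * 7             # tag "hands-on" as STYLE

entities, current_type, current_entity = {}, "", ""
for token, label in zip(toy_tokens, toy_labels):
    if label.startswith("B-"):
        if current_type:
            entities[current_type] = current_entity
        current_type, current_entity = label[2:], token
    elif label.startswith("I-") and label[2:] == current_type:
        current_entity += token
    else:
        if current_type:
            entities[current_type] = current_entity
        current_type, current_entity = "", ""
if current_type:
    entities[current_type] = current_entity

print(entities)   # {'TOPIC': 'deep learning', 'STYLE': 'hands-on'}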
|